aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/filesystems
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/Locking2
-rw-r--r--Documentation/filesystems/debugfs.txt2
-rw-r--r--Documentation/filesystems/f2fs.txt7
-rw-r--r--Documentation/filesystems/nfs/Exporting23
-rw-r--r--Documentation/filesystems/overlayfs.txt198
-rw-r--r--Documentation/filesystems/porting8
-rw-r--r--Documentation/filesystems/proc.txt2
-rw-r--r--Documentation/filesystems/seq_file.txt22
-rw-r--r--Documentation/filesystems/squashfs.txt8
-rw-r--r--Documentation/filesystems/vfs.txt9
10 files changed, 247 insertions, 34 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 94d93b1f8b53..b30753cbf431 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -67,6 +67,7 @@ prototypes:
struct file *, unsigned open_flag,
umode_t create_mode, int *opened);
int (*tmpfile) (struct inode *, struct dentry *, umode_t);
+ int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
locking rules:
all may block
@@ -96,6 +97,7 @@ fiemap: no
update_time: no
atomic_open: yes
tmpfile: no
+dentry_open: no
Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
victim.
diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
index 3a863f692728..88ab81c79109 100644
--- a/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@ -140,7 +140,7 @@ file.
struct dentry *parent,
struct debugfs_regset32 *regset);
- int debugfs_print_regs32(struct seq_file *s, struct debugfs_reg32 *regs,
+ void debugfs_print_regs32(struct seq_file *s, struct debugfs_reg32 *regs,
int nregs, void __iomem *base, char *prefix);
The "base" argument may be 0, but you may want to build the reg32 array
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index 2cca5a25ef89..e0950c483c22 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -122,6 +122,10 @@ disable_ext_identify Disable the extension list configured by mkfs, so f2fs
inline_xattr Enable the inline xattrs feature.
inline_data Enable the inline data feature: New created small(<~3.4k)
files can be written into inode block.
+inline_dentry Enable the inline dir feature: data in new created
+ directory entries can be written into inode block. The
+ space of inode block which is used to store inline
+ dentries is limited to ~3.4k.
flush_merge Merge concurrent cache_flush commands as much as possible
to eliminate redundant command issues. If the underlying
device handles the cache_flush command relatively slowly,
@@ -131,6 +135,9 @@ nobarrier This option can be used if underlying storage guarantees
If this option is set, no cache_flush commands are issued
but f2fs still guarantees the write ordering of all the
data writes.
+fastboot This option is used when a system wants to reduce mount
+ time as much as possible, even though normal performance
+ can be sacrificed.
================================================================================
DEBUGFS ENTRIES
diff --git a/Documentation/filesystems/nfs/Exporting b/Documentation/filesystems/nfs/Exporting
index c8f036a9b13f..520a4becb75c 100644
--- a/Documentation/filesystems/nfs/Exporting
+++ b/Documentation/filesystems/nfs/Exporting
@@ -72,24 +72,11 @@ c/ Helper routines to allocate anonymous dentries, and to help attach
DCACHE_DISCONNECTED) dentry is allocated and attached.
In the case of a directory, care is taken that only one dentry
can ever be attached.
- d_splice_alias(inode, dentry) or d_materialise_unique(dentry, inode)
- will introduce a new dentry into the tree; either the passed-in
- dentry or a preexisting alias for the given inode (such as an
- anonymous one created by d_obtain_alias), if appropriate. The two
- functions differ in their handling of directories with preexisting
- aliases:
- d_splice_alias will use any existing IS_ROOT dentry, but it will
- return -EIO rather than try to move a dentry with a different
- parent. This is appropriate for local filesystems, which
- should never see such an alias unless the filesystem is
- corrupted somehow (for example, if two on-disk directory
- entries refer to the same directory.)
- d_materialise_unique will attempt to move any dentry. This is
- appropriate for distributed filesystems, where finding a
- directory other than where we last cached it may be a normal
- consequence of concurrent operations on other hosts.
- Both functions return NULL when the passed-in dentry is used,
- following the calling convention of ->lookup.
+ d_splice_alias(inode, dentry) will introduce a new dentry into the tree;
+ either the passed-in dentry or a preexisting alias for the given inode
+ (such as an anonymous one created by d_obtain_alias), if appropriate.
+ It returns NULL when the passed-in dentry is used, following the calling
+ convention of ->lookup.
Filesystem Issues
diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt
new file mode 100644
index 000000000000..a27c950ece61
--- /dev/null
+++ b/Documentation/filesystems/overlayfs.txt
@@ -0,0 +1,198 @@
+Written by: Neil Brown <neilb@suse.de>
+
+Overlay Filesystem
+==================
+
+This document describes a prototype for a new approach to providing
+overlay-filesystem functionality in Linux (sometimes referred to as
+union-filesystems). An overlay-filesystem tries to present a
+filesystem which is the result over overlaying one filesystem on top
+of the other.
+
+The result will inevitably fail to look exactly like a normal
+filesystem for various technical reasons. The expectation is that
+many use cases will be able to ignore these differences.
+
+This approach is 'hybrid' because the objects that appear in the
+filesystem do not all appear to belong to that filesystem. In many
+cases an object accessed in the union will be indistinguishable
+from accessing the corresponding object from the original filesystem.
+This is most obvious from the 'st_dev' field returned by stat(2).
+
+While directories will report an st_dev from the overlay-filesystem,
+all non-directory objects will report an st_dev from the lower or
+upper filesystem that is providing the object. Similarly st_ino will
+only be unique when combined with st_dev, and both of these can change
+over the lifetime of a non-directory object. Many applications and
+tools ignore these values and will not be affected.
+
+Upper and Lower
+---------------
+
+An overlay filesystem combines two filesystems - an 'upper' filesystem
+and a 'lower' filesystem. When a name exists in both filesystems, the
+object in the 'upper' filesystem is visible while the object in the
+'lower' filesystem is either hidden or, in the case of directories,
+merged with the 'upper' object.
+
+It would be more correct to refer to an upper and lower 'directory
+tree' rather than 'filesystem' as it is quite possible for both
+directory trees to be in the same filesystem and there is no
+requirement that the root of a filesystem be given for either upper or
+lower.
+
+The lower filesystem can be any filesystem supported by Linux and does
+not need to be writable. The lower filesystem can even be another
+overlayfs. The upper filesystem will normally be writable and if it
+is it must support the creation of trusted.* extended attributes, and
+must provide valid d_type in readdir responses, so NFS is not suitable.
+
+A read-only overlay of two read-only filesystems may use any
+filesystem type.
+
+Directories
+-----------
+
+Overlaying mainly involves directories. If a given name appears in both
+upper and lower filesystems and refers to a non-directory in either,
+then the lower object is hidden - the name refers only to the upper
+object.
+
+Where both upper and lower objects are directories, a merged directory
+is formed.
+
+At mount time, the two directories given as mount options "lowerdir" and
+"upperdir" are combined into a merged directory:
+
+ mount -t overlay overlay -olowerdir=/lower,upperdir=/upper,\
+workdir=/work /merged
+
+The "workdir" needs to be an empty directory on the same filesystem
+as upperdir.
+
+Then whenever a lookup is requested in such a merged directory, the
+lookup is performed in each actual directory and the combined result
+is cached in the dentry belonging to the overlay filesystem. If both
+actual lookups find directories, both are stored and a merged
+directory is created, otherwise only one is stored: the upper if it
+exists, else the lower.
+
+Only the lists of names from directories are merged. Other content
+such as metadata and extended attributes are reported for the upper
+directory only. These attributes of the lower directory are hidden.
+
+whiteouts and opaque directories
+--------------------------------
+
+In order to support rm and rmdir without changing the lower
+filesystem, an overlay filesystem needs to record in the upper filesystem
+that files have been removed. This is done using whiteouts and opaque
+directories (non-directories are always opaque).
+
+A whiteout is created as a character device with 0/0 device number.
+When a whiteout is found in the upper level of a merged directory, any
+matching name in the lower level is ignored, and the whiteout itself
+is also hidden.
+
+A directory is made opaque by setting the xattr "trusted.overlay.opaque"
+to "y". Where the upper filesystem contains an opaque directory, any
+directory in the lower filesystem with the same name is ignored.
+
+readdir
+-------
+
+When a 'readdir' request is made on a merged directory, the upper and
+lower directories are each read and the name lists merged in the
+obvious way (upper is read first, then lower - entries that already
+exist are not re-added). This merged name list is cached in the
+'struct file' and so remains as long as the file is kept open. If the
+directory is opened and read by two processes at the same time, they
+will each have separate caches. A seekdir to the start of the
+directory (offset 0) followed by a readdir will cause the cache to be
+discarded and rebuilt.
+
+This means that changes to the merged directory do not appear while a
+directory is being read. This is unlikely to be noticed by many
+programs.
+
+seek offsets are assigned sequentially when the directories are read.
+Thus if
+ - read part of a directory
+ - remember an offset, and close the directory
+ - re-open the directory some time later
+ - seek to the remembered offset
+
+there may be little correlation between the old and new locations in
+the list of filenames, particularly if anything has changed in the
+directory.
+
+Readdir on directories that are not merged is simply handled by the
+underlying directory (upper or lower).
+
+
+Non-directories
+---------------
+
+Objects that are not directories (files, symlinks, device-special
+files etc.) are presented either from the upper or lower filesystem as
+appropriate. When a file in the lower filesystem is accessed in a way
+the requires write-access, such as opening for write access, changing
+some metadata etc., the file is first copied from the lower filesystem
+to the upper filesystem (copy_up). Note that creating a hard-link
+also requires copy_up, though of course creation of a symlink does
+not.
+
+The copy_up may turn out to be unnecessary, for example if the file is
+opened for read-write but the data is not modified.
+
+The copy_up process first makes sure that the containing directory
+exists in the upper filesystem - creating it and any parents as
+necessary. It then creates the object with the same metadata (owner,
+mode, mtime, symlink-target etc.) and then if the object is a file, the
+data is copied from the lower to the upper filesystem. Finally any
+extended attributes are copied up.
+
+Once the copy_up is complete, the overlay filesystem simply
+provides direct access to the newly created file in the upper
+filesystem - future operations on the file are barely noticed by the
+overlay filesystem (though an operation on the name of the file such as
+rename or unlink will of course be noticed and handled).
+
+
+Non-standard behavior
+---------------------
+
+The copy_up operation essentially creates a new, identical file and
+moves it over to the old name. The new file may be on a different
+filesystem, so both st_dev and st_ino of the file may change.
+
+Any open files referring to this inode will access the old data and
+metadata. Similarly any file locks obtained before copy_up will not
+apply to the copied up file.
+
+On a file opened with O_RDONLY fchmod(2), fchown(2), futimesat(2) and
+fsetxattr(2) will fail with EROFS.
+
+If a file with multiple hard links is copied up, then this will
+"break" the link. Changes will not be propagated to other names
+referring to the same inode.
+
+Symlinks in /proc/PID/ and /proc/PID/fd which point to a non-directory
+object in overlayfs will not contain valid absolute paths, only
+relative paths leading up to the filesystem's root. This will be
+fixed in the future.
+
+Some operations are not atomic, for example a crash during copy_up or
+rename will leave the filesystem in an inconsistent state. This will
+be addressed in the future.
+
+Changes to underlying filesystems
+---------------------------------
+
+Offline changes, when the overlay is not mounted, are allowed to either
+the upper or the lower trees.
+
+Changes to the underlying filesystems while part of a mounted overlay
+filesystem are not allowed. If the underlying filesystem is changed,
+the behavior of the overlay is undefined, though it will not result in
+a crash or deadlock.
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 0f3a1390bf00..fa2db081505e 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -463,3 +463,11 @@ in your dentry operations instead.
of the in-tree instances did). inode_hash_lock is still held,
of course, so they are still serialized wrt removal from inode hash,
as well as wrt set() callback of iget5_locked().
+--
+[mandatory]
+ d_materialise_unique() is gone; d_splice_alias() does everything you
+ need now. Remember that they have opposite orders of arguments ;-/
+--
+[mandatory]
+ f_dentry is gone; use f_path.dentry, or, better yet, see if you can avoid
+ it entirely.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index eb8a10e22f7c..aae9dd13c91f 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1272,7 +1272,7 @@ softirq.
1.9 Ext4 file system parameters
-------------------------------
+-------------------------------
Information about mounted ext4 file systems can be found in
/proc/fs/ext4. Each mounted filesystem will have a directory in
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
index 8ea3e90ace07..b797ed38de46 100644
--- a/Documentation/filesystems/seq_file.txt
+++ b/Documentation/filesystems/seq_file.txt
@@ -180,23 +180,19 @@ output must be passed to the seq_file code. Some utility functions have
been defined which make this task easy.
Most code will simply use seq_printf(), which works pretty much like
-printk(), but which requires the seq_file pointer as an argument. It is
-common to ignore the return value from seq_printf(), but a function
-producing complicated output may want to check that value and quit if
-something non-zero is returned; an error return means that the seq_file
-buffer has been filled and further output will be discarded.
+printk(), but which requires the seq_file pointer as an argument.
For straight character output, the following functions may be used:
- int seq_putc(struct seq_file *m, char c);
- int seq_puts(struct seq_file *m, const char *s);
- int seq_escape(struct seq_file *m, const char *s, const char *esc);
+ seq_putc(struct seq_file *m, char c);
+ seq_puts(struct seq_file *m, const char *s);
+ seq_escape(struct seq_file *m, const char *s, const char *esc);
The first two output a single character and a string, just like one would
expect. seq_escape() is like seq_puts(), except that any character in s
which is in the string esc will be represented in octal form in the output.
-There is also a pair of functions for printing filenames:
+There are also a pair of functions for printing filenames:
int seq_path(struct seq_file *m, struct path *path, char *esc);
int seq_path_root(struct seq_file *m, struct path *path,
@@ -209,6 +205,14 @@ root is desired, it can be used with seq_path_root(). Note that, if it
turns out that path cannot be reached from root, the value of root will be
changed in seq_file_root() to a root which *does* work.
+A function producing complicated output may want to check
+ bool seq_has_overflowed(struct seq_file *m);
+and avoid further seq_<output> calls if true is returned.
+
+A true return from seq_has_overflowed means that the seq_file buffer will
+be discarded and the seq_show function will attempt to allocate a larger
+buffer and retry printing.
+
Making it all work
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt
index 403c090aca39..e5274f84dc56 100644
--- a/Documentation/filesystems/squashfs.txt
+++ b/Documentation/filesystems/squashfs.txt
@@ -2,10 +2,10 @@ SQUASHFS 4.0 FILESYSTEM
=======================
Squashfs is a compressed read-only filesystem for Linux.
-It uses zlib/lzo/xz compression to compress files, inodes and directories.
-Inodes in the system are very small and all blocks are packed to minimise
-data overhead. Block sizes greater than 4K are supported up to a maximum
-of 1Mbytes (default block size 128K).
+It uses zlib, lz4, lzo, or xz compression to compress files, inodes and
+directories. Inodes in the system are very small and all blocks are packed to
+minimise data overhead. Block sizes greater than 4K are supported up to a
+maximum of 1Mbytes (default block size 128K).
Squashfs is intended for general read-only filesystem use, for archival
use (i.e. in cases where a .tar.gz file may be used), and in constrained
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index fceff7c00a3c..43ce0507ee25 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -364,6 +364,7 @@ struct inode_operations {
int (*atomic_open)(struct inode *, struct dentry *, struct file *,
unsigned open_flag, umode_t create_mode, int *opened);
int (*tmpfile) (struct inode *, struct dentry *, umode_t);
+ int (*dentry_open)(struct dentry *, struct file *, const struct cred *);
};
Again, all methods are called without any locks being held, unless
@@ -696,6 +697,12 @@ struct address_space_operations {
but instead uses bmap to find out where the blocks in the file
are and uses those addresses directly.
+ dentry_open: *WARNING: probably going away soon, do not use!* This is an
+ alternative to f_op->open(), the difference is that this method may open
+ a file not necessarily originating from the same filesystem as the one
+ i_op->open() was called on. It may be useful for stacking filesystems
+ which want to allow native I/O directly on underlying files.
+
invalidatepage: If a page has PagePrivate set, then invalidatepage
will be called when part or all of the page is to be removed
@@ -828,7 +835,7 @@ struct file_operations {
ssize_t (*splice_read)(struct file *, struct pipe_inode_info *, size_t, unsigned int);
int (*setlease)(struct file *, long arg, struct file_lock **, void **);
long (*fallocate)(struct file *, int mode, loff_t offset, loff_t len);
- int (*show_fdinfo)(struct seq_file *m, struct file *f);
+ void (*show_fdinfo)(struct seq_file *m, struct file *f);
};
Again, all methods are called without any locks being held, unless