Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.c | 5
-rw-r--r--  fs/affs/super.c | 6
-rw-r--r--  fs/afs/cell.c | 20
-rw-r--r--  fs/afs/cmservice.c | 16
-rw-r--r--  fs/afs/dir.c | 142
-rw-r--r--  fs/afs/dir_edit.c | 12
-rw-r--r--  fs/afs/dir_silly.c | 23
-rw-r--r--  fs/afs/dynroot.c | 3
-rw-r--r--  fs/afs/fs_probe.c | 20
-rw-r--r--  fs/afs/fsclient.c | 101
-rw-r--r--  fs/afs/inode.c | 12
-rw-r--r--  fs/afs/internal.h | 17
-rw-r--r--  fs/afs/mntpt.c | 6
-rw-r--r--  fs/afs/proc.c | 1
-rw-r--r--  fs/afs/rotate.c | 6
-rw-r--r--  fs/afs/rxrpc.c | 77
-rw-r--r--  fs/afs/server.c | 28
-rw-r--r--  fs/afs/super.c | 2
-rw-r--r--  fs/afs/vl_probe.c | 18
-rw-r--r--  fs/afs/volume.c | 8
-rw-r--r--  fs/afs/write.c | 5
-rw-r--r--  fs/afs/yfsclient.c | 101
-rw-r--r--  fs/aio.c | 54
-rw-r--r--  fs/autofs/expire.c | 5
-rw-r--r--  fs/binfmt_elf.c | 2
-rw-r--r--  fs/block_dev.c | 12
-rw-r--r--  fs/btrfs/async-thread.c | 110
-rw-r--r--  fs/btrfs/async-thread.h | 34
-rw-r--r--  fs/btrfs/backref.c | 1
-rw-r--r--  fs/btrfs/check-integrity.c | 3
-rw-r--r--  fs/btrfs/ctree.c | 10
-rw-r--r--  fs/btrfs/ctree.h | 11
-rw-r--r--  fs/btrfs/delayed-inode.c | 30
-rw-r--r--  fs/btrfs/delayed-ref.c | 8
-rw-r--r--  fs/btrfs/disk-io.c | 102
-rw-r--r--  fs/btrfs/extent-tree.c | 51
-rw-r--r--  fs/btrfs/extent_io.c | 170
-rw-r--r--  fs/btrfs/extent_map.c | 11
-rw-r--r--  fs/btrfs/file-item.c | 16
-rw-r--r--  fs/btrfs/file.c | 25
-rw-r--r--  fs/btrfs/free-space-cache.c | 10
-rw-r--r--  fs/btrfs/inode-map.c | 5
-rw-r--r--  fs/btrfs/inode.c | 378
-rw-r--r--  fs/btrfs/ioctl.c | 33
-rw-r--r--  fs/btrfs/ordered-data.c | 8
-rw-r--r--  fs/btrfs/qgroup.c | 49
-rw-r--r--  fs/btrfs/qgroup.h | 1
-rw-r--r--  fs/btrfs/raid56.c | 5
-rw-r--r--  fs/btrfs/reada.c | 13
-rw-r--r--  fs/btrfs/ref-verify.c | 7
-rw-r--r--  fs/btrfs/relocation.c | 72
-rw-r--r--  fs/btrfs/root-tree.c | 10
-rw-r--r--  fs/btrfs/scrub.c | 17
-rw-r--r--  fs/btrfs/send.c | 100
-rw-r--r--  fs/btrfs/super.c | 47
-rw-r--r--  fs/btrfs/tests/btrfs-tests.c | 1
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c | 112
-rw-r--r--  fs/btrfs/tests/free-space-tree-tests.c | 4
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c | 4
-rw-r--r--  fs/btrfs/transaction.c | 23
-rw-r--r--  fs/btrfs/tree-checker.c | 26
-rw-r--r--  fs/btrfs/tree-log.c | 535
-rw-r--r--  fs/btrfs/uuid-tree.c | 2
-rw-r--r--  fs/btrfs/volumes.c | 105
-rw-r--r--  fs/btrfs/volumes.h | 1
-rw-r--r--  fs/buffer.c | 44
-rw-r--r--  fs/cachefiles/rdwr.c | 2
-rw-r--r--  fs/ceph/caps.c | 21
-rw-r--r--  fs/ceph/dir.c | 15
-rw-r--r--  fs/ceph/export.c | 14
-rw-r--r--  fs/ceph/file.c | 29
-rw-r--r--  fs/ceph/inode.c | 1
-rw-r--r--  fs/ceph/mds_client.c | 21
-rw-r--r--  fs/ceph/quota.c | 4
-rw-r--r--  fs/ceph/snap.c | 1
-rw-r--r--  fs/ceph/super.c | 72
-rw-r--r--  fs/ceph/super.h | 2
-rw-r--r--  fs/char_dev.c | 2
-rw-r--r--  fs/cifs/cifs_debug.c | 5
-rw-r--r--  fs/cifs/cifs_dfs_ref.c | 99
-rw-r--r--  fs/cifs/cifsacl.c | 4
-rw-r--r--  fs/cifs/cifsfs.c | 6
-rw-r--r--  fs/cifs/cifsglob.h | 7
-rw-r--r--  fs/cifs/cifsproto.h | 1
-rw-r--r--  fs/cifs/cifssmb.c | 2
-rw-r--r--  fs/cifs/connect.c | 151
-rw-r--r--  fs/cifs/dfs_cache.c | 5
-rw-r--r--  fs/cifs/dir.c | 1
-rw-r--r--  fs/cifs/file.c | 42
-rw-r--r--  fs/cifs/inode.c | 42
-rw-r--r--  fs/cifs/netmisc.c | 4
-rw-r--r--  fs/cifs/smb2file.c | 4
-rw-r--r--  fs/cifs/smb2misc.c | 139
-rw-r--r--  fs/cifs/smb2ops.c | 71
-rw-r--r--  fs/cifs/smb2pdu.c | 78
-rw-r--r--  fs/cifs/smb2pdu.h | 1
-rw-r--r--  fs/cifs/smb2proto.h | 3
-rw-r--r--  fs/cifs/smb2transport.c | 2
-rw-r--r--  fs/cifs/smbdirect.c | 36
-rw-r--r--  fs/cifs/transport.c | 92
-rw-r--r--  fs/compat_ioctl.c | 3
-rw-r--r--  fs/configfs/symlink.c | 33
-rw-r--r--  fs/coredump.c | 8
-rw-r--r--  fs/dax.c | 3
-rw-r--r--  fs/debugfs/file.c | 18
-rw-r--r--  fs/dlm/dlm_internal.h | 1
-rw-r--r--  fs/dlm/lockspace.c | 6
-rw-r--r--  fs/drop_caches.c | 2
-rw-r--r--  fs/ecryptfs/crypto.c | 6
-rw-r--r--  fs/ecryptfs/inode.c | 84
-rw-r--r--  fs/ecryptfs/keystore.c | 2
-rw-r--r--  fs/ecryptfs/messaging.c | 1
-rw-r--r--  fs/eventfd.c | 15
-rw-r--r--  fs/eventpoll.c | 59
-rw-r--r--  fs/exec.c | 6
-rw-r--r--  fs/exportfs/expfs.c | 31
-rw-r--r--  fs/ext2/ialloc.c | 3
-rw-r--r--  fs/ext2/inode.c | 7
-rw-r--r--  fs/ext2/super.c | 6
-rw-r--r--  fs/ext2/xattr.c | 8
-rw-r--r--  fs/ext4/balloc.c | 14
-rw-r--r--  fs/ext4/block_validity.c | 1
-rw-r--r--  fs/ext4/dir.c | 45
-rw-r--r--  fs/ext4/ext4.h | 83
-rw-r--r--  fs/ext4/ext4_extents.h | 9
-rw-r--r--  fs/ext4/extents.c | 57
-rw-r--r--  fs/ext4/file.c | 10
-rw-r--r--  fs/ext4/fsync.c | 28
-rw-r--r--  fs/ext4/hash.c | 2
-rw-r--r--  fs/ext4/ialloc.c | 25
-rw-r--r--  fs/ext4/inode.c | 102
-rw-r--r--  fs/ext4/mballoc.c | 67
-rw-r--r--  fs/ext4/migrate.c | 27
-rw-r--r--  fs/ext4/mmp.c | 12
-rw-r--r--  fs/ext4/namei.c | 123
-rw-r--r--  fs/ext4/page-io.c | 19
-rw-r--r--  fs/ext4/resize.c | 62
-rw-r--r--  fs/ext4/super.c | 331
-rw-r--r--  fs/ext4/xattr.c | 7
-rw-r--r--  fs/f2fs/checkpoint.c | 20
-rw-r--r--  fs/f2fs/data.c | 15
-rw-r--r--  fs/f2fs/f2fs.h | 34
-rw-r--r--  fs/f2fs/file.c | 61
-rw-r--r--  fs/f2fs/inode.c | 6
-rw-r--r--  fs/f2fs/namei.c | 42
-rw-r--r--  fs/f2fs/node.c | 17
-rw-r--r--  fs/f2fs/super.c | 88
-rw-r--r--  fs/f2fs/sysfs.c | 12
-rw-r--r--  fs/f2fs/xattr.c | 14
-rw-r--r--  fs/fat/inode.c | 25
-rw-r--r--  fs/file.c | 2
-rw-r--r--  fs/filesystems.c | 4
-rw-r--r--  fs/fs-writeback.c | 12
-rw-r--r--  fs/fuse/dev.c | 3
-rw-r--r--  fs/fuse/dir.c | 38
-rw-r--r--  fs/fuse/file.c | 66
-rw-r--r--  fs/fuse/fuse_i.h | 2
-rw-r--r--  fs/fuse/readdir.c | 2
-rw-r--r--  fs/gfs2/bmap.c | 33
-rw-r--r--  fs/gfs2/file.c | 102
-rw-r--r--  fs/gfs2/glops.c | 27
-rw-r--r--  fs/gfs2/inode.c | 2
-rw-r--r--  fs/gfs2/log.c | 27
-rw-r--r--  fs/gfs2/log.h | 2
-rw-r--r--  fs/gfs2/lops.c | 77
-rw-r--r--  fs/gfs2/ops_fstype.c | 14
-rw-r--r--  fs/gfs2/quota.c | 3
-rw-r--r--  fs/gfs2/quota.h | 3
-rw-r--r--  fs/gfs2/trans.c | 2
-rw-r--r--  fs/hfsplus/attributes.c | 4
-rw-r--r--  fs/hugetlbfs/inode.c | 33
-rw-r--r--  fs/inode.c | 8
-rw-r--r--  fs/internal.h | 2
-rw-r--r--  fs/io_uring.c | 101
-rw-r--r--  fs/ioctl.c | 35
-rw-r--r--  fs/iomap.c | 28
-rw-r--r--  fs/jbd2/checkpoint.c | 2
-rw-r--r--  fs/jbd2/commit.c | 55
-rw-r--r--  fs/jbd2/journal.c | 122
-rw-r--r--  fs/jbd2/transaction.c | 26
-rw-r--r--  fs/kernfs/dir.c | 5
-rw-r--r--  fs/kernfs/file.c | 2
-rw-r--r--  fs/libfs.c | 8
-rw-r--r--  fs/locks.c | 62
-rw-r--r--  fs/minix/inode.c | 42
-rw-r--r--  fs/minix/itree_common.c | 8
-rw-r--r--  fs/minix/itree_v1.c | 12
-rw-r--r--  fs/minix/itree_v2.c | 13
-rw-r--r--  fs/minix/minix.h | 1
-rw-r--r--  fs/mpage.c | 2
-rw-r--r--  fs/namei.c | 19
-rw-r--r--  fs/namespace.c | 2
-rw-r--r--  fs/nfs/Kconfig | 2
-rw-r--r--  fs/nfs/callback_proc.c | 2
-rw-r--r--  fs/nfs/client.c | 1
-rw-r--r--  fs/nfs/delegation.c | 12
-rw-r--r--  fs/nfs/delegation.h | 1
-rw-r--r--  fs/nfs/dir.c | 47
-rw-r--r--  fs/nfs/direct.c | 6
-rw-r--r--  fs/nfs/file.c | 17
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 11
-rw-r--r--  fs/nfs/fscache-index.c | 6
-rw-r--r--  fs/nfs/fscache.c | 33
-rw-r--r--  fs/nfs/fscache.h | 8
-rw-r--r--  fs/nfs/inode.c | 24
-rw-r--r--  fs/nfs/mount_clnt.c | 5
-rw-r--r--  fs/nfs/nfs2xdr.c | 2
-rw-r--r--  fs/nfs/nfs3acl.c | 22
-rw-r--r--  fs/nfs/nfs3xdr.c | 5
-rw-r--r--  fs/nfs/nfs42proc.c | 4
-rw-r--r--  fs/nfs/nfs4_fs.h | 4
-rw-r--r--  fs/nfs/nfs4client.c | 1
-rw-r--r--  fs/nfs/nfs4file.c | 9
-rw-r--r--  fs/nfs/nfs4proc.c | 115
-rw-r--r--  fs/nfs/nfs4renewd.c | 5
-rw-r--r--  fs/nfs/nfs4state.c | 6
-rw-r--r--  fs/nfs/nfs4xdr.c | 11
-rw-r--r--  fs/nfs/pagelist.c | 59
-rw-r--r--  fs/nfs/pnfs.c | 48
-rw-r--r--  fs/nfs/pnfs_nfs.c | 7
-rw-r--r--  fs/nfs/write.c | 25
-rw-r--r--  fs/nfsd/Kconfig | 1
-rw-r--r--  fs/nfsd/nfs4callback.c | 6
-rw-r--r--  fs/nfsd/nfs4layouts.c | 2
-rw-r--r--  fs/nfsd/nfs4proc.c | 6
-rw-r--r--  fs/nfsd/nfs4state.c | 19
-rw-r--r--  fs/nfsd/nfs4xdr.c | 20
-rw-r--r--  fs/nfsd/nfscache.c | 3
-rw-r--r--  fs/nfsd/nfssvc.c | 3
-rw-r--r--  fs/nfsd/state.h | 2
-rw-r--r--  fs/nfsd/vfs.c | 34
-rw-r--r--  fs/nfsd/vfs.h | 2
-rw-r--r--  fs/nilfs2/segment.c | 2
-rw-r--r--  fs/notify/fanotify/fanotify.c | 25
-rw-r--r--  fs/notify/fsnotify.c | 4
-rw-r--r--  fs/notify/inotify/inotify_fsnotify.c | 4
-rw-r--r--  fs/notify/inotify/inotify_user.c | 2
-rw-r--r--  fs/ocfs2/acl.c | 4
-rw-r--r--  fs/ocfs2/alloc.c | 4
-rw-r--r--  fs/ocfs2/aops.c | 25
-rw-r--r--  fs/ocfs2/dlm/Makefile | 2
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmconvert.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmlock.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmmaster.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmthread.c | 8
-rw-r--r--  fs/ocfs2/dlm/dlmunlock.c | 8
-rw-r--r--  fs/ocfs2/dlmfs/Makefile | 2
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 31
-rw-r--r--  fs/ocfs2/dlmfs/userdlm.c | 6
-rw-r--r--  fs/ocfs2/dlmglue.c | 25
-rw-r--r--  fs/ocfs2/file.c | 132
-rw-r--r--  fs/ocfs2/ioctl.c | 2
-rw-r--r--  fs/ocfs2/journal.c | 8
-rw-r--r--  fs/ocfs2/journal.h | 8
-rw-r--r--  fs/ocfs2/ocfs2.h | 5
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 4
-rw-r--r--  fs/ocfs2/quota_global.c | 2
-rw-r--r--  fs/ocfs2/suballoc.c | 13
-rw-r--r--  fs/ocfs2/super.c | 4
-rw-r--r--  fs/open.c | 3
-rw-r--r--  fs/orangefs/file.c | 26
-rw-r--r--  fs/orangefs/inode.c | 39
-rw-r--r--  fs/orangefs/orangefs-debugfs.c | 1
-rw-r--r--  fs/orangefs/orangefs-kernel.h | 4
-rw-r--r--  fs/overlayfs/copy_up.c | 2
-rw-r--r--  fs/overlayfs/dir.c | 2
-rw-r--r--  fs/overlayfs/export.c | 2
-rw-r--r--  fs/overlayfs/file.c | 12
-rw-r--r--  fs/overlayfs/inode.c | 12
-rw-r--r--  fs/overlayfs/namei.c | 8
-rw-r--r--  fs/overlayfs/ovl_entry.h | 2
-rw-r--r--  fs/overlayfs/readdir.c | 8
-rw-r--r--  fs/overlayfs/super.c | 47
-rw-r--r--  fs/pnode.c | 9
-rw-r--r--  fs/proc/inode.c | 2
-rw-r--r--  fs/proc/self.c | 2
-rw-r--r--  fs/proc/thread_self.c | 2
-rw-r--r--  fs/proc/vmcore.c | 5
-rw-r--r--  fs/pstore/inode.c | 5
-rw-r--r--  fs/pstore/platform.c | 9
-rw-r--r--  fs/pstore/ram.c | 13
-rw-r--r--  fs/pstore/ram_core.c | 2
-rw-r--r--  fs/quota/dquot.c | 41
-rw-r--r--  fs/read_write.c | 10
-rw-r--r--  fs/readdir.c | 79
-rw-r--r--  fs/reiserfs/inode.c | 12
-rw-r--r--  fs/reiserfs/namei.c | 7
-rw-r--r--  fs/reiserfs/reiserfs.h | 2
-rw-r--r--  fs/reiserfs/stree.c | 3
-rw-r--r--  fs/reiserfs/super.c | 6
-rw-r--r--  fs/reiserfs/xattr.c | 27
-rw-r--r--  fs/reiserfs/xattr_acl.c | 4
-rw-r--r--  fs/select.c | 34
-rw-r--r--  fs/splice.c | 14
-rw-r--r--  fs/super.c | 4
-rw-r--r--  fs/ubifs/auth.c | 17
-rw-r--r--  fs/ubifs/dir.c | 2
-rw-r--r--  fs/ubifs/file.c | 10
-rw-r--r--  fs/ubifs/ioctl.c | 3
-rw-r--r--  fs/ubifs/journal.c | 12
-rw-r--r--  fs/ubifs/orphan.c | 52
-rw-r--r--  fs/ubifs/replay.c | 13
-rw-r--r--  fs/ubifs/sb.c | 2
-rw-r--r--  fs/ubifs/super.c | 2
-rw-r--r--  fs/ubifs/tnc_commit.c | 34
-rw-r--r--  fs/udf/super.c | 23
-rw-r--r--  fs/ufs/super.c | 2
-rw-r--r--  fs/unicode/utf8-core.c | 28
-rw-r--r--  fs/userfaultfd.c | 18
-rw-r--r--  fs/xattr.c | 84
-rw-r--r--  fs/xfs/libxfs/xfs_alloc.c | 16
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 4
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_trans_space.h | 2
-rw-r--r--  fs/xfs/scrub/bmap.c | 22
-rw-r--r--  fs/xfs/scrub/common.h | 9
-rw-r--r--  fs/xfs/xfs_bmap_util.c | 2
-rw-r--r--  fs/xfs/xfs_buf.c | 8
-rw-r--r--  fs/xfs/xfs_dquot.c | 9
-rw-r--r--  fs/xfs/xfs_icache.c | 10
-rw-r--r--  fs/xfs/xfs_ioctl.c | 5
-rw-r--r--  fs/xfs/xfs_log.c | 2
-rw-r--r--  fs/xfs/xfs_quotaops.c | 3
-rw-r--r--  fs/xfs/xfs_reflink.c | 22
-rw-r--r--  fs/xfs/xfs_trans_ail.c | 4
329 files changed, 5344 insertions(+), 3068 deletions(-)
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 15a99f9c7253..39def020a074 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -500,10 +500,9 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
}
#ifdef CONFIG_9P_FSCACHE
- if (v9ses->fscache) {
+ if (v9ses->fscache)
v9fs_cache_session_put_cookie(v9ses);
- kfree(v9ses->cachetag);
- }
+ kfree(v9ses->cachetag);
#endif
kfree(v9ses->uname);
kfree(v9ses->aname);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index e7d036efbaa1..7050d463cdb2 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -557,14 +557,9 @@ affs_remount(struct super_block *sb, int *flags, char *data)
int root_block;
unsigned long mount_flags;
int res = 0;
- char *new_opts;
char volume[32];
char *prefix = NULL;
- new_opts = kstrdup(data, GFP_KERNEL);
- if (data && !new_opts)
- return -ENOMEM;
-
pr_debug("%s(flags=0x%x,opts=\"%s\")\n", __func__, *flags, data);
sync_filesystem(sb);
@@ -575,7 +570,6 @@ affs_remount(struct super_block *sb, int *flags, char *data)
&blocksize, &prefix, volume,
&mount_flags)) {
kfree(prefix);
- kfree(new_opts);
return -EINVAL;
}
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index fd5133e26a38..296b489861a9 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -134,8 +134,17 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
_leave(" = -ENAMETOOLONG");
return ERR_PTR(-ENAMETOOLONG);
}
- if (namelen == 5 && memcmp(name, "@cell", 5) == 0)
+
+ /* Prohibit cell names that contain unprintable chars, '/' and '@' or
+ * that begin with a dot. This also precludes "@cell".
+ */
+ if (name[0] == '.')
return ERR_PTR(-EINVAL);
+ for (i = 0; i < namelen; i++) {
+ char ch = name[i];
+ if (!isprint(ch) || ch == '/' || ch == '@')
+ return ERR_PTR(-EINVAL);
+ }
_enter("%*.*s,%s", namelen, namelen, name, addresses);
@@ -145,10 +154,17 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
return ERR_PTR(-ENOMEM);
}
+ cell->name = kmalloc(namelen + 1, GFP_KERNEL);
+ if (!cell->name) {
+ kfree(cell);
+ return ERR_PTR(-ENOMEM);
+ }
+
cell->net = net;
cell->name_len = namelen;
for (i = 0; i < namelen; i++)
cell->name[i] = tolower(name[i]);
+ cell->name[i] = 0;
atomic_set(&cell->usage, 2);
INIT_WORK(&cell->manager, afs_manage_cell);
@@ -194,6 +210,7 @@ parse_failed:
if (ret == -EINVAL)
printk(KERN_ERR "kAFS: bad VL server IP address\n");
error:
+ kfree(cell->name);
kfree(cell);
_leave(" = %d", ret);
return ERR_PTR(ret);
@@ -474,6 +491,7 @@ static void afs_cell_destroy(struct rcu_head *rcu)
afs_put_vlserverlist(cell->net, rcu_access_pointer(cell->vl_servers));
key_put(cell->anonymous_key);
+ kfree(cell->name);
kfree(cell);
_leave(" [destroyed]");
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 00033a481ba0..5778a9b9e9bf 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -169,7 +169,7 @@ static int afs_record_cm_probe(struct afs_call *call, struct afs_server *server)
spin_lock(&server->probe_lock);
- if (!test_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) {
+ if (!test_and_set_bit(AFS_SERVER_FL_HAVE_EPOCH, &server->flags)) {
server->cm_epoch = call->epoch;
server->probe.cm_epoch = call->epoch;
goto out;
@@ -244,6 +244,17 @@ static void afs_cm_destructor(struct afs_call *call)
}
/*
+ * Abort a service call from within an action function.
+ */
+static void afs_abort_service_call(struct afs_call *call, u32 abort_code, int error,
+ const char *why)
+{
+ rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
+ abort_code, error, why);
+ afs_set_call_complete(call, error, 0);
+}
+
+/*
* The server supplied a list of callbacks that it wanted to break.
*/
static void SRXAFSCB_CallBack(struct work_struct *work)
@@ -507,8 +518,7 @@ static void SRXAFSCB_ProbeUuid(struct work_struct *work)
if (memcmp(r, &call->net->uuid, sizeof(call->net->uuid)) == 0)
afs_send_empty_reply(call);
else
- rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
- 1, 1, "K-1");
+ afs_abort_service_call(call, 1, 1, "K-1");
afs_put_call(call);
_leave("");
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 9bd5c067d55d..429698d0a596 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -659,7 +659,8 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
cookie->ctx.actor = afs_lookup_filldir;
cookie->name = dentry->d_name;
- cookie->nr_fids = 1; /* slot 0 is saved for the fid we actually want */
+ cookie->nr_fids = 2; /* slot 0 is saved for the fid we actually want
+ * and slot 1 for the directory */
read_seqlock_excl(&dvnode->cb_lock);
dcbi = rcu_dereference_protected(dvnode->cb_interest,
@@ -710,7 +711,11 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
if (!cookie->inodes)
goto out_s;
- for (i = 1; i < cookie->nr_fids; i++) {
+ cookie->fids[1] = dvnode->fid;
+ cookie->statuses[1].cb_break = afs_calc_vnode_cb_break(dvnode);
+ cookie->inodes[1] = igrab(&dvnode->vfs_inode);
+
+ for (i = 2; i < cookie->nr_fids; i++) {
scb = &cookie->statuses[i];
/* Find any inodes that already exist and get their
@@ -804,7 +809,12 @@ success:
continue;
if (cookie->inodes[i]) {
- afs_vnode_commit_status(&fc, AFS_FS_I(cookie->inodes[i]),
+ struct afs_vnode *iv = AFS_FS_I(cookie->inodes[i]);
+
+ if (test_bit(AFS_VNODE_UNSET, &iv->flags))
+ continue;
+
+ afs_vnode_commit_status(&fc, iv,
scb->cb_break, NULL, scb);
continue;
}
@@ -904,6 +914,7 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
struct afs_vnode *dvnode = AFS_FS_I(dir);
+ struct afs_fid fid = {};
struct inode *inode;
struct dentry *d;
struct key *key;
@@ -947,21 +958,18 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
afs_stat_v(dvnode, n_lookup);
inode = afs_do_lookup(dir, dentry, key);
key_put(key);
- if (inode == ERR_PTR(-ENOENT)) {
+ if (inode == ERR_PTR(-ENOENT))
inode = afs_try_auto_mntpt(dentry, dir);
- } else {
- dentry->d_fsdata =
- (void *)(unsigned long)dvnode->status.data_version;
- }
+
+ if (!IS_ERR_OR_NULL(inode))
+ fid = AFS_FS_I(inode)->fid;
+
d = d_splice_alias(inode, dentry);
if (!IS_ERR_OR_NULL(d)) {
d->d_fsdata = dentry->d_fsdata;
- trace_afs_lookup(dvnode, &d->d_name,
- inode ? AFS_FS_I(inode) : NULL);
+ trace_afs_lookup(dvnode, &d->d_name, &fid);
} else {
- trace_afs_lookup(dvnode, &dentry->d_name,
- IS_ERR_OR_NULL(inode) ? NULL
- : AFS_FS_I(inode));
+ trace_afs_lookup(dvnode, &dentry->d_name, &fid);
}
return d;
}
@@ -978,7 +986,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *parent;
struct inode *inode;
struct key *key;
- afs_dataversion_t dir_version;
+ afs_dataversion_t dir_version, invalid_before;
long de_version;
int ret;
@@ -1030,8 +1038,8 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
if (de_version == (long)dir_version)
goto out_valid_noupdate;
- dir_version = dir->invalid_before;
- if (de_version - (long)dir_version >= 0)
+ invalid_before = dir->invalid_before;
+ if (de_version - (long)invalid_before >= 0)
goto out_valid;
_debug("dir modified");
@@ -1221,6 +1229,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
struct afs_fs_cursor fc;
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct key *key;
+ afs_dataversion_t data_version;
int ret;
mode |= S_IFDIR;
@@ -1241,7 +1250,7 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1262,10 +1271,14 @@ static int afs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
goto error_key;
}
- if (ret == 0 &&
- test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
- afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
- afs_edit_dir_for_create);
+ if (ret == 0) {
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
+ afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
+ afs_edit_dir_for_create);
+ up_write(&dvnode->validate_lock);
+ }
key_put(key);
kfree(scb);
@@ -1306,6 +1319,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
struct afs_fs_cursor fc;
struct afs_vnode *dvnode = AFS_FS_I(dir), *vnode = NULL;
struct key *key;
+ afs_dataversion_t data_version;
int ret;
_enter("{%llx:%llu},{%pd}",
@@ -1337,7 +1351,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1350,9 +1364,12 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
ret = afs_end_vnode_operation(&fc);
if (ret == 0) {
afs_dir_remove_subdir(dentry);
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
afs_edit_dir_remove(dvnode, &dentry->d_name,
afs_edit_dir_for_rmdir);
+ up_write(&dvnode->validate_lock);
}
}
@@ -1492,10 +1509,15 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
ret = afs_end_vnode_operation(&fc);
if (ret == 0 && !(scb[1].have_status || scb[1].have_error))
ret = afs_dir_remove_link(dvnode, dentry, key);
- if (ret == 0 &&
- test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
- afs_edit_dir_remove(dvnode, &dentry->d_name,
- afs_edit_dir_for_unlink);
+
+ if (ret == 0) {
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
+ afs_edit_dir_remove(dvnode, &dentry->d_name,
+ afs_edit_dir_for_unlink);
+ up_write(&dvnode->validate_lock);
+ }
}
if (need_rehash && ret < 0 && ret != -ENOENT)
@@ -1521,6 +1543,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
struct afs_status_cb *scb;
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct key *key;
+ afs_dataversion_t data_version;
int ret;
mode |= S_IFREG;
@@ -1545,7 +1568,7 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1566,9 +1589,12 @@ static int afs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
goto error_key;
}
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
afs_edit_dir_for_create);
+ up_write(&dvnode->validate_lock);
kfree(scb);
key_put(key);
@@ -1596,6 +1622,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct afs_vnode *vnode = AFS_FS_I(d_inode(from));
struct key *key;
+ afs_dataversion_t data_version;
int ret;
_enter("{%llx:%llu},{%llx:%llu},{%pd}",
@@ -1620,7 +1647,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
if (mutex_lock_interruptible_nested(&vnode->io_lock, 1) < 0) {
afs_end_vnode_operation(&fc);
@@ -1650,9 +1677,12 @@ static int afs_link(struct dentry *from, struct inode *dir,
goto error_key;
}
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
afs_edit_dir_add(dvnode, &dentry->d_name, &vnode->fid,
afs_edit_dir_for_link);
+ up_write(&dvnode->validate_lock);
key_put(key);
kfree(scb);
@@ -1680,6 +1710,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
struct afs_status_cb *scb;
struct afs_vnode *dvnode = AFS_FS_I(dir);
struct key *key;
+ afs_dataversion_t data_version;
int ret;
_enter("{%llx:%llu},{%pd},%s",
@@ -1707,7 +1738,7 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t data_version = dvnode->status.data_version + 1;
+ data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -1728,9 +1759,12 @@ static int afs_symlink(struct inode *dir, struct dentry *dentry,
goto error_key;
}
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == data_version)
afs_edit_dir_add(dvnode, &dentry->d_name, &iget_data.fid,
afs_edit_dir_for_symlink);
+ up_write(&dvnode->validate_lock);
key_put(key);
kfree(scb);
@@ -1760,6 +1794,8 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct dentry *tmp = NULL, *rehash = NULL;
struct inode *new_inode;
struct key *key;
+ afs_dataversion_t orig_data_version;
+ afs_dataversion_t new_data_version;
bool new_negative = d_is_negative(new_dentry);
int ret;
@@ -1838,10 +1874,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
ret = -ERESTARTSYS;
if (afs_begin_vnode_operation(&fc, orig_dvnode, key, true)) {
- afs_dataversion_t orig_data_version;
- afs_dataversion_t new_data_version;
- struct afs_status_cb *new_scb = &scb[1];
-
orig_data_version = orig_dvnode->status.data_version + 1;
if (orig_dvnode != new_dvnode) {
@@ -1852,7 +1884,6 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_data_version = new_dvnode->status.data_version + 1;
} else {
new_data_version = orig_data_version;
- new_scb = &scb[0];
}
while (afs_select_fileserver(&fc)) {
@@ -1860,7 +1891,7 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
fc.cb_break_2 = afs_calc_vnode_cb_break(new_dvnode);
afs_fs_rename(&fc, old_dentry->d_name.name,
new_dvnode, new_dentry->d_name.name,
- &scb[0], new_scb);
+ &scb[0], &scb[1]);
}
afs_vnode_commit_status(&fc, orig_dvnode, fc.cb_break,
@@ -1878,18 +1909,25 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret == 0) {
if (rehash)
d_rehash(rehash);
- if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags))
- afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name,
- afs_edit_dir_for_rename_0);
+ down_write(&orig_dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &orig_dvnode->flags) &&
+ orig_dvnode->status.data_version == orig_data_version)
+ afs_edit_dir_remove(orig_dvnode, &old_dentry->d_name,
+ afs_edit_dir_for_rename_0);
+ if (orig_dvnode != new_dvnode) {
+ up_write(&orig_dvnode->validate_lock);
- if (!new_negative &&
- test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags))
- afs_edit_dir_remove(new_dvnode, &new_dentry->d_name,
- afs_edit_dir_for_rename_1);
+ down_write(&new_dvnode->validate_lock);
+ }
+ if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags) &&
+ orig_dvnode->status.data_version == new_data_version) {
+ if (!new_negative)
+ afs_edit_dir_remove(new_dvnode, &new_dentry->d_name,
+ afs_edit_dir_for_rename_1);
- if (test_bit(AFS_VNODE_DIR_VALID, &new_dvnode->flags))
afs_edit_dir_add(new_dvnode, &new_dentry->d_name,
&vnode->fid, afs_edit_dir_for_rename_2);
+ }
new_inode = d_inode(new_dentry);
if (new_inode) {
@@ -1905,14 +1943,10 @@ static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
* Note that if we ever implement RENAME_EXCHANGE, we'll have
* to update both dentries with opposing dir versions.
*/
- if (new_dvnode != orig_dvnode) {
- afs_update_dentry_version(&fc, old_dentry, &scb[1]);
- afs_update_dentry_version(&fc, new_dentry, &scb[1]);
- } else {
- afs_update_dentry_version(&fc, old_dentry, &scb[0]);
- afs_update_dentry_version(&fc, new_dentry, &scb[0]);
- }
+ afs_update_dentry_version(&fc, old_dentry, &scb[1]);
+ afs_update_dentry_version(&fc, new_dentry, &scb[1]);
d_move(old_dentry, new_dentry);
+ up_write(&new_dvnode->validate_lock);
goto error_tmp;
}
diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c
index d4fbe5f85f1b..b108528bf010 100644
--- a/fs/afs/dir_edit.c
+++ b/fs/afs/dir_edit.c
@@ -68,13 +68,11 @@ static int afs_find_contig_bits(union afs_xdr_dir_block *block, unsigned int nr_
static void afs_set_contig_bits(union afs_xdr_dir_block *block,
int bit, unsigned int nr_slots)
{
- u64 mask, before, after;
+ u64 mask;
mask = (1 << nr_slots) - 1;
mask <<= bit;
- before = *(u64 *)block->hdr.bitmap;
-
block->hdr.bitmap[0] |= (u8)(mask >> 0 * 8);
block->hdr.bitmap[1] |= (u8)(mask >> 1 * 8);
block->hdr.bitmap[2] |= (u8)(mask >> 2 * 8);
@@ -83,8 +81,6 @@ static void afs_set_contig_bits(union afs_xdr_dir_block *block,
block->hdr.bitmap[5] |= (u8)(mask >> 5 * 8);
block->hdr.bitmap[6] |= (u8)(mask >> 6 * 8);
block->hdr.bitmap[7] |= (u8)(mask >> 7 * 8);
-
- after = *(u64 *)block->hdr.bitmap;
}
/*
@@ -93,13 +89,11 @@ static void afs_set_contig_bits(union afs_xdr_dir_block *block,
static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
int bit, unsigned int nr_slots)
{
- u64 mask, before, after;
+ u64 mask;
mask = (1 << nr_slots) - 1;
mask <<= bit;
- before = *(u64 *)block->hdr.bitmap;
-
block->hdr.bitmap[0] &= ~(u8)(mask >> 0 * 8);
block->hdr.bitmap[1] &= ~(u8)(mask >> 1 * 8);
block->hdr.bitmap[2] &= ~(u8)(mask >> 2 * 8);
@@ -108,8 +102,6 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block,
block->hdr.bitmap[5] &= ~(u8)(mask >> 5 * 8);
block->hdr.bitmap[6] &= ~(u8)(mask >> 6 * 8);
block->hdr.bitmap[7] &= ~(u8)(mask >> 7 * 8);
-
- after = *(u64 *)block->hdr.bitmap;
}
/*
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 057b8d322422..bb3fcbfd3afc 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -21,6 +21,7 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
{
struct afs_fs_cursor fc;
struct afs_status_cb *scb;
+ afs_dataversion_t dir_data_version;
int ret = -ERESTARTSYS;
_enter("%pd,%pd", old, new);
@@ -31,7 +32,7 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
trace_afs_silly_rename(vnode, false);
if (afs_begin_vnode_operation(&fc, dvnode, key, true)) {
- afs_dataversion_t dir_data_version = dvnode->status.data_version + 1;
+ dir_data_version = dvnode->status.data_version + 1;
while (afs_select_fileserver(&fc)) {
fc.cb_break = afs_calc_vnode_cb_break(dvnode);
@@ -54,13 +55,17 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
dvnode->silly_key = key_get(key);
}
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == dir_data_version) {
afs_edit_dir_remove(dvnode, &old->d_name,
afs_edit_dir_for_silly_0);
- if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
afs_edit_dir_add(dvnode, &new->d_name,
&vnode->fid, afs_edit_dir_for_silly_1);
+ }
+ up_write(&dvnode->validate_lock);
+
/* vfs_unlink and the like do not issue this when a file is
* sillyrenamed, so do it here.
*/
@@ -186,10 +191,14 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
}
}
- if (ret == 0 &&
- test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags))
- afs_edit_dir_remove(dvnode, &dentry->d_name,
- afs_edit_dir_for_unlink);
+ if (ret == 0) {
+ down_write(&dvnode->validate_lock);
+ if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags) &&
+ dvnode->status.data_version == dir_data_version)
+ afs_edit_dir_remove(dvnode, &dentry->d_name,
+ afs_edit_dir_for_unlink);
+ up_write(&dvnode->validate_lock);
+ }
}
kfree(scb);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 9b3b2f1f1fc0..13522d2b0ec9 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -141,6 +141,9 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr
ASSERTCMP(d_inode(dentry), ==, NULL);
+ if (flags & LOOKUP_CREATE)
+ return ERR_PTR(-EOPNOTSUPP);
+
if (dentry->d_name.len >= AFSNAMEMAX) {
_leave(" = -ENAMETOOLONG");
return ERR_PTR(-ENAMETOOLONG);
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index cfe62b154f68..51ee3dd79700 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -32,9 +32,8 @@ void afs_fileserver_probe_result(struct afs_call *call)
struct afs_server *server = call->server;
unsigned int server_index = call->server_index;
unsigned int index = call->addr_ix;
- unsigned int rtt = UINT_MAX;
+ unsigned int rtt_us;
bool have_result = false;
- u64 _rtt;
int ret = call->error;
_enter("%pU,%u", &server->uuid, index);
@@ -93,15 +92,9 @@ responded:
}
}
- /* Get the RTT and scale it to fit into a 32-bit value that represents
- * over a minute of time so that we can access it with one instruction
- * on a 32-bit system.
- */
- _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
- _rtt /= 64;
- rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt;
- if (rtt < server->probe.rtt) {
- server->probe.rtt = rtt;
+ if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
+ rtt_us < server->probe.rtt) {
+ server->probe.rtt = rtt_us;
alist->preferred = index;
have_result = true;
}
@@ -113,8 +106,7 @@ out:
spin_unlock(&server->probe_lock);
_debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
- server_index, index, &alist->addrs[index].transport,
- (unsigned int)rtt, ret);
+ server_index, index, &alist->addrs[index].transport, rtt_us, ret);
have_result |= afs_fs_probe_done(server);
if (have_result) {
@@ -145,6 +137,7 @@ static int afs_do_probe_fileserver(struct afs_net *net,
read_lock(&server->fs_lock);
ac.alist = rcu_dereference_protected(server->addresses,
lockdep_is_held(&server->fs_lock));
+ afs_get_addrlist(ac.alist);
read_unlock(&server->fs_lock);
atomic_set(&server->probe_outstanding, ac.alist->nr_addrs);
@@ -163,6 +156,7 @@ static int afs_do_probe_fileserver(struct afs_net *net,
if (!in_progress)
afs_fs_probe_done(server);
+ afs_put_addrlist(ac.alist);
return in_progress;
}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index a1ef0266422a..1a6f22700811 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -58,9 +58,9 @@ static void xdr_dump_bad(const __be32 *bp)
/*
* decode an AFSFetchStatus block
*/
-static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
- struct afs_call *call,
- struct afs_status_cb *scb)
+static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
+ struct afs_call *call,
+ struct afs_status_cb *scb)
{
const struct afs_xdr_AFSFetchStatus *xdr = (const void *)*_bp;
struct afs_file_status *status = &scb->status;
@@ -80,7 +80,7 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
*/
status->abort_code = abort_code;
scb->have_error = true;
- return 0;
+ goto advance;
}
pr_warn("Unknown AFSFetchStatus version %u\n", ntohl(xdr->if_version));
@@ -89,7 +89,8 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
if (abort_code != 0 && inline_error) {
status->abort_code = abort_code;
- return 0;
+ scb->have_error = true;
+ goto advance;
}
type = ntohl(xdr->type);
@@ -125,13 +126,14 @@ static int xdr_decode_AFSFetchStatus(const __be32 **_bp,
data_version |= (u64)ntohl(xdr->data_version_hi) << 32;
status->data_version = data_version;
scb->have_status = true;
-
+advance:
*_bp = (const void *)*_bp + sizeof(*xdr);
- return 0;
+ return;
bad:
xdr_dump_bad(*_bp);
- return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+ afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+ goto advance;
}
static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
@@ -251,9 +253,7 @@ static int afs_deliver_fs_fetch_status_vnode(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
xdr_decode_AFSCallBack(&bp, call, call->out_scb);
xdr_decode_AFSVolSync(&bp, call->out_volsync);
@@ -380,8 +380,6 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
ASSERTCMP(req->offset, <=, PAGE_SIZE);
if (req->offset == PAGE_SIZE) {
req->offset = 0;
- if (req->page_done)
- req->page_done(req);
req->index++;
if (req->remain > 0)
goto begin_page;
@@ -415,9 +413,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
return ret;
bp = call->buffer;
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
xdr_decode_AFSCallBack(&bp, call, call->out_scb);
xdr_decode_AFSVolSync(&bp, call->out_volsync);
@@ -434,11 +430,13 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
if (req->offset < PAGE_SIZE)
zero_user_segment(req->pages[req->index],
req->offset, PAGE_SIZE);
- if (req->page_done)
- req->page_done(req);
req->offset = 0;
}
+ if (req->page_done)
+ for (req->index = 0; req->index < req->nr_pages; req->index++)
+ req->page_done(req);
+
_leave(" = 0 [done]");
return 0;
}
@@ -573,12 +571,8 @@ static int afs_deliver_fs_create_vnode(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
xdr_decode_AFSFid(&bp, call->out_fid);
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
xdr_decode_AFSCallBack(&bp, call, call->out_scb);
xdr_decode_AFSVolSync(&bp, call->out_volsync);
@@ -687,9 +681,7 @@ static int afs_deliver_fs_dir_status_and_vol(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
xdr_decode_AFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
@@ -780,12 +772,8 @@ static int afs_deliver_fs_link(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
xdr_decode_AFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
@@ -874,12 +862,8 @@ static int afs_deliver_fs_symlink(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
xdr_decode_AFSFid(&bp, call->out_fid);
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
xdr_decode_AFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
@@ -982,16 +966,12 @@ static int afs_deliver_fs_rename(struct afs_call *call)
if (ret < 0)
return ret;
- /* unmarshall the reply once we've received all of it */
bp = call->buffer;
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
- if (ret < 0)
- return ret;
- if (call->out_dir_scb != call->out_scb) {
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
- }
+ /* If the two dirs are the same, we have two copies of the same status
+ * report, so we just decode it twice.
+ */
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_dir_scb);
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
xdr_decode_AFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
@@ -1099,9 +1079,7 @@ static int afs_deliver_fs_store_data(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
xdr_decode_AFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
@@ -1279,9 +1257,7 @@ static int afs_deliver_fs_store_status(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
xdr_decode_AFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
@@ -1941,9 +1917,7 @@ static int afs_deliver_fs_fetch_status(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
xdr_decode_AFSCallBack(&bp, call, call->out_scb);
xdr_decode_AFSVolSync(&bp, call->out_volsync);
@@ -2049,10 +2023,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
bp = call->buffer;
scb = &call->out_scb[call->count];
- ret = xdr_decode_AFSFetchStatus(&bp, call, scb);
- if (ret < 0)
- return ret;
-
+ xdr_decode_AFSFetchStatus(&bp, call, scb);
call->count++;
if (call->count < call->count2)
goto more_counts;
@@ -2227,9 +2198,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
return ret;
bp = call->buffer;
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
xdr_decode_AFSVolSync(&bp, call->out_volsync);
call->unmarshall++;
@@ -2310,9 +2279,7 @@ static int afs_deliver_fs_file_status_and_vol(struct afs_call *call)
return ret;
bp = call->buffer;
- ret = xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_AFSFetchStatus(&bp, call, call->out_scb);
xdr_decode_AFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 18a50d4febcf..ae7a69112c43 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -171,6 +171,7 @@ static void afs_apply_status(struct afs_fs_cursor *fc,
struct timespec64 t;
umode_t mode;
bool data_changed = false;
+ bool change_size = false;
BUG_ON(test_bit(AFS_VNODE_UNSET, &vnode->flags));
@@ -226,6 +227,7 @@ static void afs_apply_status(struct afs_fs_cursor *fc,
} else {
set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
}
+ change_size = true;
} else if (vnode->status.type == AFS_FTYPE_DIR) {
/* Expected directory change is handled elsewhere so
* that we can locally edit the directory and save on a
@@ -233,11 +235,19 @@ static void afs_apply_status(struct afs_fs_cursor *fc,
*/
if (test_bit(AFS_VNODE_DIR_VALID, &vnode->flags))
data_changed = false;
+ change_size = true;
}
if (data_changed) {
inode_set_iversion_raw(&vnode->vfs_inode, status->data_version);
- afs_set_i_size(vnode, status->size);
+
+ /* Only update the size if the data version jumped. If the
+ * file is being modified locally, then we might have our own
+ * idea of what the size should be that's not the same as
+ * what's on the server.
+ */
+ if (change_size)
+ afs_set_i_size(vnode, status->size);
}
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 7ee63526c6a2..e16b413d133e 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -154,13 +154,14 @@ struct afs_call {
};
unsigned char unmarshall; /* unmarshalling phase */
unsigned char addr_ix; /* Address in ->alist */
- bool incoming; /* T if incoming call */
+ bool drop_ref; /* T if need to drop ref for incoming call */
bool send_pages; /* T if data from mapping should be sent */
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool upgrade; /* T to request service upgrade */
bool have_reply_time; /* T if have got reply_time */
bool intr; /* T if interruptible */
+ bool unmarshalling_error; /* T if an unmarshalling error occurred */
u16 service_id; /* Actual service ID (after upgrade) */
unsigned int debug_id; /* Trace ID */
u32 operation_ID; /* operation ID for an incoming call */
@@ -396,7 +397,7 @@ struct afs_cell {
struct afs_vlserver_list __rcu *vl_servers;
u8 name_len; /* Length of name */
- char name[64 + 1]; /* Cell name, case-flattened and NUL-padded */
+ char *name; /* Cell name, case-flattened and NUL-padded */
};
/*
@@ -1218,8 +1219,16 @@ static inline void afs_set_call_complete(struct afs_call *call,
ok = true;
}
spin_unlock_bh(&call->state_lock);
- if (ok)
+ if (ok) {
trace_afs_call_done(call);
+
+ /* Asynchronous calls have two refs to release - one from the alloc and
+ * one queued with the work item - and we can't just deallocate the
+ * call because the work item may be queued again.
+ */
+ if (call->drop_ref)
+ afs_put_call(call);
+ }
}
/*
@@ -1340,7 +1349,7 @@ extern struct afs_volume *afs_create_volume(struct afs_fs_context *);
extern void afs_activate_volume(struct afs_volume *);
extern void afs_deactivate_volume(struct afs_volume *);
extern void afs_put_volume(struct afs_cell *, struct afs_volume *);
-extern int afs_check_volume_status(struct afs_volume *, struct key *);
+extern int afs_check_volume_status(struct afs_volume *, struct afs_fs_cursor *);
/*
* write.c
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index f532d6d3bd28..79bc5f1338ed 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -126,7 +126,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
if (src_as->cell)
ctx->cell = afs_get_cell(src_as->cell);
- if (size > PAGE_SIZE - 1)
+ if (size < 2 || size > PAGE_SIZE - 1)
return -EINVAL;
page = read_mapping_page(d_inode(mntpt)->i_mapping, 0, NULL);
@@ -140,7 +140,9 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt)
}
buf = kmap(page);
- ret = vfs_parse_fs_string(fc, "source", buf, size);
+ ret = -EINVAL;
+ if (buf[size - 1] == '.')
+ ret = vfs_parse_fs_string(fc, "source", buf, size - 1);
kunmap(page);
put_page(page);
if (ret < 0)
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index fba2ec3a3a9c..106b27011f6d 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -562,6 +562,7 @@ void afs_put_sysnames(struct afs_sysnames *sysnames)
if (sysnames->subs[i] != afs_init_sysname &&
sysnames->subs[i] != sysnames->blank)
kfree(sysnames->subs[i]);
+ kfree(sysnames);
}
}
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index 172ba569cd60..2a3305e42b14 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -192,7 +192,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
write_unlock(&vnode->volume->servers_lock);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
- error = afs_check_volume_status(vnode->volume, fc->key);
+ error = afs_check_volume_status(vnode->volume, fc);
if (error < 0)
goto failed_set_error;
@@ -281,7 +281,7 @@ bool afs_select_fileserver(struct afs_fs_cursor *fc)
set_bit(AFS_VOLUME_WAIT, &vnode->volume->flags);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &vnode->volume->flags);
- error = afs_check_volume_status(vnode->volume, fc->key);
+ error = afs_check_volume_status(vnode->volume, fc);
if (error < 0)
goto failed_set_error;
@@ -341,7 +341,7 @@ start:
/* See if we need to do an update of the volume record. Note that the
* volume may have moved or even have been deleted.
*/
- error = afs_check_volume_status(vnode->volume, fc->key);
+ error = afs_check_volume_status(vnode->volume, fc);
if (error < 0)
goto failed_set_error;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index d1dde2834b6d..b4a589a17c99 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -18,7 +18,6 @@ struct workqueue_struct *afs_async_calls;
static void afs_wake_up_call_waiter(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned long);
-static void afs_delete_async_call(struct work_struct *);
static void afs_process_async_call(struct work_struct *);
static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long);
@@ -169,7 +168,7 @@ void afs_put_call(struct afs_call *call)
int n = atomic_dec_return(&call->usage);
int o = atomic_read(&net->nr_outstanding_calls);
- trace_afs_call(call, afs_call_trace_put, n + 1, o,
+ trace_afs_call(call, afs_call_trace_put, n, o,
__builtin_return_address(0));
ASSERTCMP(n, >=, 0);
@@ -402,8 +401,10 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
/* If the call is going to be asynchronous, we need an extra ref for
* the call to hold itself so the caller need not hang on to its ref.
*/
- if (call->async)
+ if (call->async) {
afs_get_call(call, afs_call_trace_get);
+ call->drop_ref = true;
+ }
/* create a call */
rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
@@ -413,7 +414,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
afs_wake_up_async_call :
afs_wake_up_call_waiter),
call->upgrade,
- call->intr,
+ (call->intr ? RXRPC_PREINTERRUPTIBLE :
+ RXRPC_UNINTERRUPTIBLE),
call->debug_id);
if (IS_ERR(rxcall)) {
ret = PTR_ERR(rxcall);
@@ -538,6 +540,8 @@ static void afs_deliver_to_call(struct afs_call *call)
ret = call->type->deliver(call);
state = READ_ONCE(call->state);
+ if (ret == 0 && call->unmarshalling_error)
+ ret = -EBADMSG;
switch (ret) {
case 0:
afs_queue_call_work(call);
@@ -584,8 +588,6 @@ static void afs_deliver_to_call(struct afs_call *call)
done:
if (call->type->done)
call->type->done(call);
- if (state == AFS_CALL_COMPLETE && call->incoming)
- afs_put_call(call);
out:
_leave("");
return;
@@ -604,11 +606,7 @@ call_complete:
long afs_wait_for_call_to_complete(struct afs_call *call,
struct afs_addr_cursor *ac)
{
- signed long rtt2, timeout;
long ret;
- bool stalled = false;
- u64 rtt;
- u32 life, last_life;
bool rxrpc_complete = false;
DECLARE_WAITQUEUE(myself, current);
@@ -619,14 +617,6 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
if (ret < 0)
goto out;
- rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
- rtt2 = nsecs_to_jiffies64(rtt) * 2;
- if (rtt2 < 2)
- rtt2 = 2;
-
- timeout = rtt2;
- rxrpc_kernel_check_life(call->net->socket, call->rxcall, &last_life);
-
add_wait_queue(&call->waitq, &myself);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -643,30 +633,13 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
if (afs_check_call_state(call, AFS_CALL_COMPLETE))
break;
- if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall, &life)) {
+ if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) {
/* rxrpc terminated the call. */
rxrpc_complete = true;
break;
}
- if (call->intr && timeout == 0 &&
- life == last_life && signal_pending(current)) {
- if (stalled)
- break;
- __set_current_state(TASK_RUNNING);
- rxrpc_kernel_probe_life(call->net->socket, call->rxcall);
- timeout = rtt2;
- stalled = true;
- continue;
- }
-
- if (life != last_life) {
- timeout = rtt2;
- last_life = life;
- stalled = false;
- }
-
- timeout = schedule_timeout(timeout);
+ schedule();
}
remove_wait_queue(&call->waitq, &myself);
@@ -734,7 +707,7 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
u = atomic_fetch_add_unless(&call->usage, 1, 0);
if (u != 0) {
- trace_afs_call(call, afs_call_trace_wake, u,
+ trace_afs_call(call, afs_call_trace_wake, u + 1,
atomic_read(&call->net->nr_outstanding_calls),
__builtin_return_address(0));
@@ -744,21 +717,6 @@ static void afs_wake_up_async_call(struct sock *sk, struct rxrpc_call *rxcall,
}
/*
- * Delete an asynchronous call. The work item carries a ref to the call struct
- * that we need to release.
- */
-static void afs_delete_async_call(struct work_struct *work)
-{
- struct afs_call *call = container_of(work, struct afs_call, async_work);
-
- _enter("");
-
- afs_put_call(call);
-
- _leave("");
-}
-
-/*
* Perform I/O processing on an asynchronous call. The work item carries a ref
* to the call struct that we either need to release or to pass on.
*/
@@ -773,16 +731,6 @@ static void afs_process_async_call(struct work_struct *work)
afs_deliver_to_call(call);
}
- if (call->state == AFS_CALL_COMPLETE) {
- /* We have two refs to release - one from the alloc and one
- * queued with the work item - and we can't just deallocate the
- * call because the work item may be queued again.
- */
- call->async_work.func = afs_delete_async_call;
- if (!queue_work(afs_async_calls, &call->async_work))
- afs_put_call(call);
- }
-
afs_put_call(call);
_leave("");
}
@@ -809,6 +757,7 @@ void afs_charge_preallocation(struct work_struct *work)
if (!call)
break;
+ call->drop_ref = true;
call->async = true;
call->state = AFS_CALL_SV_AWAIT_OP_ID;
init_waitqueue_head(&call->waitq);
@@ -1016,5 +965,7 @@ noinline int afs_protocol_error(struct afs_call *call, int error,
enum afs_eproto_cause cause)
{
trace_afs_protocol_error(call, error, cause);
+ if (call)
+ call->unmarshalling_error = true;
return error;
}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index e900cd74361b..1caea74b8c11 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -31,18 +31,11 @@ static void afs_dec_servers_outstanding(struct afs_net *net)
struct afs_server *afs_find_server(struct afs_net *net,
const struct sockaddr_rxrpc *srx)
{
- const struct sockaddr_in6 *a = &srx->transport.sin6, *b;
const struct afs_addr_list *alist;
struct afs_server *server = NULL;
unsigned int i;
- bool ipv6 = true;
int seq = 0, diff;
- if (srx->transport.sin6.sin6_addr.s6_addr32[0] == 0 ||
- srx->transport.sin6.sin6_addr.s6_addr32[1] == 0 ||
- srx->transport.sin6.sin6_addr.s6_addr32[2] == htonl(0xffff))
- ipv6 = false;
-
rcu_read_lock();
do {
@@ -51,7 +44,8 @@ struct afs_server *afs_find_server(struct afs_net *net,
server = NULL;
read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
- if (ipv6) {
+ if (srx->transport.family == AF_INET6) {
+ const struct sockaddr_in6 *a = &srx->transport.sin6, *b;
hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
alist = rcu_dereference(server->addresses);
for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
@@ -67,15 +61,16 @@ struct afs_server *afs_find_server(struct afs_net *net,
}
}
} else {
+ const struct sockaddr_in *a = &srx->transport.sin, *b;
hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) {
alist = rcu_dereference(server->addresses);
for (i = 0; i < alist->nr_ipv4; i++) {
- b = &alist->addrs[i].transport.sin6;
- diff = ((u16 __force)a->sin6_port -
- (u16 __force)b->sin6_port);
+ b = &alist->addrs[i].transport.sin;
+ diff = ((u16 __force)a->sin_port -
+ (u16 __force)b->sin_port);
if (diff == 0)
- diff = ((u32 __force)a->sin6_addr.s6_addr32[3] -
- (u32 __force)b->sin6_addr.s6_addr32[3]);
+ diff = ((u32 __force)a->sin_addr.s_addr -
+ (u32 __force)b->sin_addr.s_addr);
if (diff == 0)
goto found;
}
@@ -575,12 +570,9 @@ retry:
}
ret = wait_on_bit(&server->flags, AFS_SERVER_FL_UPDATING,
- TASK_INTERRUPTIBLE);
+ (fc->flags & AFS_FS_CURSOR_INTR) ?
+ TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
if (ret == -ERESTARTSYS) {
- if (!(fc->flags & AFS_FS_CURSOR_INTR) && server->addresses) {
- _leave(" = t [intr]");
- return true;
- }
fc->error = ret;
_leave(" = f [intr]");
return false;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index f18911e8d770..ce61340b5de6 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -404,6 +404,7 @@ static int afs_test_super(struct super_block *sb, struct fs_context *fc)
return (as->net_ns == fc->net_ns &&
as->volume &&
as->volume->vid == ctx->volume->vid &&
+ as->cell == ctx->cell &&
!as->dyn_root);
}
@@ -447,7 +448,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx)
/* allocate the root inode and dentry */
if (as->dyn_root) {
inode = afs_iget_pseudo_dir(sb, true);
- sb->s_flags |= SB_RDONLY;
} else {
sprintf(sb->s_id, "%llu", as->volume->vid);
afs_activate_volume(as->volume);
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
index 858498cc1b05..081b7e5b13f5 100644
--- a/fs/afs/vl_probe.c
+++ b/fs/afs/vl_probe.c
@@ -31,10 +31,9 @@ void afs_vlserver_probe_result(struct afs_call *call)
struct afs_addr_list *alist = call->alist;
struct afs_vlserver *server = call->vlserver;
unsigned int server_index = call->server_index;
+ unsigned int rtt_us = 0;
unsigned int index = call->addr_ix;
- unsigned int rtt = UINT_MAX;
bool have_result = false;
- u64 _rtt;
int ret = call->error;
_enter("%s,%u,%u,%d,%d", server->name, server_index, index, ret, call->abort_code);
@@ -93,15 +92,9 @@ responded:
}
}
- /* Get the RTT and scale it to fit into a 32-bit value that represents
- * over a minute of time so that we can access it with one instruction
- * on a 32-bit system.
- */
- _rtt = rxrpc_kernel_get_rtt(call->net->socket, call->rxcall);
- _rtt /= 64;
- rtt = (_rtt > UINT_MAX) ? UINT_MAX : _rtt;
- if (rtt < server->probe.rtt) {
- server->probe.rtt = rtt;
+ if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
+ rtt_us < server->probe.rtt) {
+ server->probe.rtt = rtt_us;
alist->preferred = index;
have_result = true;
}
@@ -113,8 +106,7 @@ out:
spin_unlock(&server->probe_lock);
_debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
- server_index, index, &alist->addrs[index].transport,
- (unsigned int)rtt, ret);
+ server_index, index, &alist->addrs[index].transport, rtt_us, ret);
have_result |= afs_vl_probe_done(server);
if (have_result) {
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 1a414300b654..9e03e593b548 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -283,7 +283,7 @@ error:
/*
* Make sure the volume record is up to date.
*/
-int afs_check_volume_status(struct afs_volume *volume, struct key *key)
+int afs_check_volume_status(struct afs_volume *volume, struct afs_fs_cursor *fc)
{
time64_t now = ktime_get_real_seconds();
int ret, retries = 0;
@@ -301,7 +301,7 @@ retry:
}
if (!test_and_set_bit_lock(AFS_VOLUME_UPDATING, &volume->flags)) {
- ret = afs_update_volume_status(volume, key);
+ ret = afs_update_volume_status(volume, fc->key);
clear_bit_unlock(AFS_VOLUME_WAIT, &volume->flags);
clear_bit_unlock(AFS_VOLUME_UPDATING, &volume->flags);
wake_up_bit(&volume->flags, AFS_VOLUME_WAIT);
@@ -314,7 +314,9 @@ retry:
return 0;
}
- ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT, TASK_INTERRUPTIBLE);
+ ret = wait_on_bit(&volume->flags, AFS_VOLUME_WAIT,
+ (fc->flags & AFS_FS_CURSOR_INTR) ?
+ TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
if (ret == -ERESTARTSYS) {
_leave(" = %d", ret);
return ret;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 98eb7adbce91..34b50b2eeb97 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -195,11 +195,11 @@ int afs_write_end(struct file *file, struct address_space *mapping,
i_size = i_size_read(&vnode->vfs_inode);
if (maybe_i_size > i_size) {
- spin_lock(&vnode->wb_lock);
+ write_seqlock(&vnode->cb_lock);
i_size = i_size_read(&vnode->vfs_inode);
if (maybe_i_size > i_size)
i_size_write(&vnode->vfs_inode, maybe_i_size);
- spin_unlock(&vnode->wb_lock);
+ write_sequnlock(&vnode->cb_lock);
}
if (!PageUptodate(page)) {
@@ -812,6 +812,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
vmf->page->index, priv);
SetPagePrivate(vmf->page);
set_page_private(vmf->page, priv);
+ file_update_time(file);
sb_end_pagefault(inode->i_sb);
return VM_FAULT_LOCKED;
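
afs_write_end() now updates i_size under the vnode's cb_lock seqlock instead of the old wb_lock spinlock, so readers can see a consistent size without taking a lock. The reader side of the same pattern, as a hedged sketch (cb_lock and the i_size accessors are real; this helper is not in the patch):

static loff_t afs_read_size_sketch(struct afs_vnode *vnode)
{
	unsigned int seq;
	loff_t size;

	do {
		/* retry until no writer raced with the read */
		seq = read_seqbegin(&vnode->cb_lock);
		size = i_size_read(&vnode->vfs_inode);
	} while (read_seqretry(&vnode->cb_lock, seq));

	return size;
}
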
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index a1baf3f1f14d..1f94278f3bca 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -165,23 +165,23 @@ static void xdr_dump_bad(const __be32 *bp)
int i;
pr_notice("YFS XDR: Bad status record\n");
- for (i = 0; i < 5 * 4 * 4; i += 16) {
+ for (i = 0; i < 6 * 4 * 4; i += 16) {
memcpy(x, bp, 16);
bp += 4;
pr_notice("%03x: %08x %08x %08x %08x\n",
i, ntohl(x[0]), ntohl(x[1]), ntohl(x[2]), ntohl(x[3]));
}
- memcpy(x, bp, 4);
- pr_notice("0x50: %08x\n", ntohl(x[0]));
+ memcpy(x, bp, 8);
+ pr_notice("0x60: %08x %08x\n", ntohl(x[0]), ntohl(x[1]));
}
/*
* Decode a YFSFetchStatus block
*/
-static int xdr_decode_YFSFetchStatus(const __be32 **_bp,
- struct afs_call *call,
- struct afs_status_cb *scb)
+static void xdr_decode_YFSFetchStatus(const __be32 **_bp,
+ struct afs_call *call,
+ struct afs_status_cb *scb)
{
const struct yfs_xdr_YFSFetchStatus *xdr = (const void *)*_bp;
struct afs_file_status *status = &scb->status;
@@ -192,7 +192,7 @@ static int xdr_decode_YFSFetchStatus(const __be32 **_bp,
if (status->abort_code == VNOVNODE)
status->nlink = 0;
scb->have_error = true;
- return 0;
+ goto advance;
}
type = ntohl(xdr->type);
@@ -220,13 +220,14 @@ static int xdr_decode_YFSFetchStatus(const __be32 **_bp,
status->size = xdr_to_u64(xdr->size);
status->data_version = xdr_to_u64(xdr->data_version);
scb->have_status = true;
-
+advance:
*_bp += xdr_size(xdr);
- return 0;
+ return;
bad:
xdr_dump_bad(*_bp);
- return afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+ afs_protocol_error(call, -EBADMSG, afs_eproto_bad_status);
+ goto advance;
}
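
With the change above, xdr_decode_YFSFetchStatus() records protocol failure in the status block and always advances the XDR cursor, so the fields decoded after it still line up with the wire format. The general shape of an "always advance" decoder, with entirely hypothetical names:

struct my_status { u32 type; bool bad; };

static void my_decode_status(const __be32 **_bp, struct my_status *out)
{
	const __be32 *bp = *_bp;

	out->type = ntohl(*bp++);
	if (out->type > 3)		/* whatever "valid" means here */
		out->bad = true;	/* record the error out of band... */
	*_bp = bp;			/* ...but advance unconditionally */
}
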
/*
@@ -344,9 +345,7 @@ static int yfs_deliver_fs_status_cb_and_volsync(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
xdr_decode_YFSCallBack(&bp, call, call->out_scb);
xdr_decode_YFSVolSync(&bp, call->out_volsync);
@@ -368,9 +367,7 @@ static int yfs_deliver_status_and_volsync(struct afs_call *call)
return ret;
bp = call->buffer;
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
xdr_decode_YFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
@@ -491,8 +488,6 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
ASSERTCMP(req->offset, <=, PAGE_SIZE);
if (req->offset == PAGE_SIZE) {
req->offset = 0;
- if (req->page_done)
- req->page_done(req);
req->index++;
if (req->remain > 0)
goto begin_page;
@@ -529,9 +524,7 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
return ret;
bp = call->buffer;
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
xdr_decode_YFSCallBack(&bp, call, call->out_scb);
xdr_decode_YFSVolSync(&bp, call->out_volsync);
@@ -549,11 +542,13 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
if (req->offset < PAGE_SIZE)
zero_user_segment(req->pages[req->index],
req->offset, PAGE_SIZE);
- if (req->page_done)
- req->page_done(req);
req->offset = 0;
}
+ if (req->page_done)
+ for (req->index = 0; req->index < req->nr_pages; req->index++)
+ req->page_done(req);
+
_leave(" = 0 [done]");
return 0;
}
@@ -638,12 +633,8 @@ static int yfs_deliver_fs_create_vnode(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
xdr_decode_YFSFid(&bp, call->out_fid);
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
xdr_decode_YFSCallBack(&bp, call, call->out_scb);
xdr_decode_YFSVolSync(&bp, call->out_volsync);
@@ -796,14 +787,9 @@ static int yfs_deliver_fs_remove_file2(struct afs_call *call)
return ret;
bp = call->buffer;
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
- if (ret < 0)
- return ret;
-
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
xdr_decode_YFSFid(&bp, &fid);
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
/* Was deleted if vnode->status.abort_code == VNOVNODE. */
xdr_decode_YFSVolSync(&bp, call->out_volsync);
@@ -883,10 +869,7 @@ static int yfs_deliver_fs_remove(struct afs_call *call)
return ret;
bp = call->buffer;
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
- if (ret < 0)
- return ret;
-
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
xdr_decode_YFSVolSync(&bp, call->out_volsync);
return 0;
}
@@ -968,12 +951,8 @@ static int yfs_deliver_fs_link(struct afs_call *call)
return ret;
bp = call->buffer;
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
xdr_decode_YFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
return 0;
@@ -1055,12 +1034,8 @@ static int yfs_deliver_fs_symlink(struct afs_call *call)
/* unmarshall the reply once we've received all of it */
bp = call->buffer;
xdr_decode_YFSFid(&bp, call->out_fid);
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
xdr_decode_YFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
@@ -1148,15 +1123,11 @@ static int yfs_deliver_fs_rename(struct afs_call *call)
return ret;
bp = call->buffer;
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
- if (ret < 0)
- return ret;
- if (call->out_dir_scb != call->out_scb) {
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
- }
-
+ /* If the two dirs are the same, we have two copies of the same status
+ * report, so we just decode it twice.
+ */
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_dir_scb);
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
xdr_decode_YFSVolSync(&bp, call->out_volsync);
_leave(" = 0 [done]");
return 0;
@@ -1834,9 +1805,7 @@ static int yfs_deliver_fs_inline_bulk_status(struct afs_call *call)
bp = call->buffer;
scb = &call->out_scb[call->count];
- ret = xdr_decode_YFSFetchStatus(&bp, call, scb);
- if (ret < 0)
- return ret;
+ xdr_decode_YFSFetchStatus(&bp, call, scb);
call->count++;
if (call->count < call->count2)
@@ -2051,9 +2020,7 @@ static int yfs_deliver_fs_fetch_opaque_acl(struct afs_call *call)
bp = call->buffer;
yacl->inherit_flag = ntohl(*bp++);
yacl->num_cleaned = ntohl(*bp++);
- ret = xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
- if (ret < 0)
- return ret;
+ xdr_decode_YFSFetchStatus(&bp, call, call->out_scb);
xdr_decode_YFSVolSync(&bp, call->out_volsync);
call->unmarshall++;
diff --git a/fs/aio.c b/fs/aio.c
index c1e581dd32f5..9bc3ffabd775 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -175,6 +175,7 @@ struct fsync_iocb {
struct file *file;
struct work_struct work;
bool datasync;
+ struct cred *creds;
};
struct poll_iocb {
@@ -1590,8 +1591,11 @@ static int aio_write(struct kiocb *req, const struct iocb *iocb,
static void aio_fsync_work(struct work_struct *work)
{
struct aio_kiocb *iocb = container_of(work, struct aio_kiocb, fsync.work);
+ const struct cred *old_cred = override_creds(iocb->fsync.creds);
iocb->ki_res.res = vfs_fsync(iocb->fsync.file, iocb->fsync.datasync);
+ revert_creds(old_cred);
+ put_cred(iocb->fsync.creds);
iocb_put(iocb);
}
@@ -1605,12 +1609,24 @@ static int aio_fsync(struct fsync_iocb *req, const struct iocb *iocb,
if (unlikely(!req->file->f_op->fsync))
return -EINVAL;
+ req->creds = prepare_creds();
+ if (!req->creds)
+ return -ENOMEM;
+
req->datasync = datasync;
INIT_WORK(&req->work, aio_fsync_work);
schedule_work(&req->work);
return 0;
}
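
aio_fsync() now snapshots the submitter's credentials at submission time, and the work item assumes them for the duration of vfs_fsync(). A condensed sketch of the prepare/override/revert lifecycle used above:

/* submission context */
req->creds = prepare_creds();		/* ref owned by the request */
if (!req->creds)
	return -ENOMEM;

/* worker context */
{
	const struct cred *old_cred = override_creds(req->creds);

	vfs_fsync(req->file, req->datasync);	/* runs as the submitter */
	revert_creds(old_cred);			/* back to worker creds */
	put_cred(req->creds);			/* drop the submission ref */
}
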
+static void aio_poll_put_work(struct work_struct *work)
+{
+ struct poll_iocb *req = container_of(work, struct poll_iocb, work);
+ struct aio_kiocb *iocb = container_of(req, struct aio_kiocb, poll);
+
+ iocb_put(iocb);
+}
+
static void aio_poll_complete_work(struct work_struct *work)
{
struct poll_iocb *req = container_of(work, struct poll_iocb, work);
@@ -1675,6 +1691,8 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
list_del_init(&req->wait.entry);
if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
+ struct kioctx *ctx = iocb->ki_ctx;
+
/*
* Try to complete the iocb inline if we can. Use
* irqsave/irqrestore because not all filesystems (e.g. fuse)
@@ -1684,8 +1702,14 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
list_del(&iocb->ki_list);
iocb->ki_res.res = mangle_poll(mask);
req->done = true;
- spin_unlock_irqrestore(&iocb->ki_ctx->ctx_lock, flags);
- iocb_put(iocb);
+ if (iocb->ki_eventfd && eventfd_signal_count()) {
+ iocb = NULL;
+ INIT_WORK(&req->work, aio_poll_put_work);
+ schedule_work(&req->work);
+ }
+ spin_unlock_irqrestore(&ctx->ctx_lock, flags);
+ if (iocb)
+ iocb_put(iocb);
} else {
schedule_work(&req->work);
}
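
The inline-completion path above avoids dropping the final iocb reference while already inside an eventfd signal (eventfd_signal_count() nonzero), since the put could recurse into the eventfd and deadlock; instead the put is bounced to a workqueue. Condensed control flow of that branch:

if (iocb->ki_eventfd && eventfd_signal_count()) {
	/* unsafe to complete here: defer the final put */
	INIT_WORK(&req->work, aio_poll_put_work);	/* just does iocb_put() */
	schedule_work(&req->work);
	iocb = NULL;					/* skip the put below */
}
spin_unlock_irqrestore(&ctx->ctx_lock, flags);
if (iocb)
	iocb_put(iocb);
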
@@ -2093,7 +2117,6 @@ SYSCALL_DEFINE6(io_pgetevents,
const struct __aio_sigset __user *, usig)
{
struct __aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 ts;
bool interrupted;
int ret;
@@ -2104,14 +2127,14 @@ SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
interrupted = signal_pending(current);
- restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ restore_saved_sigmask_unless(interrupted);
if (interrupted && !ret)
ret = -ERESTARTNOHAND;
@@ -2129,7 +2152,6 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
const struct __aio_sigset __user *, usig)
{
struct __aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
struct timespec64 ts;
bool interrupted;
int ret;
@@ -2141,14 +2163,14 @@ SYSCALL_DEFINE6(io_pgetevents_time32,
return -EFAULT;
- ret = set_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_user_sigmask(ksig.sigmask, ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &ts : NULL);
interrupted = signal_pending(current);
- restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ restore_saved_sigmask_unless(interrupted);
if (interrupted && !ret)
ret = -ERESTARTNOHAND;
@@ -2182,7 +2204,7 @@ SYSCALL_DEFINE5(io_getevents_time32, __u32, ctx_id,
#ifdef CONFIG_COMPAT
struct __compat_aio_sigset {
- compat_sigset_t __user *sigmask;
+ compat_uptr_t sigmask;
compat_size_t sigsetsize;
};
@@ -2196,8 +2218,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
struct old_timespec32 __user *, timeout,
const struct __compat_aio_sigset __user *, usig)
{
- struct __compat_aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
+ struct __compat_aio_sigset ksig = { 0, };
struct timespec64 t;
bool interrupted;
int ret;
@@ -2208,14 +2229,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
interrupted = signal_pending(current);
- restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ restore_saved_sigmask_unless(interrupted);
if (interrupted && !ret)
ret = -ERESTARTNOHAND;
@@ -2232,8 +2253,7 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
struct __kernel_timespec __user *, timeout,
const struct __compat_aio_sigset __user *, usig)
{
- struct __compat_aio_sigset ksig = { NULL, };
- sigset_t ksigmask, sigsaved;
+ struct __compat_aio_sigset ksig = { 0, };
struct timespec64 t;
bool interrupted;
int ret;
@@ -2244,14 +2264,14 @@ COMPAT_SYSCALL_DEFINE6(io_pgetevents_time64,
if (usig && copy_from_user(&ksig, usig, sizeof(ksig)))
return -EFAULT;
- ret = set_compat_user_sigmask(ksig.sigmask, &ksigmask, &sigsaved, ksig.sigsetsize);
+ ret = set_compat_user_sigmask(compat_ptr(ksig.sigmask), ksig.sigsetsize);
if (ret)
return ret;
ret = do_io_getevents(ctx_id, min_nr, nr, events, timeout ? &t : NULL);
interrupted = signal_pending(current);
- restore_user_sigmask(ksig.sigmask, &sigsaved, interrupted);
+ restore_saved_sigmask_unless(interrupted);
if (interrupted && !ret)
ret = -ERESTARTNOHAND;
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index cdff0567aacb..2d01553a6d58 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -498,9 +498,10 @@ static struct dentry *autofs_expire_indirect(struct super_block *sb,
*/
how &= ~AUTOFS_EXP_LEAVES;
found = should_expire(expired, mnt, timeout, how);
- if (!found || found != expired)
- /* Something has changed, continue */
+ if (found != expired) { // something has changed, continue
+ dput(found);
goto next;
+ }
if (expired != dentry)
dput(dentry);
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 1f95ed1ad2ee..1ec2fa94d236 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1752,7 +1752,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
(!regset->active || regset->active(t->task, regset) > 0)) {
int ret;
size_t size = regset_size(t->task, regset);
- void *data = kmalloc(size, GFP_KERNEL);
+ void *data = kzalloc(size, GFP_KERNEL);
if (unlikely(!data))
return 0;
ret = regset->get(t->task, regset,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index a9e7c1b69437..ff09b7be4878 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1548,10 +1548,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
*/
if (!for_part) {
ret = devcgroup_inode_permission(bdev->bd_inode, perm);
- if (ret != 0) {
- bdput(bdev);
+ if (ret != 0)
return ret;
- }
}
restart:
@@ -1623,8 +1621,10 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
goto out_clear;
BUG_ON(for_part);
ret = __blkdev_get(whole, mode, 1);
- if (ret)
+ if (ret) {
+ bdput(whole);
goto out_clear;
+ }
bdev->bd_contains = whole;
bdev->bd_part = disk_get_part(disk, partno);
if (!(disk->flags & GENHD_FL_UP) ||
@@ -1677,7 +1677,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
disk_unblock_events(disk);
put_disk_and_module(disk);
out:
- bdput(bdev);
return ret;
}
@@ -1744,6 +1743,9 @@ int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder)
bdput(whole);
}
+ if (res)
+ bdput(bdev);
+
return res;
}
EXPORT_SYMBOL(blkdev_get);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 122cb97c7909..29e28cee0e3e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -51,16 +51,6 @@ struct btrfs_workqueue {
struct __btrfs_workqueue *high;
};
-static void normal_work_helper(struct btrfs_work *work);
-
-#define BTRFS_WORK_HELPER(name) \
-noinline_for_stack void btrfs_##name(struct work_struct *arg) \
-{ \
- struct btrfs_work *work = container_of(arg, struct btrfs_work, \
- normal_work); \
- normal_work_helper(work); \
-}
-
struct btrfs_fs_info *
btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
{
@@ -87,29 +77,6 @@ bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
}
-BTRFS_WORK_HELPER(worker_helper);
-BTRFS_WORK_HELPER(delalloc_helper);
-BTRFS_WORK_HELPER(flush_delalloc_helper);
-BTRFS_WORK_HELPER(cache_helper);
-BTRFS_WORK_HELPER(submit_helper);
-BTRFS_WORK_HELPER(fixup_helper);
-BTRFS_WORK_HELPER(endio_helper);
-BTRFS_WORK_HELPER(endio_meta_helper);
-BTRFS_WORK_HELPER(endio_meta_write_helper);
-BTRFS_WORK_HELPER(endio_raid56_helper);
-BTRFS_WORK_HELPER(endio_repair_helper);
-BTRFS_WORK_HELPER(rmw_helper);
-BTRFS_WORK_HELPER(endio_write_helper);
-BTRFS_WORK_HELPER(freespace_write_helper);
-BTRFS_WORK_HELPER(delayed_meta_helper);
-BTRFS_WORK_HELPER(readahead_helper);
-BTRFS_WORK_HELPER(qgroup_rescan_helper);
-BTRFS_WORK_HELPER(extent_refs_helper);
-BTRFS_WORK_HELPER(scrub_helper);
-BTRFS_WORK_HELPER(scrubwrc_helper);
-BTRFS_WORK_HELPER(scrubnc_helper);
-BTRFS_WORK_HELPER(scrubparity_helper);
-
static struct __btrfs_workqueue *
__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
unsigned int flags, int limit_active, int thresh)
@@ -250,16 +217,17 @@ out:
}
}
-static void run_ordered_work(struct __btrfs_workqueue *wq)
+static void run_ordered_work(struct __btrfs_workqueue *wq,
+ struct btrfs_work *self)
{
struct list_head *list = &wq->ordered_list;
struct btrfs_work *work;
spinlock_t *lock = &wq->list_lock;
unsigned long flags;
+ void *wtag;
+ bool free_self = false;
while (1) {
- void *wtag;
-
spin_lock_irqsave(lock, flags);
if (list_empty(list))
break;
@@ -285,20 +253,54 @@ static void run_ordered_work(struct __btrfs_workqueue *wq)
list_del(&work->ordered_list);
spin_unlock_irqrestore(lock, flags);
- /*
- * We don't want to call the ordered free functions with the
- * lock held though. Save the work as tag for the trace event,
- * because the callback could free the structure.
- */
- wtag = work;
- work->ordered_free(work);
- trace_btrfs_all_work_done(wq->fs_info, wtag);
+ if (work == self) {
+ /*
+ * This is the work item that the worker is currently
+ * executing.
+ *
+ * The kernel workqueue code guarantees non-reentrancy
+ * of work items. I.e., if a work item with the same
+ * address and work function is queued twice, the second
+ * execution is blocked until the first one finishes. A
+ * work item may be freed and recycled with the same
+ * work function; the workqueue code assumes that the
+ * original work item cannot depend on the recycled work
+ * item in that case (see find_worker_executing_work()).
+ *
+ * Note that different types of Btrfs work can depend on
+ * each other, and one type of work on one Btrfs
+ * filesystem may even depend on the same type of work
+ * on another Btrfs filesystem via, e.g., a loop device.
+ * Therefore, we must not allow the current work item to
+ * be recycled until we are really done, otherwise we
+ * break the above assumption and can deadlock.
+ */
+ free_self = true;
+ } else {
+ /*
+ * We don't want to call the ordered free functions with
+ * the lock held though. Save the work as tag for the
+ * trace event, because the callback could free the
+ * structure.
+ */
+ wtag = work;
+ work->ordered_free(work);
+ trace_btrfs_all_work_done(wq->fs_info, wtag);
+ }
}
spin_unlock_irqrestore(lock, flags);
+
+ if (free_self) {
+ wtag = self;
+ self->ordered_free(self);
+ trace_btrfs_all_work_done(wq->fs_info, wtag);
+ }
}
-static void normal_work_helper(struct btrfs_work *work)
+static void btrfs_work_helper(struct work_struct *normal_work)
{
+ struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
+ normal_work);
struct __btrfs_workqueue *wq;
void *wtag;
int need_order = 0;
@@ -322,21 +324,19 @@ static void normal_work_helper(struct btrfs_work *work)
work->func(work);
if (need_order) {
set_bit(WORK_DONE_BIT, &work->flags);
- run_ordered_work(wq);
+ run_ordered_work(wq, work);
}
if (!need_order)
trace_btrfs_all_work_done(wq->fs_info, wtag);
}
-void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t uniq_func,
- btrfs_func_t func,
- btrfs_func_t ordered_func,
- btrfs_func_t ordered_free)
+void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
+ btrfs_func_t ordered_func, btrfs_func_t ordered_free)
{
work->func = func;
work->ordered_func = ordered_func;
work->ordered_free = ordered_free;
- INIT_WORK(&work->normal_work, uniq_func);
+ INIT_WORK(&work->normal_work, btrfs_work_helper);
INIT_LIST_HEAD(&work->ordered_list);
work->flags = 0;
}
@@ -400,3 +400,11 @@ void btrfs_set_work_high_priority(struct btrfs_work *work)
{
set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
}
+
+void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
+{
+ if (wq->high)
+ flush_workqueue(wq->high->normal_wq);
+
+ flush_workqueue(wq->normal->normal_wq);
+}
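
run_ordered_work() now takes the currently executing item as `self` and defers its ordered_free() until after the list walk: freeing (and possibly recycling) a work item while its work function is still running defeats the workqueue's non-reentrancy check described in the comment above, and can deadlock. A skeleton of the resulting control flow, with a hypothetical list-pop helper:

bool free_self = false;

while ((work = pop_done_ordered(wq))) {	/* pop_done_ordered: hypothetical */
	work->ordered_func(work);
	if (work == self)
		free_self = true;	/* still executing it: free last */
	else
		work->ordered_free(work);
}
if (free_self)
	self->ordered_free(self);	/* safe: the worker is done with it */
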
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 7861c9feba5f..714ab6855423 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -29,42 +29,13 @@ struct btrfs_work {
unsigned long flags;
};
-#define BTRFS_WORK_HELPER_PROTO(name) \
-void btrfs_##name(struct work_struct *arg)
-
-BTRFS_WORK_HELPER_PROTO(worker_helper);
-BTRFS_WORK_HELPER_PROTO(delalloc_helper);
-BTRFS_WORK_HELPER_PROTO(flush_delalloc_helper);
-BTRFS_WORK_HELPER_PROTO(cache_helper);
-BTRFS_WORK_HELPER_PROTO(submit_helper);
-BTRFS_WORK_HELPER_PROTO(fixup_helper);
-BTRFS_WORK_HELPER_PROTO(endio_helper);
-BTRFS_WORK_HELPER_PROTO(endio_meta_helper);
-BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper);
-BTRFS_WORK_HELPER_PROTO(endio_raid56_helper);
-BTRFS_WORK_HELPER_PROTO(endio_repair_helper);
-BTRFS_WORK_HELPER_PROTO(rmw_helper);
-BTRFS_WORK_HELPER_PROTO(endio_write_helper);
-BTRFS_WORK_HELPER_PROTO(freespace_write_helper);
-BTRFS_WORK_HELPER_PROTO(delayed_meta_helper);
-BTRFS_WORK_HELPER_PROTO(readahead_helper);
-BTRFS_WORK_HELPER_PROTO(qgroup_rescan_helper);
-BTRFS_WORK_HELPER_PROTO(extent_refs_helper);
-BTRFS_WORK_HELPER_PROTO(scrub_helper);
-BTRFS_WORK_HELPER_PROTO(scrubwrc_helper);
-BTRFS_WORK_HELPER_PROTO(scrubnc_helper);
-BTRFS_WORK_HELPER_PROTO(scrubparity_helper);
-
-
struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
const char *name,
unsigned int flags,
int limit_active,
int thresh);
-void btrfs_init_work(struct btrfs_work *work, btrfs_work_func_t helper,
- btrfs_func_t func,
- btrfs_func_t ordered_func,
- btrfs_func_t ordered_free);
+void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
+ btrfs_func_t ordered_func, btrfs_func_t ordered_free);
void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
@@ -73,5 +44,6 @@ void btrfs_set_work_high_priority(struct btrfs_work *work);
struct btrfs_fs_info *btrfs_work_owner(const struct btrfs_work *work);
struct btrfs_fs_info *btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
+void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
#endif
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 69f8ab4d91f2..110636757d95 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1422,6 +1422,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
ulist_free(*roots);
+ *roots = NULL;
return ret;
}
node = ulist_next(tmp, &uiter);
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index b0c8094528d1..3d2e2fdbdfb6 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -629,7 +629,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev,
static int btrfsic_process_superblock(struct btrfsic_state *state,
struct btrfs_fs_devices *fs_devices)
{
- struct btrfs_fs_info *fs_info = state->fs_info;
struct btrfs_super_block *selected_super;
struct list_head *dev_head = &fs_devices->devices;
struct btrfs_device *device;
@@ -700,7 +699,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
break;
}
- num_copies = btrfs_num_copies(fs_info, next_bytenr,
+ num_copies = btrfs_num_copies(state->fs_info, next_bytenr,
state->metablock_size);
if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
pr_info("num_copies(log_bytenr=%llu) = %d\n",
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 322ec4b839ed..ebd2ab53e00d 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -308,12 +308,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
struct seq_list *elem)
{
write_lock(&fs_info->tree_mod_log_lock);
- spin_lock(&fs_info->tree_mod_seq_lock);
if (!elem->seq) {
elem->seq = btrfs_inc_tree_mod_seq(fs_info);
list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
}
- spin_unlock(&fs_info->tree_mod_seq_lock);
write_unlock(&fs_info->tree_mod_log_lock);
return elem->seq;
@@ -333,7 +331,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
if (!seq_putting)
return;
- spin_lock(&fs_info->tree_mod_seq_lock);
+ write_lock(&fs_info->tree_mod_log_lock);
list_del(&elem->list);
elem->seq = 0;
@@ -344,24 +342,22 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
* blocker with lower sequence number exists, we
* cannot remove anything from the log
*/
- spin_unlock(&fs_info->tree_mod_seq_lock);
+ write_unlock(&fs_info->tree_mod_log_lock);
return;
}
min_seq = cur_elem->seq;
}
}
- spin_unlock(&fs_info->tree_mod_seq_lock);
/*
* anything that's lower than the lowest existing (read: blocked)
* sequence number can be removed from the tree.
*/
- write_lock(&fs_info->tree_mod_log_lock);
tm_root = &fs_info->tree_mod_log;
for (node = rb_first(tm_root); node; node = next) {
next = rb_next(node);
tm = rb_entry(node, struct tree_mod_elem, node);
- if (tm->seq > min_seq)
+ if (tm->seq >= min_seq)
continue;
rb_erase(node, tm_root);
kfree(tm);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b8e9938f7cd7..5d8b7a9820e1 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -939,14 +939,12 @@ struct btrfs_fs_info {
atomic_t nr_delayed_iputs;
wait_queue_head_t delayed_iputs_wait;
- /* this protects tree_mod_seq_list */
- spinlock_t tree_mod_seq_lock;
atomic64_t tree_mod_seq;
- struct list_head tree_mod_seq_list;
- /* this protects tree_mod_log */
+ /* this protects tree_mod_log and tree_mod_seq_list */
rwlock_t tree_mod_log_lock;
struct rb_root tree_mod_log;
+ struct list_head tree_mod_seq_list;
atomic_t async_delalloc_pages;
@@ -2829,8 +2827,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
int nitems, bool use_global_rsv);
void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *rsv);
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
- bool qgroup_free);
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes);
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
@@ -3184,7 +3181,7 @@ int btrfs_find_name_in_ext_backref(struct extent_buffer *leaf, int slot,
/* file-item.c */
struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr, u64 len);
+ struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u32 *dst);
blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio,
u64 logical_offset);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 6858a05606dd..1c8dd934bac9 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/iversion.h>
+#include <linux/sched/mm.h>
#include "delayed-inode.h"
#include "disk-io.h"
#include "transaction.h"
@@ -803,11 +804,14 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
struct btrfs_delayed_item *delayed_item)
{
struct extent_buffer *leaf;
+ unsigned int nofs_flag;
char *ptr;
int ret;
+ nofs_flag = memalloc_nofs_save();
ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
delayed_item->data_len);
+ memalloc_nofs_restore(nofs_flag);
if (ret < 0 && ret != -EEXIST)
return ret;
@@ -935,6 +939,7 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
struct btrfs_delayed_node *node)
{
struct btrfs_delayed_item *curr, *prev;
+ unsigned int nofs_flag;
int ret = 0;
do_again:
@@ -943,7 +948,9 @@ do_again:
if (!curr)
goto delete_fail;
+ nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
+ memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto delete_fail;
else if (ret > 0) {
@@ -1010,6 +1017,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct btrfs_inode_item *inode_item;
struct extent_buffer *leaf;
+ unsigned int nofs_flag;
int mod;
int ret;
@@ -1022,7 +1030,9 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
else
mod = 1;
+ nofs_flag = memalloc_nofs_save();
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
+ memalloc_nofs_restore(nofs_flag);
if (ret > 0) {
btrfs_release_path(path);
return -ENOENT;
@@ -1073,7 +1083,10 @@ search:
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = -1;
+
+ nofs_flag = memalloc_nofs_save();
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ memalloc_nofs_restore(nofs_flag);
if (ret < 0)
goto err_out;
ASSERT(ret);
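
The memalloc_nofs_save()/restore() pairs added above scope every allocation inside the b-tree operations to GFP_NOFS, so direct reclaim cannot re-enter the filesystem while delayed-item transaction state is held. The idiom in isolation:

unsigned int nofs_flag;

nofs_flag = memalloc_nofs_save();	/* allocs below behave as GFP_NOFS */
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
memalloc_nofs_restore(nofs_flag);	/* restore the previous scope */
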
@@ -1366,8 +1379,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
return -ENOMEM;
async_work->delayed_root = delayed_root;
- btrfs_init_work(&async_work->work, btrfs_delayed_meta_helper,
- btrfs_async_run_delayed_root, NULL, NULL);
+ btrfs_init_work(&async_work->work, btrfs_async_run_delayed_root, NULL,
+ NULL);
async_work->nr = nr;
btrfs_queue_work(fs_info->delayed_workers, &async_work->work);
@@ -1948,12 +1961,19 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
}
inode_id = delayed_nodes[n - 1]->inode_id + 1;
-
- for (i = 0; i < n; i++)
- refcount_inc(&delayed_nodes[i]->refs);
+ for (i = 0; i < n; i++) {
+ /*
+ * Don't increase refs in case the node is dead and
+ * about to be removed from the tree in the loop below
+ */
+ if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
+ delayed_nodes[i] = NULL;
+ }
spin_unlock(&root->inode_lock);
for (i = 0; i < n; i++) {
+ if (!delayed_nodes[i])
+ continue;
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
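
btrfs_kill_all_delayed_nodes() switches to refcount_inc_not_zero() so it can race safely with nodes that are already being torn down: a plain refcount_inc() on a count that has reached zero would resurrect a dying object. The guard in isolation:

for (i = 0; i < n; i++) {
	if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
		delayed_nodes[i] = NULL;	/* dying: skip it later */
}
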
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index a73fc23e2961..26aa22ce0c70 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -318,7 +318,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
if (head->is_data)
return;
- spin_lock(&fs_info->tree_mod_seq_lock);
+ read_lock(&fs_info->tree_mod_log_lock);
if (!list_empty(&fs_info->tree_mod_seq_list)) {
struct seq_list *elem;
@@ -326,7 +326,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct seq_list, list);
seq = elem->seq;
}
- spin_unlock(&fs_info->tree_mod_seq_lock);
+ read_unlock(&fs_info->tree_mod_log_lock);
again:
for (node = rb_first_cached(&head->ref_tree); node;
@@ -344,7 +344,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
struct seq_list *elem;
int ret = 0;
- spin_lock(&fs_info->tree_mod_seq_lock);
+ read_lock(&fs_info->tree_mod_log_lock);
if (!list_empty(&fs_info->tree_mod_seq_list)) {
elem = list_first_entry(&fs_info->tree_mod_seq_list,
struct seq_list, list);
@@ -357,7 +357,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
}
}
- spin_unlock(&fs_info->tree_mod_seq_lock);
+ read_unlock(&fs_info->tree_mod_log_lock);
return ret;
}
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 340f5fdb1bc3..fc5c9b797e31 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -712,43 +712,31 @@ static void end_workqueue_bio(struct bio *bio)
struct btrfs_end_io_wq *end_io_wq = bio->bi_private;
struct btrfs_fs_info *fs_info;
struct btrfs_workqueue *wq;
- btrfs_work_func_t func;
fs_info = end_io_wq->info;
end_io_wq->status = bio->bi_status;
if (bio_op(bio) == REQ_OP_WRITE) {
- if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) {
+ if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA)
wq = fs_info->endio_meta_write_workers;
- func = btrfs_endio_meta_write_helper;
- } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) {
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE)
wq = fs_info->endio_freespace_worker;
- func = btrfs_freespace_write_helper;
- } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
wq = fs_info->endio_raid56_workers;
- func = btrfs_endio_raid56_helper;
- } else {
+ else
wq = fs_info->endio_write_workers;
- func = btrfs_endio_write_helper;
- }
} else {
- if (unlikely(end_io_wq->metadata ==
- BTRFS_WQ_ENDIO_DIO_REPAIR)) {
+ if (unlikely(end_io_wq->metadata == BTRFS_WQ_ENDIO_DIO_REPAIR))
wq = fs_info->endio_repair_workers;
- func = btrfs_endio_repair_helper;
- } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) {
+ else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56)
wq = fs_info->endio_raid56_workers;
- func = btrfs_endio_raid56_helper;
- } else if (end_io_wq->metadata) {
+ else if (end_io_wq->metadata)
wq = fs_info->endio_meta_workers;
- func = btrfs_endio_meta_helper;
- } else {
+ else
wq = fs_info->endio_workers;
- func = btrfs_endio_helper;
- }
}
- btrfs_init_work(&end_io_wq->work, func, end_workqueue_fn, NULL, NULL);
+ btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL);
btrfs_queue_work(wq, &end_io_wq->work);
}
@@ -841,8 +829,8 @@ blk_status_t btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
async->mirror_num = mirror_num;
async->submit_bio_start = submit_bio_start;
- btrfs_init_work(&async->work, btrfs_worker_helper, run_one_async_start,
- run_one_async_done, run_one_async_free);
+ btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
+ run_one_async_free);
async->bio_offset = bio_offset;
@@ -1523,9 +1511,16 @@ int btrfs_init_fs_root(struct btrfs_root *root)
spin_lock_init(&root->ino_cache_lock);
init_waitqueue_head(&root->ino_cache_wait);
- ret = get_anon_bdev(&root->anon_dev);
- if (ret)
- goto fail;
+ /*
+ * Don't assign an anonymous block device to roots that are not
+ * exposed to userspace; the id pool is limited to 1M
+ */
+ if (is_fstree(root->root_key.objectid) &&
+ btrfs_root_refs(&root->root_item) > 0) {
+ ret = get_anon_bdev(&root->anon_dev);
+ if (ret)
+ goto fail;
+ }
mutex_lock(&root->objectid_mutex);
ret = btrfs_find_highest_objectid(root,
@@ -1693,8 +1688,8 @@ static void end_workqueue_fn(struct btrfs_work *work)
bio->bi_status = end_io_wq->status;
bio->bi_private = end_io_wq->private;
bio->bi_end_io = end_io_wq->end_io;
- kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
bio_endio(bio);
+ kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq);
}
static int cleaner_kthread(void *arg)
@@ -2064,7 +2059,7 @@ static void free_root_extent_buffers(struct btrfs_root *root)
}
/* helper to cleanup tree roots */
-static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root)
{
free_root_extent_buffers(info->tree_root);
@@ -2073,7 +2068,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
free_root_extent_buffers(info->csum_root);
free_root_extent_buffers(info->quota_root);
free_root_extent_buffers(info->uuid_root);
- if (chunk_root)
+ if (free_chunk_root)
free_root_extent_buffers(info->chunk_root);
free_root_extent_buffers(info->free_space_root);
}
@@ -2678,7 +2673,6 @@ int open_ctree(struct super_block *sb,
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
- spin_lock_init(&fs_info->tree_mod_seq_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->buffer_lock);
spin_lock_init(&fs_info->unused_bgs_lock);
@@ -3063,6 +3057,18 @@ retry_root_backup:
fs_info->generation = generation;
fs_info->last_trans_committed = generation;
+ /*
+ * If we have a uuid root and we're not being told to rescan we need to
+ * check the generation here so we can set the
+ * BTRFS_FS_UPDATE_UUID_TREE_GEN bit. Otherwise we could commit the
+ * transaction during a balance or the log replay without updating the
+ * uuid generation, and then if we crash we would rescan the uuid tree,
+ * even though it was perfectly fine.
+ */
+ if (fs_info->uuid_root && !btrfs_test_opt(fs_info, RESCAN_UUID_TREE) &&
+ fs_info->generation == btrfs_super_uuid_tree_generation(disk_super))
+ set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
+
ret = btrfs_verify_dev_extents(fs_info);
if (ret) {
btrfs_err(fs_info,
@@ -3173,6 +3179,7 @@ retry_root_backup:
/* do not make disk changes in broken FS or nologreplay is given */
if (btrfs_super_log_root(disk_super) != 0 &&
!btrfs_test_opt(fs_info, NOLOGREPLAY)) {
+ btrfs_info(fs_info, "start tree-log replay");
ret = btrfs_replay_log(fs_info, fs_devices);
if (ret) {
err = ret;
@@ -3208,6 +3215,7 @@ retry_root_backup:
if (IS_ERR(fs_info->fs_root)) {
err = PTR_ERR(fs_info->fs_root);
btrfs_warn(fs_info, "failed to read fs tree: %d", err);
+ fs_info->fs_root = NULL;
goto fail_qgroup;
}
@@ -3291,8 +3299,6 @@ retry_root_backup:
close_ctree(fs_info);
return ret;
}
- } else {
- set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
}
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
@@ -3329,7 +3335,7 @@ fail_block_groups:
btrfs_put_block_group_cache(fs_info);
fail_tree_roots:
- free_root_pointers(fs_info, 1);
+ free_root_pointers(fs_info, true);
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
fail_sb_buffer:
@@ -3359,7 +3365,7 @@ recovery_tree_root:
if (!btrfs_test_opt(fs_info, USEBACKUPROOT))
goto fail_tree_roots;
- free_root_pointers(fs_info, 0);
+ free_root_pointers(fs_info, false);
/* don't use the log in recovery mode, it won't be valid */
btrfs_set_super_log_root(disk_super, 0);
@@ -4006,6 +4012,19 @@ void close_ctree(struct btrfs_fs_info *fs_info)
*/
btrfs_delete_unused_bgs(fs_info);
+ /*
+ * There might be existing delayed inode workers still running
+ * and holding an empty delayed inode item. We must wait for
+ * them to complete first because they can create a transaction.
+ * This happens when someone calls btrfs_balance_delayed_items()
+ * and then a transaction commit runs the same delayed nodes
+ * before any delayed worker has done something with the nodes.
+ * We must wait for any worker here and not at transaction
+ * commit time since that could cause a deadlock.
+ * This is a very rare case.
+ */
+ btrfs_flush_workqueue(fs_info->delayed_workers);
+
ret = btrfs_commit_super(fs_info);
if (ret)
btrfs_err(fs_info, "commit super ret %d", ret);
@@ -4047,10 +4066,17 @@ void close_ctree(struct btrfs_fs_info *fs_info)
invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
btrfs_stop_all_workers(fs_info);
- btrfs_free_block_groups(fs_info);
-
clear_bit(BTRFS_FS_OPEN, &fs_info->flags);
- free_root_pointers(fs_info, 1);
+ free_root_pointers(fs_info, true);
+
+ /*
+ * We must free the block groups after dropping the fs_roots as we could
+ * have had an IO error and have left over tree log blocks that aren't
+ * cleaned up until the fs roots are freed. This makes the block group
+ * accounting appear to be wrong because there are pending reserved bytes,
+ * so make sure we do the block group cleanup afterwards.
+ */
+ btrfs_free_block_groups(fs_info);
iput(fs_info->btree_inode);
@@ -4285,6 +4311,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
cond_resched();
spin_lock(&delayed_refs->lock);
}
+ btrfs_qgroup_destroy_extent_records(trans);
spin_unlock(&delayed_refs->lock);
@@ -4510,7 +4537,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
wake_up(&fs_info->transaction_wait);
btrfs_destroy_delayed_inodes(fs_info);
- btrfs_assert_delayed_root_empty(fs_info);
btrfs_destroy_marked_extents(fs_info, &cur_trans->dirty_pages,
EXTENT_DIRTY);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 75e2f5d1e31d..bcf6329cf3a1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -600,8 +600,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
caching_ctl->block_group = cache;
caching_ctl->progress = cache->key.objectid;
refcount_set(&caching_ctl->count, 1);
- btrfs_init_work(&caching_ctl->work, btrfs_cache_helper,
- caching_thread, NULL, NULL);
+ btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
spin_lock(&cache->lock);
/*
@@ -2524,8 +2523,8 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
btrfs_pin_extent(fs_info, head->bytenr,
head->num_bytes, 1);
if (head->is_data) {
- ret = btrfs_del_csums(trans, fs_info, head->bytenr,
- head->num_bytes);
+ ret = btrfs_del_csums(trans, fs_info->csum_root,
+ head->bytenr, head->num_bytes);
}
}
@@ -4746,6 +4745,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
struct reserve_ticket *ticket = NULL;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
struct btrfs_trans_handle *trans;
u64 bytes_needed;
u64 reclaim_bytes = 0;
@@ -4796,6 +4796,11 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
spin_lock(&delayed_refs_rsv->lock);
reclaim_bytes += delayed_refs_rsv->reserved;
spin_unlock(&delayed_refs_rsv->lock);
+
+ spin_lock(&trans_rsv->lock);
+ reclaim_bytes += trans_rsv->reserved;
+ spin_unlock(&trans_rsv->lock);
+
if (reclaim_bytes >= bytes_needed)
goto commit;
bytes_needed -= reclaim_bytes;
@@ -6147,7 +6152,6 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
* btrfs_delalloc_release_extents - release our outstanding_extents
* @inode: the inode to balance the reservation for.
* @num_bytes: the number of bytes we originally reserved with
- * @qgroup_free: do we need to free qgroup meta reservation or convert them.
*
* When we reserve space we increase outstanding_extents for the extents we may
* add. Once we've set the range as delalloc or created our ordered extents we
@@ -6155,8 +6159,7 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
* temporarily tracked outstanding_extents. This _must_ be used in conjunction
* with btrfs_delalloc_reserve_metadata.
*/
-void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
- bool qgroup_free)
+void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
unsigned num_extents;
@@ -6170,7 +6173,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes,
if (btrfs_is_testing(fs_info))
return;
- btrfs_inode_rsv_release(inode, qgroup_free);
+ btrfs_inode_rsv_release(inode, true);
}
/**
@@ -7057,7 +7060,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
btrfs_release_path(path);
if (is_data) {
- ret = btrfs_del_csums(trans, info, bytenr, num_bytes);
+ ret = btrfs_del_csums(trans, info->csum_root, bytenr,
+ num_bytes);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -7729,6 +7733,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info,
u64 flags, int delalloc)
{
int ret = 0;
+ int cache_block_group_error = 0;
struct btrfs_free_cluster *last_ptr = NULL;
struct btrfs_block_group_cache *block_group = NULL;
struct find_free_extent_ctl ffe_ctl = {0};
@@ -7889,7 +7894,20 @@ have_block_group:
if (unlikely(!ffe_ctl.cached)) {
ffe_ctl.have_caching_bg = true;
ret = cache_block_group(block_group, 0);
- BUG_ON(ret < 0);
+
+ /*
+ * If we get ENOMEM here or some other error we want to
+ * try other block groups, because the failure may not be fatal.
+ * However if we can't find anything else we need to
+ * save our return here so that we return the actual
+ * error that caused problems, not ENOSPC.
+ */
+ if (ret < 0) {
+ if (!cache_block_group_error)
+ cache_block_group_error = ret;
+ ret = 0;
+ goto loop;
+ }
ret = 0;
}
@@ -7976,7 +7994,7 @@ loop:
if (ret > 0)
goto search;
- if (ret == -ENOSPC) {
+ if (ret == -ENOSPC && !cache_block_group_error) {
/*
* Use ffe_ctl->total_free_space as fallback if we can't find
* any contiguous hole.
@@ -7987,6 +8005,8 @@ loop:
space_info->max_extent_size = ffe_ctl.max_extent_size;
spin_unlock(&space_info->lock);
ins->offset = ffe_ctl.max_extent_size;
+ } else if (ret == -ENOSPC) {
+ ret = cache_block_group_error;
}
return ret;
}
@@ -10736,6 +10756,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
&fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
+ /* Once for the block groups rbtree */
+ btrfs_put_block_group(block_group);
+
if (fs_info->first_logical_byte == block_group->key.objectid)
fs_info->first_logical_byte = (u64)-1;
spin_unlock(&fs_info->block_group_cache_lock);
@@ -10845,9 +10868,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- btrfs_put_block_group(block_group);
- btrfs_put_block_group(block_group);
-
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
ret = -EIO;
@@ -10868,7 +10888,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
/* once for the tree */
free_extent_map(em);
}
+
out:
+ /* Once for the lookup reference */
+ btrfs_put_block_group(block_group);
if (remove_rsv)
btrfs_delayed_refs_rsv_release(fs_info, 1);
btrfs_free_path(path);
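
find_free_extent() now treats a cache_block_group() failure as soft: it records the first real error, keeps scanning the remaining block groups, and only surfaces the saved error if the search would otherwise report -ENOSPC. The pattern generalises; a sketch with hypothetical names:

int first_err = 0, ret;
size_t i;

for (i = 0; i < nr_candidates; i++) {	/* nr_candidates/try_candidate: hypothetical */
	ret = try_candidate(i);
	if (ret < 0) {
		if (!first_err)
			first_err = ret;	/* remember e.g. -ENOMEM */
		continue;			/* another candidate may work */
	}
	return 0;				/* success */
}
return first_err ? first_err : -ENOSPC;	/* report the real cause */
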
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0d16ed19e84b..38ac638665a6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1537,8 +1537,8 @@ out:
}
/**
- * find_first_clear_extent_bit - finds the first range that has @bits not set
- * and that starts after @start
+ * find_first_clear_extent_bit - find the first range that has @bits not set.
+ * This range could start before @start.
*
* @tree - the tree to search
* @start - the offset at/after which the found extent should start
@@ -1562,28 +1562,72 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
/* Find first extent with bits cleared */
while (1) {
node = __etree_search(tree, start, &next, &prev, NULL, NULL);
- if (!node) {
+ if (!node && !next && !prev) {
+ /*
+ * Tree is completely empty, send full range and let
+ * caller deal with it
+ */
+ *start_ret = 0;
+ *end_ret = -1;
+ goto out;
+ } else if (!node && !next) {
+ /*
+ * We are past the last allocated chunk, set start at
+ * the end of the last extent.
+ */
+ state = rb_entry(prev, struct extent_state, rb_node);
+ *start_ret = state->end + 1;
+ *end_ret = -1;
+ goto out;
+ } else if (!node) {
node = next;
- if (!node) {
+ }
+ /*
+ * At this point 'node' either contains 'start' or start is
+ * before 'node'
+ */
+ state = rb_entry(node, struct extent_state, rb_node);
+
+ if (in_range(start, state->start, state->end - state->start + 1)) {
+ if (state->state & bits) {
/*
- * We are past the last allocated chunk,
- * set start at the end of the last extent. The
- * device alloc tree should never be empty so
- * prev is always set.
+ * |--range with bits sets--|
+ * |
+ * start
*/
- ASSERT(prev);
- state = rb_entry(prev, struct extent_state, rb_node);
- *start_ret = state->end + 1;
- *end_ret = -1;
- goto out;
+ start = state->end + 1;
+ } else {
+ /*
+ * 'start' falls within a range that doesn't
+ * have the bits set, so take its start as
+ * the beginning of the desired range
+ *
+ * |--range with bits cleared----|
+ * |
+ * start
+ */
+ *start_ret = state->start;
+ break;
}
- }
- state = rb_entry(node, struct extent_state, rb_node);
- if (in_range(start, state->start, state->end - state->start + 1) &&
- (state->state & bits)) {
- start = state->end + 1;
} else {
- *start_ret = start;
+ /*
+ * |---prev range---|---hole/unset---|---node range---|
+ * |
+ * start
+ *
+ * or
+ *
+ * |---hole/unset--||--first node--|
+ * 0 |
+ * start
+ */
+ if (prev) {
+ state = rb_entry(prev, struct extent_state,
+ rb_node);
+ *start_ret = state->end + 1;
+ } else {
+ *start_ret = 0;
+ }
break;
}
}
@@ -1838,7 +1882,7 @@ static int __process_pages_contig(struct address_space *mapping,
if (page_ops & PAGE_SET_PRIVATE2)
SetPagePrivate2(pages[i]);
- if (pages[i] == locked_page) {
+ if (locked_page && pages[i] == locked_page) {
put_page(pages[i]);
pages_locked++;
continue;
@@ -1858,7 +1902,8 @@ static int __process_pages_contig(struct address_space *mapping,
if (!PageDirty(pages[i]) ||
pages[i]->mapping != mapping) {
unlock_page(pages[i]);
- put_page(pages[i]);
+ for (; i < ret; i++)
+ put_page(pages[i]);
err = -EAGAIN;
goto out;
}
@@ -3901,6 +3946,11 @@ int btree_write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
+ /*
+ * A scan that starts from the beginning does not need to cycle
+ * over the range, so mark it as scanned.
+ */
+ scanned = (index == 0);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
@@ -3918,7 +3968,6 @@ retry:
tag))) {
unsigned i;
- scanned = 1;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
@@ -4043,6 +4092,11 @@ static int extent_write_cache_pages(struct address_space *mapping,
if (wbc->range_cyclic) {
index = mapping->writeback_index; /* Start from prev offset */
end = -1;
+ /*
+ * A scan that starts from the beginning does not need to cycle
+ * over the range, so mark it as scanned.
+ */
+ scanned = (index == 0);
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
@@ -4076,11 +4130,10 @@ retry:
&index, end, tag))) {
unsigned i;
- scanned = 1;
for (i = 0; i < nr_pages; i++) {
struct page *page = pvec.pages[i];
- done_index = page->index;
+ done_index = page->index + 1;
/*
* At this point we hold neither the i_pages lock nor
* the page lock: the page may be truncated or
@@ -4115,16 +4168,6 @@ retry:
ret = __extent_writepage(page, wbc, epd);
if (ret < 0) {
- /*
- * done_index is set past this page,
- * so media errors will not choke
- * background writeout for the entire
- * file. This has consequences for
- * range_cyclic semantics (ie. it may
- * not be suitable for data integrity
- * writeout).
- */
- done_index = page->index + 1;
done = 1;
break;
}
@@ -4146,7 +4189,16 @@ retry:
*/
scanned = 1;
index = 0;
- goto retry;
+
+ /*
+ * If we're looping we could run into a page that is locked by a
+ * writer and that writer could be waiting on writeback for a
+ * page in our current bio, and thus deadlock, so flush the
+ * write bio here.
+ */
+ ret = flush_write_bio(epd);
+ if (!ret)
+ goto retry;
}
if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
@@ -4397,6 +4449,8 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
/* once for us */
free_extent_map(em);
+
+ cond_resched(); /* Allow large-extent preemption. */
}
}
return try_release_extent_state(tree, page, mask);
@@ -4930,25 +4984,28 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
- /* the ref bit is tricky. We have to make sure it is set
- * if we have the buffer dirty. Otherwise the
- * code to free a buffer can end up dropping a dirty
- * page
+ /*
+ * The TREE_REF bit is first set when the extent_buffer is added
+ * to the radix tree. It is also reset, if unset, when a new reference
+ * is created by find_extent_buffer.
*
- * Once the ref bit is set, it won't go away while the
- * buffer is dirty or in writeback, and it also won't
- * go away while we have the reference count on the
- * eb bumped.
+ * It is only cleared in two cases: freeing the last non-tree
+ * reference to the extent_buffer when its STALE bit is set or
+ * calling releasepage when the tree reference is the only reference.
*
- * We can't just set the ref bit without bumping the
- * ref on the eb because free_extent_buffer might
- * see the ref bit and try to clear it. If this happens
- * free_extent_buffer might end up dropping our original
- * ref by mistake and freeing the page before we are able
- * to add one more ref.
+ * In both cases, care is taken to ensure that the extent_buffer's
+ * pages are not under io. However, releasepage can be concurrently
+ * called with creating new references, which is prone to race
+ * conditions between the calls to check_buffer_tree_ref in those
+ * codepaths and clearing TREE_REF in try_release_extent_buffer.
*
- * So bump the ref count first, then set the bit. If someone
- * beat us to it, drop the ref we added.
+ * The actual lifetime of the extent_buffer in the radix tree is
+ * adequately protected by the refcount, but the TREE_REF bit and
+ * its corresponding reference are not. To protect against this
+ * class of races, we call check_buffer_tree_ref from the codepaths
+ * which trigger io after they set eb->io_pages. Note that once io is
+ * initiated, TREE_REF can no longer be cleared, so that is the
+ * moment at which any such race is best fixed.
*/
refs = atomic_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -5025,12 +5082,14 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
eb = alloc_dummy_extent_buffer(fs_info, start);
if (!eb)
- return NULL;
+ return ERR_PTR(-ENOMEM);
eb->fs_info = fs_info;
again:
ret = radix_tree_preload(GFP_NOFS);
- if (ret)
+ if (ret) {
+ exists = ERR_PTR(ret);
goto free_eb;
+ }
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
start >> PAGE_SHIFT, eb);
@@ -5396,6 +5455,11 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
+ /*
+ * It is possible for releasepage to clear the TREE_REF bit before we
+ * set io_pages. See check_buffer_tree_ref for a more detailed comment.
+ */
+ check_buffer_tree_ref(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
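
read_extent_buffer_pages() calls check_buffer_tree_ref() again right after setting eb->io_pages: per the rewritten comment above, releasepage can legally clear TREE_REF up to that moment, so the read path re-asserts the reference once I/O is committed. Condensed:

atomic_set(&eb->io_pages, num_reads);
/* from here on a racing releasepage can no longer clear TREE_REF */
check_buffer_tree_ref(eb);

After this point a concurrent releasepage sees I/O in flight and backs off, keeping the TREE_REF bit and its reference consistent.
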
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 9558d79faf1e..11f39d975525 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -233,6 +233,17 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
struct extent_map *merge = NULL;
struct rb_node *rb;
+ /*
+ * We can't modify an extent map that is in the tree and that is being
+ * used by another task, as it can cause that other task to see it in
+ * inconsistent state during the merging. We always have 1 reference for
+ * the tree and 1 for this task (which is unpinning the extent map or
+ * clearing the logging flag), so anything > 2 means it's being used by
+ * other tasks too.
+ */
+ if (refcount_read(&em->refs) > 2)
+ return;
+
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
if (rb)
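
try_merge_map() now refuses to merge an extent map whose refcount exceeds two: one reference belongs to the tree and one to the task doing the unpin or flag-clear, so any extra reference means another task may be reading the map concurrently and could observe it mid-merge. The guard in isolation:

/* 1 ref for the tree + 1 for us == 2; more means concurrent readers */
if (refcount_read(&em->refs) > 2)
	return;
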
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index d431ea8198e4..ce5a413d334f 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -269,7 +269,8 @@ found:
csum += count * csum_size;
nblocks -= count;
next:
- while (count--) {
+ while (count > 0) {
+ count--;
disk_bytenr += fs_info->sectorsize;
offset += fs_info->sectorsize;
page_bytes_left -= fs_info->sectorsize;
@@ -583,9 +584,9 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
* range of bytes.
*/
int btrfs_del_csums(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info, u64 bytenr, u64 len)
+ struct btrfs_root *root, u64 bytenr, u64 len)
{
- struct btrfs_root *root = fs_info->csum_root;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path;
struct btrfs_key key;
u64 end_byte = bytenr + len;
@@ -595,6 +596,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
int blocksize_bits = fs_info->sb->s_blocksize_bits;
+ ASSERT(root == fs_info->csum_root ||
+ root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -787,10 +791,12 @@ again:
nritems = btrfs_header_nritems(path->nodes[0]);
if (!nritems || (path->slots[0] >= nritems - 1)) {
ret = btrfs_next_leaf(root, path);
- if (ret == 1)
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
found_next = 1;
- if (ret != 0)
goto insert;
+ }
slot = path->slots[0];
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3868eb6a17f3..6dd68197419d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1645,6 +1645,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
break;
}
+ only_release_metadata = false;
sector_offset = pos & (fs_info->sectorsize - 1);
reserve_bytes = round_up(write_bytes + sector_offset,
fs_info->sectorsize);
@@ -1701,7 +1702,7 @@ again:
force_page_uptodate);
if (ret) {
btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes, true);
+ reserve_bytes);
break;
}
@@ -1713,7 +1714,7 @@ again:
if (extents_locked == -EAGAIN)
goto again;
btrfs_delalloc_release_extents(BTRFS_I(inode),
- reserve_bytes, true);
+ reserve_bytes);
ret = extents_locked;
break;
}
@@ -1781,8 +1782,7 @@ again:
else
free_extent_state(cached_state);
- btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes,
- true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
if (ret) {
btrfs_drop_pages(pages, num_pages);
break;
@@ -1801,7 +1801,6 @@ again:
set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, EXTENT_NORESERVE, NULL,
NULL, GFP_NOFS);
- only_release_metadata = false;
}
btrfs_drop_pages(pages, num_pages);
@@ -1913,9 +1912,10 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
(iocb->ki_flags & IOCB_NOWAIT))
return -EOPNOTSUPP;
- if (!inode_trylock(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
return -EAGAIN;
+ } else {
inode_lock(inode);
}
@@ -2082,6 +2082,16 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
btrfs_init_log_ctx(&ctx, inode);
/*
+ * Set the range to full if the NO_HOLES feature is not enabled.
+ * This is to avoid missing file extent items representing holes after
+ * replaying the log.
+ */
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ start = 0;
+ end = LLONG_MAX;
+ }
+
+ /*
* We write the dirty pages in the range and wait until they complete
* out of the ->i_mutex. That way multiple tasks can flush the dirty
* pages concurrently, which improves performance. See
@@ -2135,6 +2145,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
+ up_write(&BTRFS_I(inode)->dio_sem);
inode_unlock(inode);
goto out;
}
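The write-path locking above now checks IOCB_NOWAIT first and only then chooses between a trylock and a blocking lock. The same shape in a userspace sketch with POSIX mutexes, where the nowait flag stands in for IOCB_NOWAIT:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;

static int write_begin(bool nowait)
{
        if (nowait) {
                /* Caller asked not to block: fail fast with EAGAIN. */
                if (pthread_mutex_trylock(&inode_lock))
                        return -EAGAIN;
        } else {
                pthread_mutex_lock(&inode_lock);
        }
        /* ... perform the write under the lock ... */
        pthread_mutex_unlock(&inode_lock);
        return 0;
}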
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 232546190656..eaf1a31ceaf2 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -382,6 +382,12 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode
if (uptodate && !PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "free space cache page truncated");
+ io_ctl_drop_pages(io_ctl);
+ return -EIO;
+ }
if (!PageUptodate(page)) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"error reading free space cache");
@@ -2160,7 +2166,7 @@ out:
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, bool update_stat)
{
- struct btrfs_free_space *left_info;
+ struct btrfs_free_space *left_info = NULL;
struct btrfs_free_space *right_info;
bool merged = false;
u64 offset = info->offset;
@@ -2175,7 +2181,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
if (right_info && rb_prev(&right_info->offset_index))
left_info = rb_entry(rb_prev(&right_info->offset_index),
struct btrfs_free_space, offset_index);
- else
+ else if (!right_info)
left_info = tree_search_offset(ctl, offset - 1, 0, 0);
if (right_info && !right_info->bitmap) {
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index ffca2abf13d0..e1b50c62ba65 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -483,12 +483,13 @@ again:
ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
prealloc, prealloc, &alloc_hint);
if (ret) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
+ btrfs_delalloc_release_metadata(BTRFS_I(inode), prealloc, true);
goto out_put;
}
ret = btrfs_write_out_ino_cache(root, trans, path, inode);
- btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc, false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), prealloc);
out_put:
iput(inode);
out_release:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 40ac00730695..fe10da44a4d2 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -471,6 +471,7 @@ static noinline void compress_file_range(struct async_chunk *async_chunk,
u64 start = async_chunk->start;
u64 end = async_chunk->end;
u64 actual_end;
+ u64 i_size;
int ret = 0;
struct page **pages = NULL;
unsigned long nr_pages;
@@ -484,7 +485,19 @@ static noinline void compress_file_range(struct async_chunk *async_chunk,
inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
SZ_16K);
- actual_end = min_t(u64, i_size_read(inode), end + 1);
+ /*
+ * We need to save i_size before now because it could change in between
+ * us evaluating the size and assigning it. This is because we lock and
+ * unlock the page in truncate and fallocate, and then modify the i_size
+ * later on.
+ *
+ * The barriers are to emulate READ_ONCE, remove that once i_size_read
+ * does that for us.
+ */
+ barrier();
+ i_size = i_size_read(inode);
+ barrier();
+ actual_end = min_t(u64, i_size, end + 1);
again:
will_compress = 0;
nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
@@ -688,10 +701,12 @@ cleanup_and_bail_uncompressed:
* to our extent and set things up for the async work queue to run
* cow_file_range to do the normal delalloc dance.
*/
- if (page_offset(async_chunk->locked_page) >= start &&
- page_offset(async_chunk->locked_page) <= end)
+ if (async_chunk->locked_page &&
+ (page_offset(async_chunk->locked_page) >= start &&
+ page_offset(async_chunk->locked_page) <= end)) {
__set_page_dirty_nobuffers(async_chunk->locked_page);
/* unlocked later on in the async handlers */
+ }
if (redirty)
extent_range_redirty_for_io(inode, start, end);
@@ -781,7 +796,7 @@ retry:
async_extent->start +
async_extent->ram_size - 1,
WB_SYNC_ALL);
- else if (ret)
+ else if (ret && async_chunk->locked_page)
unlock_page(async_chunk->locked_page);
kfree(async_extent);
cond_resched();
@@ -963,6 +978,7 @@ static noinline int cow_file_range(struct inode *inode,
u64 num_bytes;
unsigned long ram_size;
u64 cur_alloc_size = 0;
+ u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
struct extent_map *em;
@@ -1014,10 +1030,26 @@ static noinline int cow_file_range(struct inode *inode,
btrfs_drop_extent_cache(BTRFS_I(inode), start,
start + num_bytes - 1, 0);
+ /*
+ * Relocation relies on the relocated extents to have exactly the same
+ * size as the original extents. Normally writeback for relocation data
+ * extents follows a NOCOW path because relocation preallocates the
+ * extents. However, due to an operation such as scrub turning a block
+ * group to RO mode, it may fall back to COW mode, so we must make sure
+ * an extent allocated during COW has exactly the requested size and can
+ * not be split into smaller extents, otherwise relocation breaks and
+ * fails during the stage where it updates the bytenr of file extent
+ * items.
+ */
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ min_alloc_size = num_bytes;
+ else
+ min_alloc_size = fs_info->sectorsize;
+
while (num_bytes > 0) {
cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
- fs_info->sectorsize, 0, alloc_hint,
+ min_alloc_size, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
goto out_unlock;
@@ -1122,7 +1154,7 @@ out_unlock:
*/
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
- start + cur_alloc_size,
+ start + cur_alloc_size - 1,
start + cur_alloc_size,
locked_page,
clear_bits,
@@ -1258,14 +1290,27 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
async_chunk[i].inode = inode;
async_chunk[i].start = start;
async_chunk[i].end = cur_end;
- async_chunk[i].locked_page = locked_page;
async_chunk[i].write_flags = write_flags;
INIT_LIST_HEAD(&async_chunk[i].extents);
- btrfs_init_work(&async_chunk[i].work,
- btrfs_delalloc_helper,
- async_cow_start, async_cow_submit,
- async_cow_free);
+ /*
+ * The locked_page comes all the way from writepage and it's
+ * the original page we were actually given. As we spread
+ * this large delalloc region across multiple async_chunk
+ * structs, only the first struct needs a pointer to locked_page.
+ *
+ * This way we don't need racy decisions about who is supposed
+ * to unlock it.
+ */
+ if (locked_page) {
+ async_chunk[i].locked_page = locked_page;
+ locked_page = NULL;
+ } else {
+ async_chunk[i].locked_page = NULL;
+ }
+
+ btrfs_init_work(&async_chunk[i].work, async_cow_start,
+ async_cow_submit, async_cow_free);
nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
atomic_add(nr_pages, &fs_info->async_delalloc_pages);
@@ -1416,7 +1461,7 @@ next_slot:
btrfs_file_extent_num_bytes(leaf, fi);
disk_num_bytes =
btrfs_file_extent_disk_num_bytes(leaf, fi);
- if (extent_end <= start) {
+ if (extent_end <= cur_offset) {
path->slots[0]++;
goto next_slot;
}
@@ -2098,6 +2143,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
/* see btrfs_writepage_start_hook for details on why this is required */
struct btrfs_writepage_fixup {
struct page *page;
+ struct inode *inode;
struct btrfs_work work;
};
@@ -2111,27 +2157,71 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
struct inode *inode;
u64 page_start;
u64 page_end;
- int ret;
+ int ret = 0;
+ bool free_delalloc_space = true;
fixup = container_of(work, struct btrfs_writepage_fixup, work);
page = fixup->page;
+ inode = fixup->inode;
+ page_start = page_offset(page);
+ page_end = page_offset(page) + PAGE_SIZE - 1;
+
+ /*
+ * This is similar to page_mkwrite, we need to reserve the space before
+ * we take the page lock.
+ */
+ ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
+ PAGE_SIZE);
again:
lock_page(page);
+
+ /*
+ * Before we queued this fixup, we took a reference on the page.
+ * page->mapping may go NULL, but it shouldn't be moved to a different
+ * address space.
+ */
if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
- ClearPageChecked(page);
+ /*
+ * Unfortunately this is a little tricky, either
+ *
+ * 1) We got here and our page had already been dealt with and
+ * we reserved our space, thus ret == 0, so we need to just
+ * drop our space reservation and bail. This can happen the
+ * first time we come into the fixup worker, or could happen
+ * while waiting for the ordered extent.
+ * 2) Our page was already dealt with, but we happened to get an
+ * ENOSPC above from the btrfs_delalloc_reserve_space. In
+ * this case we obviously don't have anything to release, but
+ * because the page was already dealt with we don't want to
+ * mark the page with an error, so make sure we're resetting
+ * ret to 0. This is why we have this check _before_ the ret
+ * check, because we do not want to have a surprise ENOSPC
+ * when the page was already properly dealt with.
+ */
+ if (!ret) {
+ btrfs_delalloc_release_extents(BTRFS_I(inode),
+ PAGE_SIZE);
+ btrfs_delalloc_release_space(inode, data_reserved,
+ page_start, PAGE_SIZE,
+ true);
+ }
+ ret = 0;
goto out_page;
}
- inode = page->mapping->host;
- page_start = page_offset(page);
- page_end = page_offset(page) + PAGE_SIZE - 1;
+ /*
+ * We can't mess with the page state unless it is locked, so now that
+ * it is locked bail if we failed to make our space reservation.
+ */
+ if (ret)
+ goto out_page;
lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
&cached_state);
/* already ordered? We're done */
if (PagePrivate2(page))
- goto out;
+ goto out_reserved;
ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
PAGE_SIZE);
@@ -2144,35 +2234,49 @@ again:
goto again;
}
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
- PAGE_SIZE);
- if (ret) {
- mapping_set_error(page->mapping, ret);
- end_extent_writepage(page, ret, page_start, page_end);
- ClearPageChecked(page);
- goto out;
- }
-
ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
&cached_state, 0);
+ if (ret)
+ goto out_reserved;
+
+ /*
+ * Everything went as planned, we're now the owner of a dirty page with
+ * delayed allocation bits set and space reserved for our COW
+ * destination.
+ *
+ * The page was dirty when we started, nothing should have cleaned it.
+ */
+ BUG_ON(!PageDirty(page));
+ free_delalloc_space = false;
+out_reserved:
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
+ if (free_delalloc_space)
+ btrfs_delalloc_release_space(inode, data_reserved, page_start,
+ PAGE_SIZE, true);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+ &cached_state);
+out_page:
if (ret) {
+ /*
+ * We hit ENOSPC or other errors. Update the mapping and page
+ * to reflect the errors and clean the page.
+ */
mapping_set_error(page->mapping, ret);
end_extent_writepage(page, ret, page_start, page_end);
- ClearPageChecked(page);
- goto out;
+ clear_page_dirty_for_io(page);
+ SetPageError(page);
}
-
ClearPageChecked(page);
- set_page_dirty(page);
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, false);
-out:
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
- &cached_state);
-out_page:
unlock_page(page);
put_page(page);
kfree(fixup);
extent_changeset_free(data_reserved);
+ /*
+ * As a precaution, do a delayed iput in case it would be the last iput
+ * that could need flushing space. Recursing back to the fixup worker
+ * would deadlock.
+ */
+ btrfs_add_delayed_iput(inode);
}
/*
@@ -2196,6 +2300,13 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
if (TestClearPagePrivate2(page))
return 0;
+ /*
+ * PageChecked is set below when we create a fixup worker for this page;
+ * don't try to create another one if we're already PageChecked().
+ *
+ * The extent_io writepage code will redirty the page if we send back
+ * EAGAIN.
+ */
if (PageChecked(page))
return -EAGAIN;
@@ -2203,13 +2314,21 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end)
if (!fixup)
return -EAGAIN;
+ /*
+ * We are already holding a reference to this inode from
+ * write_cache_pages. We need to hold it because the space reservation
+ * takes place outside of the page lock, and we can't trust
+ * page->mapping outside of the page lock.
+ */
+ ihold(inode);
SetPageChecked(page);
get_page(page);
- btrfs_init_work(&fixup->work, btrfs_fixup_helper,
- btrfs_writepage_fixup_worker, NULL, NULL);
+ btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
fixup->page = page;
+ fixup->inode = inode;
btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
- return -EBUSY;
+
+ return -EAGAIN;
}
static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
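The reworked fixup worker above reserves delalloc space before taking the page lock, then revalidates the page and releases the reservation on every bail-out path. A sketch of that ordering under stated assumptions (reserve(), release() and page_still_ours() are illustrative stubs, not kernel helpers):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;

static int reserve(void) { return 0; }          /* may fail, e.g. -ENOSPC */
static void release(void) { }
static bool page_still_ours(void) { return true; }

static int fixup_worker(void)
{
        int ret = reserve();            /* 1: reserve before locking */

        pthread_mutex_lock(&page_lock); /* 2: only now take the lock */
        if (!page_still_ours()) {
                /* Someone else finished the page: drop the reservation. */
                if (!ret)
                        release();
                ret = 0;
                goto out;
        }
        if (ret)
                goto out;       /* locked, but the reservation failed */
        /* 3: page is ours and space is reserved: do the fixup */
out:
        pthread_mutex_unlock(&page_lock);
        return ret;
}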
@@ -3199,7 +3318,6 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered_extent = NULL;
struct btrfs_workqueue *wq;
- btrfs_work_func_t func;
trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
@@ -3208,16 +3326,12 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
end - start + 1, uptodate))
return;
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (btrfs_is_free_space_inode(BTRFS_I(inode)))
wq = fs_info->endio_freespace_worker;
- func = btrfs_freespace_write_helper;
- } else {
+ else
wq = fs_info->endio_write_workers;
- func = btrfs_endio_write_helper;
- }
- btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
- NULL);
+ btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL);
btrfs_queue_work(wq, &ordered_extent->work);
}
@@ -4116,18 +4230,30 @@ out:
}
static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
- struct inode *dir, u64 objectid,
- const char *name, int name_len)
+ struct inode *dir, struct dentry *dentry)
{
struct btrfs_root *root = BTRFS_I(dir)->root;
+ struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
+ const char *name = dentry->d_name.name;
+ int name_len = dentry->d_name.len;
u64 index;
int ret;
+ u64 objectid;
u64 dir_ino = btrfs_ino(BTRFS_I(dir));
+ if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
+ objectid = inode->root->root_key.objectid;
+ } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
+ objectid = inode->location.objectid;
+ } else {
+ WARN_ON(1);
+ return -EINVAL;
+ }
+
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -4149,13 +4275,16 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
- ret = btrfs_del_root_ref(trans, objectid, root->root_key.objectid,
- dir_ino, &index, name, name_len);
- if (ret < 0) {
- if (ret != -ENOENT) {
- btrfs_abort_transaction(trans, ret);
- goto out;
- }
+ /*
+ * This is a placeholder inode for a subvolume we didn't have a
+ * reference to at the time of the snapshot creation. In the meantime
+ * we could have renamed the real subvol link into our snapshot, so
+ * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
+ * Instead simply look up the dir_index_item for this entry so we can
+ * remove it. Otherwise we know we have a ref to the root and we can
+ * call btrfs_del_root_ref, and it _shouldn't_ fail.
+ */
+ if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
di = btrfs_search_dir_index_item(root, path, dir_ino,
name, name_len);
if (IS_ERR_OR_NULL(di)) {
@@ -4170,8 +4299,16 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
index = key.offset;
+ btrfs_release_path(path);
+ } else {
+ ret = btrfs_del_root_ref(trans, objectid,
+ root->root_key.objectid, dir_ino,
+ &index, name, name_len);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out;
+ }
}
- btrfs_release_path(path);
ret = btrfs_delete_delayed_dir_index(trans, BTRFS_I(dir), index);
if (ret) {
@@ -4365,8 +4502,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
btrfs_record_snapshot_destroy(trans, BTRFS_I(dir));
- ret = btrfs_unlink_subvol(trans, dir, dest->root_key.objectid,
- dentry->d_name.name, dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, dir, dentry);
if (ret) {
err = ret;
btrfs_abort_transaction(trans, ret);
@@ -4411,6 +4547,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
}
}
+ free_anon_bdev(dest->anon_dev);
+ dest->anon_dev = 0;
out_end_trans:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
@@ -4461,10 +4599,7 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
return PTR_ERR(trans);
if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- err = btrfs_unlink_subvol(trans, dir,
- BTRFS_I(inode)->location.objectid,
- dentry->d_name.name,
- dentry->d_name.len);
+ err = btrfs_unlink_subvol(trans, dir, dentry);
goto out;
}
@@ -4545,6 +4680,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
u64 bytes_deleted = 0;
bool be_nice = false;
bool should_throttle = false;
+ const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
+ struct extent_state *cached_state = NULL;
BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
@@ -4561,6 +4698,10 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->reada = READA_BACK;
+ if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
+ lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, (u64)-1,
+ &cached_state);
+
/*
* We want to drop from the next block forward in case this new size is
* not block aligned since we will be keeping the last block of the
@@ -4597,7 +4738,6 @@ search_again:
goto out;
}
- path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto out;
@@ -4749,7 +4889,6 @@ delete:
root == fs_info->tree_root)) {
struct btrfs_ref ref = { 0 };
- btrfs_set_path_blocking(path);
bytes_deleted += extent_num_bytes;
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF,
@@ -4825,6 +4964,8 @@ out:
if (!ret && last_size > new_size)
last_size = new_size;
btrfs_ordered_update_i_size(inode, last_size, NULL);
+ unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start,
+ (u64)-1, &cached_state);
}
btrfs_free_path(path);
@@ -4878,7 +5019,7 @@ again:
if (!page) {
btrfs_delalloc_release_space(inode, data_reserved,
block_start, blocksize, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
ret = -ENOMEM;
goto out;
}
@@ -4946,7 +5087,7 @@ out_unlock:
if (ret)
btrfs_delalloc_release_space(inode, data_reserved, block_start,
blocksize, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, (ret != 0));
+ btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
unlock_page(page);
put_page(page);
out:
@@ -5630,7 +5771,6 @@ static void inode_tree_add(struct inode *inode)
static void inode_tree_del(struct inode *inode)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
int empty = 0;
@@ -5643,7 +5783,6 @@ static void inode_tree_del(struct inode *inode)
spin_unlock(&root->inode_lock);
if (empty && btrfs_root_refs(&root->root_item) == 0) {
- synchronize_srcu(&fs_info->subvol_srcu);
spin_lock(&root->inode_lock);
empty = RB_EMPTY_ROOT(&root->inode_tree);
spin_unlock(&root->inode_lock);
@@ -8153,18 +8292,14 @@ static void __endio_write_update_ordered(struct inode *inode,
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_ordered_extent *ordered = NULL;
struct btrfs_workqueue *wq;
- btrfs_work_func_t func;
u64 ordered_offset = offset;
u64 ordered_bytes = bytes;
u64 last_offset;
- if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
+ if (btrfs_is_free_space_inode(BTRFS_I(inode)))
wq = fs_info->endio_freespace_worker;
- func = btrfs_freespace_write_helper;
- } else {
+ else
wq = fs_info->endio_write_workers;
- func = btrfs_endio_write_helper;
- }
while (ordered_offset < offset + bytes) {
last_offset = ordered_offset;
@@ -8172,9 +8307,8 @@ static void __endio_write_update_ordered(struct inode *inode,
&ordered_offset,
ordered_bytes,
uptodate)) {
- btrfs_init_work(&ordered->work, func,
- finish_ordered_fn,
- NULL, NULL);
+ btrfs_init_work(&ordered->work, finish_ordered_fn, NULL,
+ NULL);
btrfs_queue_work(wq, &ordered->work);
}
/*
@@ -8373,7 +8507,6 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
/* bio split */
ASSERT(map_length <= INT_MAX);
- atomic_inc(&dip->pending_bios);
do {
clone_len = min_t(int, submit_len, map_length);
@@ -8424,7 +8557,8 @@ submit:
if (!status)
return 0;
- bio_put(bio);
+ if (bio != orig_bio)
+ bio_put(bio);
out_err:
dip->errors = 1;
/*
@@ -8465,7 +8599,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
bio->bi_private = dip;
dip->orig_bio = bio;
dip->dio_bio = dio_bio;
- atomic_set(&dip->pending_bios, 0);
+ atomic_set(&dip->pending_bios, 1);
io_bio = btrfs_io_bio(bio);
io_bio->logical = file_offset;
@@ -8612,9 +8746,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
}
ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
offset, count);
@@ -8666,7 +8797,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
} else if (ret >= 0 && (size_t)ret < count)
btrfs_delalloc_release_space(inode, data_reserved,
offset, count - (size_t)ret, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), count, false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), count);
}
out:
if (wakeup)
@@ -9019,7 +9150,7 @@ again:
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
if (!ret2) {
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
sb_end_pagefault(inode->i_sb);
extent_changeset_free(data_reserved);
return VM_FAULT_LOCKED;
@@ -9028,7 +9159,7 @@ again:
out_unlock:
unlock_page(page);
out:
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE, (ret != 0));
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
btrfs_delalloc_release_space(inode, data_reserved, page_start,
reserved_space, (ret != 0));
out_noreserve:
@@ -9460,7 +9591,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
- u64 root_objectid;
int ret;
bool root_log_pinned = false;
bool dest_log_pinned = false;
@@ -9478,9 +9608,8 @@ static int btrfs_rename_exchange(struct inode *old_dir,
btrfs_init_log_ctx(&ctx_dest, new_inode);
/* close the race window with snapshot create/destroy ioctl */
- if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
- down_read(&fs_info->subvol_sem);
- if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
+ new_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&fs_info->subvol_sem);
/*
@@ -9497,6 +9626,9 @@ static int btrfs_rename_exchange(struct inode *old_dir,
goto out_notrans;
}
+ if (dest != root)
+ btrfs_record_root_in_trans(trans, dest);
+
/*
* We need to find a free sequence number both in the source and
* in the destination directory for the exchange.
@@ -9564,10 +9696,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
- root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
- ret = btrfs_unlink_subvol(trans, old_dir, root_objectid,
- old_dentry->d_name.name,
- old_dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else { /* src is an inode */
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(old_dentry->d_inode),
@@ -9583,10 +9712,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* dest is a subvolume */
if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
- root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
- ret = btrfs_unlink_subvol(trans, new_dir, root_objectid,
- new_dentry->d_name.name,
- new_dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
} else { /* dest is an inode */
ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
BTRFS_I(new_dentry->d_inode),
@@ -9691,6 +9817,18 @@ out_fail:
commit_transaction = true;
}
if (commit_transaction) {
+ /*
+ * We may have set commit_transaction when logging the new name
+ * in the destination root, in which case we left the source
+ * root context in the list of log contexts. So make sure we
+ * remove it to avoid invalid memory accesses, since the context
+ * was allocated in our stack frame.
+ */
+ if (sync_log_root) {
+ mutex_lock(&root->log_mutex);
+ list_del_init(&ctx_root.list);
+ mutex_unlock(&root->log_mutex);
+ }
ret = btrfs_commit_transaction(trans);
} else {
int ret2;
@@ -9699,11 +9837,13 @@ out_fail:
ret = ret ? ret : ret2;
}
out_notrans:
- if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
- up_read(&fs_info->subvol_sem);
- if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
+ if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
+ old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
+ ASSERT(list_empty(&ctx_root.list));
+ ASSERT(list_empty(&ctx_dest.list));
+
return ret;
}
@@ -9770,7 +9910,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_inode = d_inode(new_dentry);
struct inode *old_inode = d_inode(old_dentry);
u64 index = 0;
- u64 root_objectid;
int ret;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
@@ -9878,10 +10017,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode), 1);
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
- root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
- ret = btrfs_unlink_subvol(trans, old_dir, root_objectid,
- old_dentry->d_name.name,
- old_dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
@@ -9900,10 +10036,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_inode->i_ctime = current_time(new_inode);
if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
- root_objectid = BTRFS_I(new_inode)->location.objectid;
- ret = btrfs_unlink_subvol(trans, new_dir, root_objectid,
- new_dentry->d_name.name,
- new_dentry->d_name.len);
+ ret = btrfs_unlink_subvol(trans, new_dir, new_dentry);
BUG_ON(new_inode->i_nlink == 0);
} else {
ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
@@ -9983,6 +10116,10 @@ out_fail:
ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
if (ret)
commit_transaction = true;
+ } else if (sync_log) {
+ mutex_lock(&root->log_mutex);
+ list_del(&ctx.list);
+ mutex_unlock(&root->log_mutex);
}
if (commit_transaction) {
ret = btrfs_commit_transaction(trans);
@@ -10048,8 +10185,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode
init_completion(&work->completion);
INIT_LIST_HEAD(&work->list);
work->inode = inode;
- btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
- btrfs_run_delalloc_work, NULL, NULL);
+ btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
return work;
}
@@ -10314,6 +10450,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_key ins;
u64 cur_offset = start;
+ u64 clear_offset = start;
u64 i_size;
u64 cur_bytes;
u64 last_alloc = (u64)-1;
@@ -10348,6 +10485,15 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
btrfs_end_transaction(trans);
break;
}
+
+ /*
+ * We've reserved this space, and thus converted it from
+ * ->bytes_may_use to ->bytes_reserved. If an error happens from
+ * here on out, we only need to clear our reservation
+ * for the remaining unreserved area, so advance our
+ * clear_offset by our extent size.
+ */
+ clear_offset += ins.offset;
btrfs_dec_block_group_reservations(fs_info, ins.objectid);
last_alloc = ins.offset;
@@ -10428,9 +10574,9 @@ next:
if (own_trans)
btrfs_end_transaction(trans);
}
- if (cur_offset < end)
- btrfs_free_reserved_data_space(inode, NULL, cur_offset,
- end - cur_offset + 1);
+ if (clear_offset < end)
+ btrfs_free_reserved_data_space(inode, NULL, clear_offset,
+ end - clear_offset + 1);
return ret;
}
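The clear_offset bookkeeping added to __btrfs_prealloc_file_range() above ensures the cleanup path frees only the part of the range that never became a real reservation. A sketch of the accounting (reserve_chunk() and free_range() are hypothetical helpers):

/* Hypothetical helpers: reserve one chunk, free a byte range. */
static int reserve_chunk(unsigned long long off, unsigned long long *len)
{ *len = 4096; return off < 8192 ? 0 : -1; }
static void free_range(unsigned long long off, unsigned long long len)
{ (void)off; (void)len; }

static int prealloc(unsigned long long start, unsigned long long end)
{
        unsigned long long cur = start;
        unsigned long long clear = start;       /* first unreserved byte */
        int ret = 0;

        while (cur <= end) {
                unsigned long long len;

                ret = reserve_chunk(cur, &len);
                if (ret)
                        break;
                clear += len;   /* this space is now truly reserved */
                cur += len;
        }
        /* Only the tail that was never reserved must be released. */
        if (clear <= end)
                free_range(clear, end - clear + 1);
        return ret;
}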
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5b4beebf138c..524821b437d0 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -710,11 +710,17 @@ static noinline int create_subvol(struct inode *dir,
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
ret = btrfs_update_inode(trans, root, dir);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
btrfs_ino(BTRFS_I(dir)), index, name, namelen);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
ret = btrfs_uuid_tree_add(trans, root_item->uuid,
BTRFS_UUID_KEY_SUBVOL, objectid);
@@ -1366,8 +1372,7 @@ again:
unlock_page(pages[i]);
put_page(pages[i]);
}
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
- false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return i_done;
out:
@@ -1378,8 +1383,7 @@ out:
btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
page_cnt << PAGE_SHIFT, true);
- btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT,
- true);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), page_cnt << PAGE_SHIFT);
extent_changeset_free(data_reserved);
return ret;
@@ -3237,6 +3241,7 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
+ const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
int ret;
/*
@@ -3244,7 +3249,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
* source range to serialize with relocation.
*/
btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
- ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1);
+ ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
return ret;
@@ -4376,7 +4381,19 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
&sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
0);
- if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
+ /*
+ * Copy scrub args to user space even if btrfs_scrub_dev() returned an
+ * error. This is important as it allows user space to know how much
+ * progress scrub has done. For example, if scrub is canceled we get
+ * -ECANCELED from btrfs_scrub_dev() and return that error back to user
+ * space. Later user space can inspect the progress from the structure
+ * btrfs_ioctl_scrub_args and resume scrub from where it left off
+ * previously (btrfs-progs does this).
+ * If we fail to copy the btrfs_ioctl_scrub_args structure to user space,
+ * return -EFAULT to signal that the structure was not copied, or that it
+ * may be corrupt and unreliable due to a partial copy.
+ */
+ if (copy_to_user(arg, sa, sizeof(*sa)))
ret = -EFAULT;
if (!(sa->flags & BTRFS_SCRUB_READONLY))
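The scrub ioctl now copies the progress structure back to user space even when scrub itself failed, so user space can resume from where it stopped. A sketch of the copy-out-regardless pattern, with copy_out() standing in for copy_to_user():

#include <errno.h>
#include <string.h>

struct progress { unsigned long long done; };

/* Stand-in for copy_to_user(): returns 0 on success. */
static int copy_out(struct progress *dst, const struct progress *src)
{ memcpy(dst, src, sizeof(*src)); return 0; }

static int scrub_ioctl(struct progress *user, struct progress *kern, int ret)
{
        /*
         * Copy the progress out unconditionally; a failed copy overrides
         * the scrub status because the caller cannot trust a partially
         * copied structure.
         */
        if (copy_out(user, kern))
                ret = -EFAULT;
        return ret;
}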
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 52889da69113..6755e6966da6 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -545,7 +545,6 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr,
spin_unlock(&root->ordered_extent_lock);
btrfs_init_work(&ordered->flush_work,
- btrfs_flush_delalloc_helper,
btrfs_run_ordered_extent_work, NULL, NULL);
list_add_tail(&ordered->work_list, &works);
btrfs_queue_work(fs_info->flush_workers, &ordered->flush_work);
@@ -689,10 +688,15 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
}
btrfs_start_ordered_extent(inode, ordered, 1);
end = ordered->file_offset;
+ /*
+ * If the ordered extent had an error save the error but don't
+ * exit without waiting first for all other ordered extents in
+ * the range to complete.
+ */
if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
ret = -EIO;
btrfs_put_ordered_extent(ordered);
- if (ret || end == 0 || end == start)
+ if (end == 0 || end == start)
break;
end--;
}
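btrfs_wait_ordered_range() now latches the first I/O error but keeps waiting for the remaining ordered extents instead of bailing early. The same latch-and-continue pattern in miniature (wait_one() is a stub returning 0 or a negative error):

static int wait_one(int i) { return i == 3 ? -5 /* -EIO */ : 0; }

static int wait_all(int n)
{
        int ret = 0;

        for (int i = 0; i < n; i++) {
                int err = wait_one(i);

                /* Remember the first error but keep draining the range. */
                if (err && !ret)
                        ret = err;
        }
        return ret;
}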
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 60a00f6ca18f..297554cd03d6 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1030,6 +1030,7 @@ out_add_root:
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
qgroup_rescan_zero_tracking(fs_info);
+ fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
}
@@ -2411,8 +2412,12 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 nr_old_roots = 0;
int ret = 0;
+ /*
+ * If quotas get disabled meanwhile, the resources need to be freed and
+ * we can't just exit here.
+ */
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
- return 0;
+ goto out_free;
if (new_roots) {
if (!maybe_fs_roots(new_roots))
@@ -2619,6 +2624,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
+ bool need_rescan = false;
u32 level_size = 0;
u64 nums;
@@ -2762,6 +2768,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
goto unlock;
}
++i_qgroups;
+
+ /*
+ * If we're doing a snapshot, and adding the snapshot to a new
+ * qgroup, the numbers are guaranteed to be incorrect.
+ */
+ if (srcid)
+ need_rescan = true;
}
for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
@@ -2781,6 +2794,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
dst->rfer = src->rfer - level_size;
dst->rfer_cmpr = src->rfer_cmpr - level_size;
+
+ /* Manually tweaking numbers certainly needs a rescan */
+ need_rescan = true;
}
for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src;
@@ -2799,6 +2815,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
dst->excl = src->excl + level_size;
dst->excl_cmpr = src->excl_cmpr + level_size;
+ need_rescan = true;
}
unlock:
@@ -2806,6 +2823,8 @@ unlock:
out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (need_rescan)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
return ret;
}
@@ -3220,12 +3239,12 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_RESCAN)) {
btrfs_warn(fs_info,
- "qgroup rescan init failed, qgroup is not enabled");
+ "qgroup rescan init failed, qgroup rescan is not queued");
ret = -EINVAL;
} else if (!(fs_info->qgroup_flags &
BTRFS_QGROUP_STATUS_FLAG_ON)) {
btrfs_warn(fs_info,
- "qgroup rescan init failed, qgroup rescan is not queued");
+ "qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
}
@@ -3260,7 +3279,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
sizeof(fs_info->qgroup_rescan_progress));
fs_info->qgroup_rescan_progress.objectid = progress_objectid;
init_completion(&fs_info->qgroup_rescan_completion);
- fs_info->qgroup_rescan_running = true;
spin_unlock(&fs_info->qgroup_lock);
mutex_unlock(&fs_info->qgroup_rescan_lock);
@@ -3268,7 +3286,6 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
memset(&fs_info->qgroup_rescan_work, 0,
sizeof(fs_info->qgroup_rescan_work));
btrfs_init_work(&fs_info->qgroup_rescan_work,
- btrfs_qgroup_rescan_helper,
btrfs_qgroup_rescan_worker, NULL, NULL);
return 0;
}
@@ -3326,8 +3343,11 @@ btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info)
qgroup_rescan_zero_tracking(fs_info);
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
return 0;
}
@@ -3363,9 +3383,13 @@ int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
void
btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
{
- if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN)
+ if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+ }
}
/*
@@ -4003,3 +4027,16 @@ out:
}
return ret;
}
+
+void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
+{
+ struct btrfs_qgroup_extent_record *entry;
+ struct btrfs_qgroup_extent_record *next;
+ struct rb_root *root;
+
+ root = &trans->delayed_refs.dirty_extent_root;
+ rbtree_postorder_for_each_entry_safe(entry, next, root, node) {
+ ulist_free(entry->old_roots);
+ kfree(entry);
+ }
+}
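The new btrfs_qgroup_destroy_extent_records() relies on rbtree_postorder_for_each_entry_safe(), which visits children before parents so each node can be freed as it is visited. A userspace analog on a plain binary tree:

#include <stdlib.h>

struct node {
        struct node *left, *right;
        void *payload;  /* analog of entry->old_roots */
};

/* Post-order: free both subtrees before the node itself. */
static void destroy(struct node *n)
{
        if (!n)
                return;
        destroy(n->left);
        destroy(n->right);
        free(n->payload);
        free(n);
}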
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 46ba7bd2961c..17e8ac992c50 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -414,5 +414,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
u64 last_snapshot);
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
+void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index f3d0576dd327..16c8af21b3fb 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -174,7 +174,7 @@ static void scrub_parity_work(struct btrfs_work *work);
static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
{
- btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL);
+ btrfs_init_work(&rbio->work, work_func, NULL, NULL);
btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
}
@@ -1727,8 +1727,7 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
plug = container_of(cb, struct btrfs_plug_cb, cb);
if (from_schedule) {
- btrfs_init_work(&plug->work, btrfs_rmw_helper,
- unplug_work, NULL, NULL);
+ btrfs_init_work(&plug->work, unplug_work, NULL, NULL);
btrfs_queue_work(plug->info->rmw_workers,
&plug->work);
return;
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index bb5bd49573b4..6f5397e018cb 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -722,21 +722,19 @@ static int reada_start_machine_dev(struct btrfs_device *dev)
static void reada_start_machine_worker(struct btrfs_work *work)
{
struct reada_machine_work *rmw;
- struct btrfs_fs_info *fs_info;
int old_ioprio;
rmw = container_of(work, struct reada_machine_work, work);
- fs_info = rmw->fs_info;
-
- kfree(rmw);
old_ioprio = IOPRIO_PRIO_VALUE(task_nice_ioclass(current),
task_nice_ioprio(current));
set_task_ioprio(current, BTRFS_IOPRIO_READA);
- __reada_start_machine(fs_info);
+ __reada_start_machine(rmw->fs_info);
set_task_ioprio(current, old_ioprio);
- atomic_dec(&fs_info->reada_works_cnt);
+ atomic_dec(&rmw->fs_info->reada_works_cnt);
+
+ kfree(rmw);
}
static void __reada_start_machine(struct btrfs_fs_info *fs_info)
@@ -791,8 +789,7 @@ static void reada_start_machine(struct btrfs_fs_info *fs_info)
/* FIXME we cannot handle this properly right now */
BUG();
}
- btrfs_init_work(&rmw->work, btrfs_readahead_helper,
- reada_start_machine_worker, NULL, NULL);
+ btrfs_init_work(&rmw->work, reada_start_machine_worker, NULL, NULL);
rmw->fs_info = fs_info;
btrfs_queue_work(fs_info->readahead_workers, &rmw->work);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index b57f3618e58e..9a2f15f4c80e 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -286,6 +286,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
exist_re = insert_root_entry(&exist->roots, re);
if (exist_re)
kfree(re);
+ } else {
+ kfree(re);
}
kfree(be);
return exist;
@@ -744,6 +746,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
*/
be = add_block_entry(fs_info, bytenr, num_bytes, ref_root);
if (IS_ERR(be)) {
+ kfree(ref);
kfree(ra);
ret = PTR_ERR(be);
goto out;
@@ -757,6 +760,8 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"re-allocated a block that still has references to it!");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ kfree(ref);
+ kfree(ra);
goto out_unlock;
}
@@ -819,6 +824,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"dropping a ref for a existing root that doesn't have a ref on the block");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ kfree(ref);
kfree(ra);
goto out_unlock;
}
@@ -834,6 +840,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
"attempting to add another ref for an existing ref on a tree block");
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
+ kfree(ref);
kfree(ra);
goto out_unlock;
}
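The ref-verify fixes above plug leaks by freeing both the ref and the ref_action on every bail-out. A compact sketch of the usual goto-based cleanup that makes such paths hard to miss (record_fails() is a stub):

#include <stdlib.h>

static int record_fails(void) { return 1; }     /* stub failure */

static int track_ref(void)
{
        void *ref = malloc(32);
        void *ra = malloc(32);
        int ret = 0;

        if (!ref || !ra) {
                ret = -12;      /* -ENOMEM */
                goto out;
        }
        if (record_fails()) {
                ret = -22;      /* -EINVAL */
                goto out;       /* the same path frees both objects */
        }
        /*
         * This sketch always releases; the real code hands the objects
         * off on success and only reaches the frees on its error paths.
         */
out:
        free(ra);
        free(ref);
        return ret;
}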
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 4ac240c18cbd..4147b569c64c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -527,8 +527,8 @@ static int should_ignore_root(struct btrfs_root *root)
if (!reloc_root)
return 0;
- if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
- root->fs_info->running_transaction->transid - 1)
+ if (btrfs_header_generation(reloc_root->commit_root) ==
+ root->fs_info->running_transaction->transid)
return 0;
/*
* if there is reloc tree and it was created in previous
@@ -1152,7 +1152,7 @@ out:
free_backref_node(cache, lower);
}
- free_backref_node(cache, node);
+ remove_backref_node(cache, node);
return ERR_PTR(err);
}
ASSERT(!node || !node->detached);
@@ -1264,7 +1264,7 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
if (!node)
return -ENOMEM;
- node->bytenr = root->node->start;
+ node->bytenr = root->commit_root->start;
node->data = root;
spin_lock(&rc->reloc_root_tree.lock);
@@ -1295,10 +1295,11 @@ static void __del_reloc_root(struct btrfs_root *root)
if (rc && root->node) {
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
- root->node->start);
+ root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+ RB_CLEAR_NODE(&node->rb_node);
}
spin_unlock(&rc->reloc_root_tree.lock);
if (!node)
@@ -1316,7 +1317,7 @@ static void __del_reloc_root(struct btrfs_root *root)
* helper to update the 'address of tree root -> reloc tree'
* mapping
*/
-static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
+static int __update_reloc_root(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct rb_node *rb_node;
@@ -1325,7 +1326,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
- root->node->start);
+ root->commit_root->start);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
@@ -1337,7 +1338,7 @@ static int __update_reloc_root(struct btrfs_root *root, u64 new_bytenr)
BUG_ON((struct btrfs_root *)node->data != root);
spin_lock(&rc->reloc_root_tree.lock);
- node->bytenr = new_bytenr;
+ node->bytenr = root->node->start;
rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
@@ -1491,6 +1492,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
}
if (reloc_root->commit_root != reloc_root->node) {
+ __update_reloc_root(reloc_root);
btrfs_set_root_node(root_item, reloc_root->node);
free_extent_buffer(reloc_root->commit_root);
reloc_root->commit_root = btrfs_root_node(reloc_root);
@@ -2480,12 +2482,10 @@ again:
reloc_root = list_entry(reloc_roots.next,
struct btrfs_root, root_list);
+ root = read_fs_root(fs_info, reloc_root->root_key.offset);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
- root = read_fs_root(fs_info,
- reloc_root->root_key.offset);
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
-
ret = merge_reloc_root(rc, root);
if (ret) {
if (list_empty(&reloc_root->root_list))
@@ -2494,6 +2494,13 @@ again:
goto out;
}
} else {
+ if (!IS_ERR(root)) {
+ if (root->reloc_root == reloc_root)
+ root->reloc_root = NULL;
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
+ &root->state);
+ }
+
list_del_init(&reloc_root->root_list);
/* Don't forget to queue this reloc root for cleanup */
list_add_tail(&reloc_root->reloc_dirty_list,
@@ -2519,7 +2526,21 @@ out:
free_reloc_roots(&reloc_roots);
}
- BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+ /*
+ * We used to have
+ *
+ * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+ *
+ * here, but it's wrong. If we fail to start the transaction in
+ * prepare_to_merge() we will have only 0 ref reloc roots, none of which
+ * have actually been removed from the reloc_root_tree rb tree. This is
+ * fine because we're bailing here, and we hold a reference on the root
+ * for the list that holds it, so these roots will be cleaned up when we
+ * do the reloc_dirty_list afterwards. Meanwhile the root->reloc_root
+ * will be cleaned up on unmount.
+ *
+ * The remaining nodes will be cleaned up by free_reloc_control.
+ */
}
static void free_block_list(struct rb_root *blocks)
@@ -3119,9 +3140,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
if (ret < 0) {
- if (ret != -EAGAIN || &block->rb_node == rb_first(blocks))
- err = ret;
- goto out;
+ err = ret;
+ break;
}
}
out:
@@ -3276,7 +3296,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
btrfs_delalloc_release_metadata(BTRFS_I(inode),
PAGE_SIZE, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE, true);
+ PAGE_SIZE);
ret = -ENOMEM;
goto out;
}
@@ -3297,7 +3317,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
btrfs_delalloc_release_metadata(BTRFS_I(inode),
PAGE_SIZE, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE, true);
+ PAGE_SIZE);
ret = -EIO;
goto out;
}
@@ -3326,7 +3346,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
btrfs_delalloc_release_metadata(BTRFS_I(inode),
PAGE_SIZE, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
- PAGE_SIZE, true);
+ PAGE_SIZE);
clear_extent_bits(&BTRFS_I(inode)->io_tree,
page_start, page_end,
@@ -3342,8 +3362,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
put_page(page);
index++;
- btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE,
- false);
+ btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
balance_dirty_pages_ratelimited(inode->i_mapping);
btrfs_throttle(fs_info);
}
@@ -4098,12 +4117,6 @@ restart:
if (!RB_EMPTY_ROOT(&blocks)) {
ret = relocate_tree_blocks(trans, rc, &blocks);
if (ret < 0) {
- /*
- * if we fail to relocate tree blocks, force to update
- * backref cache when committing transaction.
- */
- rc->backref_cache.last_trans = trans->transid - 1;
-
if (ret != -EAGAIN) {
err = ret;
break;
@@ -4554,6 +4567,8 @@ int btrfs_recover_relocation(struct btrfs_root *root)
fs_root = read_fs_root(fs_info, reloc_root->root_key.offset);
if (IS_ERR(fs_root)) {
err = PTR_ERR(fs_root);
+ list_add_tail(&reloc_root->root_list, &reloc_roots);
+ btrfs_end_transaction(trans);
goto out_free;
}
@@ -4668,11 +4683,6 @@ int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
- if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
- if (buf == root->node)
- __update_reloc_root(root, cow->start);
- }
-
level = btrfs_header_level(buf);
if (btrfs_header_generation(buf) <=
btrfs_root_last_snapshot(&root->root_item))
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 22124122728c..24a581030840 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -374,11 +374,13 @@ again:
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
-
- WARN_ON(btrfs_root_ref_dirid(leaf, ref) != dirid);
- WARN_ON(btrfs_root_ref_name_len(leaf, ref) != name_len);
ptr = (unsigned long)(ref + 1);
- WARN_ON(memcmp_extent_buffer(leaf, name, ptr, name_len));
+ if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
+ (btrfs_root_ref_name_len(leaf, ref) != name_len) ||
+ memcmp_extent_buffer(leaf, name, ptr, name_len)) {
+ err = -ENOENT;
+ goto out;
+ }
*sequence = btrfs_root_ref_sequence(leaf, ref);
ret = btrfs_del_item(trans, tree_root, path);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f7b29f9db5e2..ba4247349ed4 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -596,8 +596,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
sbio->index = i;
sbio->sctx = sctx;
sbio->page_count = 0;
- btrfs_init_work(&sbio->work, btrfs_scrub_helper,
- scrub_bio_end_io_worker, NULL, NULL);
+ btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL,
+ NULL);
if (i != SCRUB_BIOS_PER_SCTX - 1)
sctx->bios[i]->next_free = i + 1;
@@ -1718,8 +1718,7 @@ static void scrub_wr_bio_end_io(struct bio *bio)
sbio->status = bio->bi_status;
sbio->bio = bio;
- btrfs_init_work(&sbio->work, btrfs_scrubwrc_helper,
- scrub_wr_bio_end_io_worker, NULL, NULL);
+ btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);
btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
}
@@ -2136,14 +2135,13 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work)
scrub_write_block_to_dev_replace(sblock);
}
- scrub_block_put(sblock);
-
if (sctx->is_dev_replace && sctx->flush_all_writes) {
mutex_lock(&sctx->wr_lock);
scrub_wr_submit(sctx);
mutex_unlock(&sctx->wr_lock);
}
+ scrub_block_put(sblock);
scrub_pending_bio_dec(sctx);
}
@@ -2191,8 +2189,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock)
raid56_add_scrub_pages(rbio, spage->page, spage->logical);
}
- btrfs_init_work(&sblock->work, btrfs_scrub_helper,
- scrub_missing_raid56_worker, NULL, NULL);
+ btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL);
scrub_block_get(sblock);
scrub_pending_bio_inc(sctx);
raid56_submit_missing_rbio(rbio);
@@ -2730,8 +2727,8 @@ static void scrub_parity_bio_endio(struct bio *bio)
bio_put(bio);
- btrfs_init_work(&sparity->work, btrfs_scrubparity_helper,
- scrub_parity_bio_endio_worker, NULL, NULL);
+ btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL,
+ NULL);
btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work);
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index d25271381c56..577a3a6e5bed 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -23,6 +23,15 @@
#include "btrfs_inode.h"
#include "transaction.h"
#include "compression.h"
+#include "xattr.h"
+
+/*
+ * Maximum number of references an extent can have in order for us to attempt to
+ * issue clone operations instead of write operations. This currently exists to
+ * avoid hitting limitations of the backreference walking code (taking a lot of
+ * time and using too much memory for extents with a large number of references).
+ */
+#define SEND_MAX_EXTENT_REFS 64
/*
* A fs_path is a helper to dynamically build path names with unknown size.
@@ -1287,6 +1296,7 @@ static int find_extent_clone(struct send_ctx *sctx,
struct clone_root *cur_clone_root;
struct btrfs_key found_key;
struct btrfs_path *tmp_path;
+ struct btrfs_extent_item *ei;
int compressed;
u32 i;
@@ -1334,7 +1344,6 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = extent_from_logical(fs_info, disk_byte, tmp_path,
&found_key, &flags);
up_read(&fs_info->commit_root_sem);
- btrfs_release_path(tmp_path);
if (ret < 0)
goto out;
@@ -1343,6 +1352,21 @@ static int find_extent_clone(struct send_ctx *sctx,
goto out;
}
+ ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
+ struct btrfs_extent_item);
+ /*
+ * Backreference walking (iterate_extent_inodes() below) is currently
+ * too expensive when an extent has a large number of references, both
+ * in time spent and used memory. So for now just fall back to write
+ * operations instead of clone operations when an extent has more than
+ * a certain amount of references.
+ */
+ if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
+ ret = -ENOENT;
+ goto out;
+ }
+ btrfs_release_path(tmp_path);
+
/*
* Setup the clone roots.
*/
@@ -4498,6 +4522,10 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
struct fs_path *p;
struct posix_acl_xattr_header dummy_acl;
+ /* Capabilities are emitted by finish_inode_if_needed */
+ if (!strncmp(name, XATTR_NAME_CAPS, name_len))
+ return 0;
+
p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -5060,6 +5088,64 @@ static int send_extent_data(struct send_ctx *sctx,
return 0;
}
+/*
+ * Search for a capability xattr related to sctx->cur_ino. If the capability is
+ * found, call send_set_xattr() to emit it.
+ *
+ * Return 0 if there isn't a capability, or when the capability was emitted
+ * successfully, or < 0 if an error occurred.
+ */
+static int send_capabilities(struct send_ctx *sctx)
+{
+ struct fs_path *fspath = NULL;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *di;
+ struct extent_buffer *leaf;
+ unsigned long data_ptr;
+ char *buf = NULL;
+ int buf_len;
+ int ret = 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino,
+ XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
+ if (!di) {
+ /* There is no xattr for this inode */
+ goto out;
+ } else if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ buf_len = btrfs_dir_data_len(leaf, di);
+
+ fspath = fs_path_alloc();
+ buf = kmalloc(buf_len, GFP_KERNEL);
+ if (!fspath || !buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
+ read_extent_buffer(leaf, buf, data_ptr, buf_len);
+
+ ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
+ strlen(XATTR_NAME_CAPS), buf, buf_len);
+out:
+ kfree(buf);
+ fs_path_free(fspath);
+ btrfs_free_path(path);
+ return ret;
+}
+
static int clone_range(struct send_ctx *sctx,
struct clone_root *clone_root,
const u64 disk_byte,
@@ -5070,7 +5156,7 @@ static int clone_range(struct send_ctx *sctx,
struct btrfs_path *path;
struct btrfs_key key;
int ret;
- u64 clone_src_i_size;
+ u64 clone_src_i_size = 0;
/*
* Prevent cloning from a zero offset with a length matching the sector
@@ -5963,6 +6049,10 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
}
+ ret = send_capabilities(sctx);
+ if (ret < 0)
+ goto out;
+
/*
* If other directory inodes depended on our current directory
* inode's move/rename, now do their move/rename operations.
@@ -6678,12 +6768,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
spin_unlock(&send_root->root_item_lock);
/*
- * This is done when we lookup the root, it should already be complete
- * by the time we get here.
- */
- WARN_ON(send_root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE);
-
- /*
* Userspace tools do the checks and warn the user if it's
* not RO.
*/
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 0645ec428b4f..0050eb146b96 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -432,6 +432,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
char *compress_type;
bool compress_force = false;
enum btrfs_compression_type saved_compress_type;
+ int saved_compress_level;
bool saved_compress_force;
int no_compress = 0;
@@ -514,6 +515,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
info->compress_type : BTRFS_COMPRESS_NONE;
saved_compress_force =
btrfs_test_opt(info, FORCE_COMPRESS);
+ saved_compress_level = info->compress_level;
if (token == Opt_compress ||
token == Opt_compress_force ||
strncmp(args[0].from, "zlib", 4) == 0) {
@@ -558,6 +560,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
no_compress = 0;
} else if (strncmp(args[0].from, "no", 2) == 0) {
compress_type = "no";
+ info->compress_level = 0;
+ info->compress_type = 0;
btrfs_clear_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
compress_force = false;
@@ -578,11 +582,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
*/
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
- if ((btrfs_test_opt(info, COMPRESS) &&
- (info->compress_type != saved_compress_type ||
- compress_force != saved_compress_force)) ||
- (!btrfs_test_opt(info, COMPRESS) &&
- no_compress == 1)) {
+ if (no_compress == 1) {
+ btrfs_info(info, "use no compression");
+ } else if ((info->compress_type != saved_compress_type) ||
+ (compress_force != saved_compress_force) ||
+ (info->compress_level != saved_compress_level)) {
btrfs_info(info, "%s %s compression, level %d",
(compress_force) ? "force" : "use",
compress_type, info->compress_level);
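Given the format strings above, remounting with, say, -o compress=zstd:3 after a mount with -o compress=zlib should now log "use zstd compression, level 3"; a level-only change is reported too, since saved_compress_level is part of the comparison, and -o compress=no logs just "use no compression" (the example option values are illustrative).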
@@ -1803,6 +1807,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
}
if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+ btrfs_warn(fs_info,
+ "mount required to replay tree-log, cannot remount read-write");
ret = -EINVAL;
goto restore;
}
@@ -2113,7 +2119,15 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
*/
thresh = SZ_4M;
- if (!mixed && total_free_meta - thresh < block_rsv->size)
+ /*
+ * We only want to claim there's no available space if we can no longer
+ * allocate chunks for our metadata profile and our global reserve will
+ * not fit in the free metadata space. If we aren't ->full then we
+ * can still allocate chunks and thus are fine using the currently
+ * calculated f_bavail.
+ */
+ if (!mixed && block_rsv->space_info->full &&
+ total_free_meta - thresh < block_rsv->size)
buf->f_bavail = 0;
buf->f_type = BTRFS_SUPER_MAGIC;
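As an illustrative, standalone restatement of the new condition (the helper name and the numbers are assumptions for the example, not kernel API):

#include <stdbool.h>
#include <stdint.h>

/*
 * Mirror of the statfs decision above: claim zero available space only
 * when no more metadata chunks can be allocated (the space_info is
 * full) and the global reserve no longer fits in the free metadata
 * space. E.g. with a 4M threshold, 10M free metadata and an 8M
 * reserve: 10M - 4M = 6M < 8M, so f_bavail would be reported as 0.
 */
static bool report_no_space(bool mixed, bool meta_full,
                            uint64_t total_free_meta, uint64_t thresh,
                            uint64_t rsv_size)
{
        return !mixed && meta_full && total_free_meta - thresh < rsv_size;
}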
@@ -2255,9 +2269,7 @@ static int btrfs_unfreeze(struct super_block *sb)
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
- struct btrfs_fs_devices *cur_devices;
struct btrfs_device *dev, *first_dev = NULL;
- struct list_head *head;
/*
* Lightweight locking of the devices. We should not need
@@ -2267,18 +2279,13 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
* least until the rcu_read_unlock.
*/
rcu_read_lock();
- cur_devices = fs_info->fs_devices;
- while (cur_devices) {
- head = &cur_devices->devices;
- list_for_each_entry_rcu(dev, head, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
- continue;
- if (!dev->name)
- continue;
- if (!first_dev || dev->devid < first_dev->devid)
- first_dev = dev;
- }
- cur_devices = cur_devices->seed;
+ list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
+ continue;
+ if (!dev->name)
+ continue;
+ if (!first_dev || dev->devid < first_dev->devid)
+ first_dev = dev;
}
if (first_dev)
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 41f41540382e..69971540797e 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -117,7 +117,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
spin_lock_init(&fs_info->qgroup_lock);
spin_lock_init(&fs_info->super_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
- spin_lock_init(&fs_info->tree_mod_seq_lock);
mutex_init(&fs_info->qgroup_ioctl_lock);
mutex_init(&fs_info->qgroup_rescan_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 7bf4d5734dbe..b9c7a92bd1cc 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -432,6 +432,114 @@ out:
return ret;
}
+static int test_find_first_clear_extent_bit(void)
+{
+ struct extent_io_tree tree;
+ u64 start, end;
+ int ret = -EINVAL;
+
+ test_msg("running find_first_clear_extent_bit test");
+
+ extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL);
+
+ /* Test correct handling of empty tree */
+ find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED);
+ if (start != 0 || end != -1) {
+ test_err(
+ "error getting a range from completely empty tree: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+ /*
+ * Set 1M-4M as allocated/discarded; 32M-64M is added further below,
+ * leaving a hole between 4M-32M
+ */
+ set_extent_bits(&tree, SZ_1M, SZ_4M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ find_first_clear_extent_bit(&tree, SZ_512K, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ if (start != 0 || end != SZ_1M - 1) {
+ test_err("error finding beginning range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ /* Now add 32M-64M so that we have a hole between 4M-32M */
+ set_extent_bits(&tree, SZ_32M, SZ_64M - 1,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ /*
+ * Request first hole starting at 12M, we should get 4M-32M
+ */
+ find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ if (start != SZ_4M || end != SZ_32M - 1) {
+ test_err("error finding trimmed range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ /*
+ * Search in the middle of allocated range, should get the next one
+ * available, which happens to be unallocated -> 4M-32M
+ */
+ find_first_clear_extent_bit(&tree, SZ_2M, &start, &end,
+ CHUNK_TRIMMED | CHUNK_ALLOCATED);
+
+ if (start != SZ_4M || end != SZ_32M - 1) {
+ test_err("error finding next unalloc range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ /*
+ * Set 64M-72M with the CHUNK_ALLOCATED flag, then search for the
+ * CHUNK_TRIMMED flag being unset in this range; we should get the
+ * entry in range 64M-72M
+ */
+ set_extent_bits(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED);
+ find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end,
+ CHUNK_TRIMMED);
+
+ if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) {
+ test_err("error finding exact range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end,
+ CHUNK_TRIMMED);
+
+ /*
+ * Search in the middle of set range whose immediate neighbour doesn't
+ * have the bits set so it must be returned
+ */
+ if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) {
+ test_err("error finding next alloc range: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ /*
+ * Search beyond any known range; it shall return the area after the
+ * last known range, with end == -1
+ */
+ find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED);
+ if (start != SZ_64M + SZ_8M || end != -1) {
+ test_err(
+ "error handling beyond end of range search: start %llu end %llu",
+ start, end);
+ goto out;
+ }
+
+ ret = 0;
+out:
+ clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED);
+ return ret;
+}
+
int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
{
int ret;
@@ -442,6 +550,10 @@ int btrfs_test_extent_io(u32 sectorsize, u32 nodesize)
if (ret)
goto out;
+ ret = test_find_first_clear_extent_bit();
+ if (ret)
+ goto out;
+
ret = test_eb_bitmaps(sectorsize, nodesize);
out:
return ret;
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
index a90dad166971..4a22cb981a61 100644
--- a/fs/btrfs/tests/free-space-tree-tests.c
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -462,9 +462,9 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
root->fs_info->tree_root = root;
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
- if (!root->node) {
+ if (IS_ERR(root->node)) {
test_std_err(TEST_ALLOC_EXTENT_BUFFER);
- ret = -ENOMEM;
+ ret = PTR_ERR(root->node);
goto out;
}
btrfs_set_header_level(root->node, 0);
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 09aaca1efd62..ac035a6fa003 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -484,9 +484,9 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
* *cough*backref walking code*cough*
*/
root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
- if (!root->node) {
+ if (IS_ERR(root->node)) {
test_err("couldn't allocate dummy buffer");
- ret = -ENOMEM;
+ ret = PTR_ERR(root->node);
goto out;
}
btrfs_set_header_level(root->node, 0);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2db14fdd6bff..553c6631ba8b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -49,6 +49,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
BUG_ON(!list_empty(&transaction->list));
WARN_ON(!RB_EMPTY_ROOT(
&transaction->delayed_refs.href_root.rb_root));
+ WARN_ON(!RB_EMPTY_ROOT(
+ &transaction->delayed_refs.dirty_extent_root));
if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info,
"pending csums is %llu",
@@ -566,10 +568,19 @@ again:
}
got_it:
- btrfs_record_root_in_trans(h, root);
-
if (!current->journal_info)
current->journal_info = h;
+
+ /*
+ * btrfs_record_root_in_trans() needs to alloc new extents, and may
+ * call btrfs_join_transaction() while we're also starting a
+ * transaction.
+ *
+ * Thus it needs to be called after current->journal_info is
+ * initialized, or we can deadlock.
+ */
+ btrfs_record_root_in_trans(h, root);
+
return h;
join_fail:
@@ -1929,6 +1940,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
struct btrfs_transaction *prev_trans = NULL;
int ret;
+ /*
+ * Some places just start a transaction to commit it. We need to make
+ * sure that if this commit fails, the abort code actually marks the
+ * transaction as failed, so set trans->dirty to make the abort code do
+ * the right thing.
+ */
+ trans->dirty = true;
+
/* Stop the commit early if ->aborted is set */
if (unlikely(READ_ONCE(cur_trans->aborted))) {
ret = cur_trans->aborted;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 9634cae1e1b1..d41a566594cf 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -243,7 +243,7 @@ static int check_extent_data_item(struct extent_buffer *leaf,
}
static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
- int slot)
+ int slot, struct btrfs_key *prev_key)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
u32 sectorsize = fs_info->sectorsize;
@@ -267,6 +267,20 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key,
btrfs_item_size_nr(leaf, slot), csumsize);
return -EUCLEAN;
}
+ if (slot > 0 && prev_key->type == BTRFS_EXTENT_CSUM_KEY) {
+ u64 prev_csum_end;
+ u32 prev_item_size;
+
+ prev_item_size = btrfs_item_size_nr(leaf, slot - 1);
+ prev_csum_end = (prev_item_size / csumsize) * sectorsize;
+ prev_csum_end += prev_key->offset;
+ if (prev_csum_end > key->offset) {
+ generic_err(leaf, slot - 1,
+"csum end range (%llu) goes beyond the start range (%llu) of the next csum item",
+ prev_csum_end, key->offset);
+ return -EUCLEAN;
+ }
+ }
return 0;
}
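A worked example of the overlap arithmetic, as a self-contained userspace check; the 4-byte crc32c checksum size and 4K sector size are assumptions for illustration:

#include <assert.h>
#include <stdint.h>

/*
 * End of the data range covered by a csum item: every csumsize bytes
 * of the item checksum one sector of data, as in check_csum_item().
 */
static uint64_t csum_end(uint64_t item_offset, uint32_t item_size,
                         uint32_t csumsize, uint32_t sectorsize)
{
        return item_offset + (uint64_t)(item_size / csumsize) * sectorsize;
}

int main(void)
{
        /* A 64-byte csum item at offset 0 covers 16 sectors = 64K. */
        assert(csum_end(0, 64, 4, 4096) == 64 * 1024);
        /* A next csum item starting below 64K would overlap -> -EUCLEAN. */
        return 0;
}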
@@ -686,9 +700,7 @@ static void dev_item_err(const struct extent_buffer *eb, int slot,
static int check_dev_item(struct extent_buffer *leaf,
struct btrfs_key *key, int slot)
{
- struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_dev_item *ditem;
- u64 max_devid = max(BTRFS_MAX_DEVS(fs_info), BTRFS_MAX_DEVS_SYS_CHUNK);
if (key->objectid != BTRFS_DEV_ITEMS_OBJECTID) {
dev_item_err(leaf, slot,
@@ -696,12 +708,6 @@ static int check_dev_item(struct extent_buffer *leaf,
key->objectid, BTRFS_DEV_ITEMS_OBJECTID);
return -EUCLEAN;
}
- if (key->offset > max_devid) {
- dev_item_err(leaf, slot,
- "invalid devid: has=%llu expect=[0, %llu]",
- key->offset, max_devid);
- return -EUCLEAN;
- }
ditem = btrfs_item_ptr(leaf, slot, struct btrfs_dev_item);
if (btrfs_device_id(leaf, ditem) != key->offset) {
dev_item_err(leaf, slot,
@@ -925,7 +931,7 @@ static int check_leaf_item(struct extent_buffer *leaf,
ret = check_extent_data_item(leaf, key, slot, prev_key);
break;
case BTRFS_EXTENT_CSUM_KEY:
- ret = check_csum_item(leaf, key, slot);
+ ret = check_csum_item(leaf, key, slot, prev_key);
break;
case BTRFS_DIR_ITEM_KEY:
case BTRFS_DIR_INDEX_KEY:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index c6bafb8b5f42..e2e361d09959 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -807,7 +807,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_del_csums(trans, fs_info,
+ ret = btrfs_del_csums(trans,
+ fs_info->csum_root,
sums->bytenr,
sums->len);
if (!ret)
@@ -3927,10 +3928,32 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
return 0;
}
+static int log_csums(struct btrfs_trans_handle *trans,
+ struct btrfs_root *log_root,
+ struct btrfs_ordered_sum *sums)
+{
+ int ret;
+
+ /*
+ * Due to extent cloning, we might have logged a csum item that covers a
+ * subrange of a cloned extent, and later we can end up logging a csum
+ * item for a larger subrange of the same extent or the entire range.
+ * This would leave csum items in the log tree that cover the same range
+ * and break the searches for checksums in the log tree, resulting in
+ * some checksums missing in the fs/subvolume tree. So just delete (or
+ * trim and adjust) any existing csum items in the log for this range.
+ */
+ ret = btrfs_del_csums(trans, log_root, sums->bytenr, sums->len);
+ if (ret)
+ return ret;
+
+ return btrfs_csum_file_blocks(trans, log_root, sums);
+}
+
static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode,
struct btrfs_path *dst_path,
- struct btrfs_path *src_path, u64 *last_extent,
+ struct btrfs_path *src_path,
int start_slot, int nr, int inode_only,
u64 logged_isize)
{
@@ -3941,7 +3964,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item *extent;
struct btrfs_inode_item *inode_item;
struct extent_buffer *src = src_path->nodes[0];
- struct btrfs_key first_key, last_key, key;
int ret;
struct btrfs_key *ins_keys;
u32 *ins_sizes;
@@ -3949,9 +3971,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
int i;
struct list_head ordered_sums;
int skip_csum = inode->flags & BTRFS_INODE_NODATASUM;
- bool has_extents = false;
- bool need_find_last_extent = true;
- bool done = false;
INIT_LIST_HEAD(&ordered_sums);
@@ -3960,8 +3979,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
if (!ins_data)
return -ENOMEM;
- first_key.objectid = (u64)-1;
-
ins_sizes = (u32 *)ins_data;
ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
@@ -3982,9 +3999,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src_offset = btrfs_item_ptr_offset(src, start_slot + i);
- if (i == nr - 1)
- last_key = ins_keys[i];
-
if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
inode_item = btrfs_item_ptr(dst_path->nodes[0],
dst_path->slots[0],
@@ -3998,20 +4012,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
src_offset, ins_sizes[i]);
}
- /*
- * We set need_find_last_extent here in case we know we were
- * processing other items and then walk into the first extent in
- * the inode. If we don't hit an extent then nothing changes,
- * we'll do the last search the next time around.
- */
- if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) {
- has_extents = true;
- if (first_key.objectid == (u64)-1)
- first_key = ins_keys[i];
- } else {
- need_find_last_extent = false;
- }
-
/* take a reference on file data extents so that truncates
* or deletes of this inode don't have to relog the inode
* again
@@ -4049,11 +4049,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
fs_info->csum_root,
ds + cs, ds + cs + cl - 1,
&ordered_sums, 0);
- if (ret) {
- btrfs_release_path(dst_path);
- kfree(ins_data);
- return ret;
- }
+ if (ret)
+ break;
}
}
}
@@ -4066,178 +4063,16 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* we have to do this after the loop above to avoid changing the
* log tree while trying to change the log tree.
*/
- ret = 0;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_csum_file_blocks(trans, log, sums);
+ ret = log_csums(trans, log, sums);
list_del(&sums->list);
kfree(sums);
}
- if (!has_extents)
- return ret;
-
- if (need_find_last_extent && *last_extent == first_key.offset) {
- /*
- * We don't have any leafs between our current one and the one
- * we processed before that can have file extent items for our
- * inode (and have a generation number smaller than our current
- * transaction id).
- */
- need_find_last_extent = false;
- }
-
- /*
- * Because we use btrfs_search_forward we could skip leaves that were
- * not modified and then assume *last_extent is valid when it really
- * isn't. So back up to the previous leaf and read the end of the last
- * extent before we go and fill in holes.
- */
- if (need_find_last_extent) {
- u64 len;
-
- ret = btrfs_prev_leaf(inode->root, src_path);
- if (ret < 0)
- return ret;
- if (ret)
- goto fill_holes;
- if (src_path->slots[0])
- src_path->slots[0]--;
- src = src_path->nodes[0];
- btrfs_item_key_to_cpu(src, &key, src_path->slots[0]);
- if (key.objectid != btrfs_ino(inode) ||
- key.type != BTRFS_EXTENT_DATA_KEY)
- goto fill_holes;
- extent = btrfs_item_ptr(src, src_path->slots[0],
- struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(src, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(src, extent);
- *last_extent = ALIGN(key.offset + len,
- fs_info->sectorsize);
- } else {
- len = btrfs_file_extent_num_bytes(src, extent);
- *last_extent = key.offset + len;
- }
- }
-fill_holes:
- /* So we did prev_leaf, now we need to move to the next leaf, but a few
- * things could have happened
- *
- * 1) A merge could have happened, so we could currently be on a leaf
- * that holds what we were copying in the first place.
- * 2) A split could have happened, and now not all of the items we want
- * are on the same leaf.
- *
- * So we need to adjust how we search for holes, we need to drop the
- * path and re-search for the first extent key we found, and then walk
- * forward until we hit the last one we copied.
- */
- if (need_find_last_extent) {
- /* btrfs_prev_leaf could return 1 without releasing the path */
- btrfs_release_path(src_path);
- ret = btrfs_search_slot(NULL, inode->root, &first_key,
- src_path, 0, 0);
- if (ret < 0)
- return ret;
- ASSERT(ret == 0);
- src = src_path->nodes[0];
- i = src_path->slots[0];
- } else {
- i = start_slot;
- }
-
- /*
- * Ok so here we need to go through and fill in any holes we may have
- * to make sure that holes are punched for those areas in case they had
- * extents previously.
- */
- while (!done) {
- u64 offset, len;
- u64 extent_end;
-
- if (i >= btrfs_header_nritems(src_path->nodes[0])) {
- ret = btrfs_next_leaf(inode->root, src_path);
- if (ret < 0)
- return ret;
- ASSERT(ret == 0);
- src = src_path->nodes[0];
- i = 0;
- need_find_last_extent = true;
- }
-
- btrfs_item_key_to_cpu(src, &key, i);
- if (!btrfs_comp_cpu_keys(&key, &last_key))
- done = true;
- if (key.objectid != btrfs_ino(inode) ||
- key.type != BTRFS_EXTENT_DATA_KEY) {
- i++;
- continue;
- }
- extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item);
- if (btrfs_file_extent_type(src, extent) ==
- BTRFS_FILE_EXTENT_INLINE) {
- len = btrfs_file_extent_ram_bytes(src, extent);
- extent_end = ALIGN(key.offset + len,
- fs_info->sectorsize);
- } else {
- len = btrfs_file_extent_num_bytes(src, extent);
- extent_end = key.offset + len;
- }
- i++;
-
- if (*last_extent == key.offset) {
- *last_extent = extent_end;
- continue;
- }
- offset = *last_extent;
- len = key.offset - *last_extent;
- ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode),
- offset, 0, 0, len, 0, len, 0, 0, 0);
- if (ret)
- break;
- *last_extent = extent_end;
- }
-
- /*
- * Check if there is a hole between the last extent found in our leaf
- * and the first extent in the next leaf. If there is one, we need to
- * log an explicit hole so that at replay time we can punch the hole.
- */
- if (ret == 0 &&
- key.objectid == btrfs_ino(inode) &&
- key.type == BTRFS_EXTENT_DATA_KEY &&
- i == btrfs_header_nritems(src_path->nodes[0])) {
- ret = btrfs_next_leaf(inode->root, src_path);
- need_find_last_extent = true;
- if (ret > 0) {
- ret = 0;
- } else if (ret == 0) {
- btrfs_item_key_to_cpu(src_path->nodes[0], &key,
- src_path->slots[0]);
- if (key.objectid == btrfs_ino(inode) &&
- key.type == BTRFS_EXTENT_DATA_KEY &&
- *last_extent < key.offset) {
- const u64 len = key.offset - *last_extent;
-
- ret = btrfs_insert_file_extent(trans, log,
- btrfs_ino(inode),
- *last_extent, 0,
- 0, len, 0, len,
- 0, 0, 0);
- *last_extent += len;
- }
- }
- }
- /*
- * Need to let the callers know we dropped the path so they should
- * re-search.
- */
- if (!ret && need_find_last_extent)
- ret = 1;
return ret;
}
@@ -4292,7 +4127,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
struct btrfs_ordered_sum,
list);
if (!ret)
- ret = btrfs_csum_file_blocks(trans, log_root, sums);
+ ret = log_csums(trans, log_root, sums);
list_del(&sums->list);
kfree(sums);
}
@@ -4403,7 +4238,10 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
const u64 i_size = i_size_read(&inode->vfs_inode);
const u64 ino = btrfs_ino(inode);
struct btrfs_path *dst_path = NULL;
- u64 last_extent = (u64)-1;
+ bool dropped_extents = false;
+ u64 truncate_offset = i_size;
+ struct extent_buffer *leaf;
+ int slot;
int ins_nr = 0;
int start_slot;
int ret;
@@ -4418,15 +4256,48 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
if (ret < 0)
goto out;
+ /*
+ * We must check if there is a prealloc extent that starts before the
+ * i_size and crosses the i_size boundary. This is to ensure later we
+ * truncate down to the end of that extent and not to the i_size, as
+ * otherwise we end up losing part of the prealloc extent after a log
+ * replay and with an implicit hole if there is another prealloc extent
+ * that starts at an offset beyond i_size.
+ */
+ ret = btrfs_previous_item(root, path, ino, BTRFS_EXTENT_DATA_KEY);
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0) {
+ struct btrfs_file_extent_item *ei;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ ei = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+
+ if (btrfs_file_extent_type(leaf, ei) ==
+ BTRFS_FILE_EXTENT_PREALLOC) {
+ u64 extent_end;
+
+ btrfs_item_key_to_cpu(leaf, &key, slot);
+ extent_end = key.offset +
+ btrfs_file_extent_num_bytes(leaf, ei);
+
+ if (extent_end > i_size)
+ truncate_offset = extent_end;
+ }
+ } else {
+ ret = 0;
+ }
+
while (true) {
- struct extent_buffer *leaf = path->nodes[0];
- int slot = path->slots[0];
+ leaf = path->nodes[0];
+ slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
if (ins_nr > 0) {
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, start_slot,
- ins_nr, 1, 0);
+ start_slot, ins_nr, 1, 0);
if (ret < 0)
goto out;
ins_nr = 0;
@@ -4450,8 +4321,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
path->slots[0]++;
continue;
}
- if (last_extent == (u64)-1) {
- last_extent = key.offset;
+ if (!dropped_extents) {
/*
* Avoid logging extent items logged in past fsync calls
* and leading to duplicate keys in the log tree.
@@ -4460,11 +4330,12 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
ret = btrfs_truncate_inode_items(trans,
root->log_root,
&inode->vfs_inode,
- i_size,
+ truncate_offset,
BTRFS_EXTENT_DATA_KEY);
} while (ret == -EAGAIN);
if (ret)
goto out;
+ dropped_extents = true;
}
if (ins_nr == 0)
start_slot = slot;
@@ -4479,7 +4350,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
}
}
if (ins_nr > 0) {
- ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ ret = copy_items(trans, inode, dst_path, path,
start_slot, ins_nr, 1, 0);
if (ret > 0)
ret = 0;
@@ -4666,13 +4537,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
if (slot >= nritems) {
if (ins_nr > 0) {
- u64 last_extent = 0;
-
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, start_slot,
- ins_nr, 1, 0);
- /* can't be 1, extent items aren't processed */
- ASSERT(ret <= 0);
+ start_slot, ins_nr, 1, 0);
if (ret < 0)
return ret;
ins_nr = 0;
@@ -4696,13 +4562,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
cond_resched();
}
if (ins_nr > 0) {
- u64 last_extent = 0;
-
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, start_slot,
- ins_nr, 1, 0);
- /* can't be 1, extent items aren't processed */
- ASSERT(ret <= 0);
+ start_slot, ins_nr, 1, 0);
if (ret < 0)
return ret;
}
@@ -4711,100 +4572,119 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans,
}
/*
- * If the no holes feature is enabled we need to make sure any hole between the
- * last extent and the i_size of our inode is explicitly marked in the log. This
- * is to make sure that doing something like:
- *
- * 1) create file with 128Kb of data
- * 2) truncate file to 64Kb
- * 3) truncate file to 256Kb
- * 4) fsync file
- * 5) <crash/power failure>
- * 6) mount fs and trigger log replay
- *
- * Will give us a file with a size of 256Kb, the first 64Kb of data match what
- * the file had in its first 64Kb of data at step 1 and the last 192Kb of the
- * file correspond to a hole. The presence of explicit holes in a log tree is
- * what guarantees that log replay will remove/adjust file extent items in the
- * fs/subvol tree.
- *
- * Here we do not need to care about holes between extents, that is already done
- * by copy_items(). We also only need to do this in the full sync path, where we
- * lookup for extents from the fs/subvol tree only. In the fast path case, we
- * lookup the list of modified extent maps and if any represents a hole, we
- * insert a corresponding extent representing a hole in the log tree.
+ * When using the NO_HOLES feature if we punched a hole that causes the
+ * deletion of entire leafs or all the extent items of the first leaf (the one
+ * that contains the inode item and references) we may end up not processing
+ * any extents, because there are no leafs with a generation matching the
+ * current transaction that have extent items for our inode. So we need to find
+ * if any holes exist and then log them. We also need to log holes after any
+ * truncate operation that changes the inode's size.
*/
-static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_inode *inode,
- struct btrfs_path *path)
+static int btrfs_log_holes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_inode *inode,
+ struct btrfs_path *path)
{
struct btrfs_fs_info *fs_info = root->fs_info;
- int ret;
struct btrfs_key key;
- u64 hole_start;
- u64 hole_size;
- struct extent_buffer *leaf;
- struct btrfs_root *log = root->log_root;
const u64 ino = btrfs_ino(inode);
const u64 i_size = i_size_read(&inode->vfs_inode);
+ u64 prev_extent_end = 0;
+ int ret;
- if (!btrfs_fs_incompat(fs_info, NO_HOLES))
+ if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0)
return 0;
key.objectid = ino;
key.type = BTRFS_EXTENT_DATA_KEY;
- key.offset = (u64)-1;
+ key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- ASSERT(ret != 0);
if (ret < 0)
return ret;
- ASSERT(path->slots[0] > 0);
- path->slots[0]--;
- leaf = path->nodes[0];
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-
- if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) {
- /* inode does not have any extents */
- hole_start = 0;
- hole_size = i_size;
- } else {
+ while (true) {
struct btrfs_file_extent_item *extent;
+ struct extent_buffer *leaf = path->nodes[0];
u64 len;
- /*
- * If there's an extent beyond i_size, an explicit hole was
- * already inserted by copy_items().
- */
- if (key.offset >= i_size)
- return 0;
+ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+ leaf = path->nodes[0];
+ }
+
+ btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+ if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
+ break;
+
+ /* We have a hole, log it. */
+ if (prev_extent_end < key.offset) {
+ const u64 hole_len = key.offset - prev_extent_end;
+
+ /*
+ * Release the path to avoid deadlocks with other code
+ * paths that search the root while holding locks on
+ * leafs from the log root.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_insert_file_extent(trans, root->log_root,
+ ino, prev_extent_end, 0,
+ 0, hole_len, 0, hole_len,
+ 0, 0, 0);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * Search for the same key again in the root. Since it's
+ * an extent item and we are holding the inode lock, the
+ * key must still exist. If it doesn't, just emit a warning
+ * and return an error to fall back to a transaction
+ * commit.
+ */
+ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+ if (ret < 0)
+ return ret;
+ if (WARN_ON(ret > 0))
+ return -ENOENT;
+ leaf = path->nodes[0];
+ }
extent = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_file_extent_item);
-
if (btrfs_file_extent_type(leaf, extent) ==
- BTRFS_FILE_EXTENT_INLINE)
- return 0;
+ BTRFS_FILE_EXTENT_INLINE) {
+ len = btrfs_file_extent_ram_bytes(leaf, extent);
+ prev_extent_end = ALIGN(key.offset + len,
+ fs_info->sectorsize);
+ } else {
+ len = btrfs_file_extent_num_bytes(leaf, extent);
+ prev_extent_end = key.offset + len;
+ }
- len = btrfs_file_extent_num_bytes(leaf, extent);
- /* Last extent goes beyond i_size, no need to log a hole. */
- if (key.offset + len > i_size)
- return 0;
- hole_start = key.offset + len;
- hole_size = i_size - hole_start;
+ path->slots[0]++;
+ cond_resched();
}
- btrfs_release_path(path);
- /* Last extent ends at i_size. */
- if (hole_size == 0)
- return 0;
+ if (prev_extent_end < i_size) {
+ u64 hole_len;
- hole_size = ALIGN(hole_size, fs_info->sectorsize);
- ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0,
- hole_size, 0, hole_size, 0, 0, 0);
- return ret;
+ btrfs_release_path(path);
+ hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize);
+ ret = btrfs_insert_file_extent(trans, root->log_root,
+ ino, prev_extent_end, 0, 0,
+ hole_len, 0, hole_len,
+ 0, 0, 0);
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
}
/*
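The loop in btrfs_log_holes() reduces to a simple invariant: walking the file extent items in offset order, any gap between the previous extent's end and the next extent's start (or i_size) is a hole. A minimal userspace sketch with made-up extents:

#include <stdint.h>
#include <stdio.h>

struct ext { uint64_t off, len; };

int main(void)
{
        /* Extents at [0, 4K) and [16K, 20K); i_size is 32K. */
        struct ext extents[] = { { 0, 4096 }, { 16384, 4096 } };
        uint64_t i_size = 32768, prev_end = 0;
        int i;

        for (i = 0; i < 2; i++) {
                if (prev_end < extents[i].off) /* hole before this extent */
                        printf("hole [%llu, %llu)\n",
                               (unsigned long long)prev_end,
                               (unsigned long long)extents[i].off);
                prev_end = extents[i].off + extents[i].len;
        }
        if (prev_end < i_size) /* trailing hole up to i_size */
                printf("hole [%llu, %llu)\n",
                       (unsigned long long)prev_end,
                       (unsigned long long)i_size);
        return 0;
}

This prints "hole [4096, 16384)" and "hole [20480, 32768)", the two ranges the function would insert as explicit hole extents in the log.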
@@ -5009,6 +4889,50 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
continue;
}
/*
+ * If the inode was already logged skip it - otherwise we can
+ * hit an infinite loop. Example:
+ *
+ * From the commit root (previous transaction) we have the
+ * following inodes:
+ *
+ * inode 257 a directory
+ * inode 258 with references "zz" and "zz_link" on inode 257
+ * inode 259 with reference "a" on inode 257
+ *
+ * And in the current (uncommitted) transaction we have:
+ *
+ * inode 257 a directory, unchanged
+ * inode 258 with references "a" and "a2" on inode 257
+ * inode 259 with reference "zz_link" on inode 257
+ * inode 261 with reference "zz" on inode 257
+ *
+ * When logging inode 261 the following infinite loop could
+ * happen if we don't skip already logged inodes:
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 261
+ * on reference "zz", and log it;
+ *
+ * - we detect inode 259 as a conflicting inode, with inode 258
+ * on reference "a", and log it;
+ *
+ * - we detect inode 258 as a conflicting inode, with inode 259
+ * on reference "zz_link", and log it - again! After this we
+ * repeat the above steps forever.
+ */
+ spin_lock(&BTRFS_I(inode)->lock);
+ /*
+ * Check the inode's logged_trans only instead of
+ * btrfs_inode_in_log(). This is because the last_log_commit of
+ * the inode is not updated when we only log that it exists
+ * and it has the full sync bit set (see btrfs_log_inode()).
+ */
+ if (BTRFS_I(inode)->logged_trans == trans->transid) {
+ spin_unlock(&BTRFS_I(inode)->lock);
+ btrfs_add_delayed_iput(inode);
+ continue;
+ }
+ spin_unlock(&BTRFS_I(inode)->lock);
+ /*
* We are safe logging the other inode without acquiring its
* lock as long as we log with the LOG_INODE_EXISTS mode. We
* are safe against concurrent renames of the other inode as
@@ -5107,7 +5031,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
- u64 last_extent = 0;
int err = 0;
int ret;
int nritems;
@@ -5285,7 +5208,7 @@ again:
ins_start_slot = path->slots[0];
}
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, ins_start_slot,
+ ins_start_slot,
ins_nr, inode_only,
logged_isize);
if (ret < 0) {
@@ -5308,17 +5231,13 @@ again:
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, ins_start_slot,
+ ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
ins_nr = 0;
- if (ret) {
- btrfs_release_path(path);
- continue;
- }
goto next_slot;
}
@@ -5331,18 +5250,13 @@ again:
goto next_slot;
}
- ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
- if (ret) {
- ins_nr = 0;
- btrfs_release_path(path);
- continue;
- }
ins_nr = 1;
ins_start_slot = path->slots[0];
next_slot:
@@ -5356,13 +5270,12 @@ next_slot:
}
if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path,
- &last_extent, ins_start_slot,
+ ins_start_slot,
ins_nr, inode_only, logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
- ret = 0;
ins_nr = 0;
}
btrfs_release_path(path);
@@ -5377,14 +5290,13 @@ next_key:
}
}
if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, path, &last_extent,
+ ret = copy_items(trans, inode, dst_path, path,
ins_start_slot, ins_nr, inode_only,
logged_isize);
if (ret < 0) {
err = ret;
goto out_unlock;
}
- ret = 0;
ins_nr = 0;
}
@@ -5397,7 +5309,7 @@ next_key:
if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) {
btrfs_release_path(path);
btrfs_release_path(dst_path);
- err = btrfs_log_trailing_hole(trans, root, inode, path);
+ err = btrfs_log_holes(trans, root, inode, path);
if (err)
goto out_unlock;
}
@@ -6315,9 +6227,28 @@ again:
wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
if (IS_ERR(wc.replay_dest)) {
ret = PTR_ERR(wc.replay_dest);
+
+ /*
+ * We didn't find the subvol, likely because it was
+ * deleted. This is ok, simply skip this log and go to
+ * the next one.
+ *
+ * We need to exclude the root because we can't have
+ * other log replays overwriting this log as we'll read
+ * it back in a few more times. This will keep our
+ * block from being modified, and we'll just bail for
+ * each subsequent pass.
+ */
+ if (ret == -ENOENT)
+ ret = btrfs_pin_extent_for_log_replay(fs_info,
+ log->node->start,
+ log->node->len);
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
kfree(log);
+
+ if (!ret)
+ goto next;
btrfs_handle_fs_error(fs_info, ret,
"Couldn't read target root for tree log recovery.");
goto error;
@@ -6349,7 +6280,6 @@ again:
&root->highest_objectid);
}
- key.offset = found_key.offset - 1;
wc.replay_dest->log_root = NULL;
free_extent_buffer(log->node);
free_extent_buffer(log->commit_root);
@@ -6357,9 +6287,10 @@ again:
if (ret)
goto error;
-
+next:
if (found_key.offset == 0)
break;
+ key.offset = found_key.offset - 1;
}
btrfs_release_path(path);
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
index 91caab63bdf5..76b84f2397b1 100644
--- a/fs/btrfs/uuid-tree.c
+++ b/fs/btrfs/uuid-tree.c
@@ -324,6 +324,8 @@ again_search_slot:
}
if (ret < 0 && ret != -ENOENT)
goto out;
+ key.offset++;
+ goto again_search_slot;
}
item_size -= sizeof(subid_le);
offset += sizeof(subid_le);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 4075b1b95105..81edabb2524d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -216,7 +216,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
*
* global::fs_devs - add, remove, updates to the global list
*
- * does not protect: manipulation of the fs_devices::devices list!
+ * does not protect: manipulation of the fs_devices::devices list in general
+ * but in mount context it could be used to exclude list modifications by,
+ * e.g., the scan ioctl
*
* btrfs_device::name - renames (write side), read is RCU
*
@@ -229,6 +231,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* may be used to exclude some operations from running concurrently without any
* modifications to the list (see write_all_supers)
*
+ * Is not required at mount and close times, because our device list is
+ * protected by the uuid_mutex at that point.
+ *
* balance_mutex
* -------------
* protects balance structures (status, state) and context accessed from
@@ -786,6 +791,11 @@ static int btrfs_free_stale_devices(const char *path,
return ret;
}
+/*
+ * This is only used on mount, and we are protected from competing things
+ * messing with our fs_devices by the uuid_mutex, thus we do not need the
+ * fs_devices->device_list_mutex here.
+ */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, fmode_t flags,
void *holder)
@@ -889,17 +899,54 @@ static struct btrfs_fs_devices *find_fsid_changed(
/*
* Handles the case where scanned device is part of an fs that had
* multiple successful changes of FSID but currently the device didn't
- * observe it. Meaning our fsid will be different than theirs.
+ observe it. Meaning our fsid will be different from theirs. We need
+ to handle two subcases:
+ * 1 - The fs still continues to have different METADATA/FSID uuids.
+ * 2 - The fs is switched back to its original FSID (METADATA/FSID
+ * are equal).
*/
list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ /* Changed UUIDs */
if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
BTRFS_FSID_SIZE) != 0 &&
memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
BTRFS_FSID_SIZE) == 0 &&
memcmp(fs_devices->fsid, disk_super->fsid,
- BTRFS_FSID_SIZE) != 0) {
+ BTRFS_FSID_SIZE) != 0)
+ return fs_devices;
+
+ /* Unchanged UUIDs */
+ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
+ BTRFS_FSID_SIZE) == 0 &&
+ memcmp(fs_devices->fsid, disk_super->metadata_uuid,
+ BTRFS_FSID_SIZE) == 0)
+ return fs_devices;
+ }
+
+ return NULL;
+}
+
+static struct btrfs_fs_devices *find_fsid_reverted_metadata(
+ struct btrfs_super_block *disk_super)
+{
+ struct btrfs_fs_devices *fs_devices;
+
+ /*
+ * Handle the case where the scanned device is part of an fs whose last
+ * metadata UUID change reverted it to the original FSID. At the same
+ * time fs_devices was first created by another constituent device
+ * which didn't fully observe the operation. This results in a
+ * btrfs_fs_devices created with metadata/fsid different AND
+ * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
+ * fs_devices equal to the FSID of the disk.
+ */
+ list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
+ if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
+ BTRFS_FSID_SIZE) != 0 &&
+ memcmp(fs_devices->metadata_uuid, disk_super->fsid,
+ BTRFS_FSID_SIZE) == 0 &&
+ fs_devices->fsid_change)
return fs_devices;
- }
}
return NULL;
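To summarize the matching rules, as read from the two helpers above:

- find_fsid_changed, changed case: fs_devices->metadata_uuid differs from fs_devices->fsid, the disk's metadata_uuid equals fs_devices->metadata_uuid, and the disk's fsid differs from fs_devices->fsid.
- find_fsid_changed, unchanged case: fs_devices->metadata_uuid equals fs_devices->fsid and the disk's metadata_uuid equals fs_devices->fsid.
- find_fsid_reverted_metadata: fs_devices->metadata_uuid differs from fs_devices->fsid with fsid_change set, and the disk's fsid equals fs_devices->metadata_uuid.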
@@ -943,7 +990,9 @@ static noinline struct btrfs_device *device_list_add(const char *path,
fs_devices = find_fsid(disk_super->fsid,
disk_super->metadata_uuid);
} else {
- fs_devices = find_fsid(disk_super->fsid, NULL);
+ fs_devices = find_fsid_reverted_metadata(disk_super);
+ if (!fs_devices)
+ fs_devices = find_fsid(disk_super->fsid, NULL);
}
@@ -973,12 +1022,18 @@ static noinline struct btrfs_device *device_list_add(const char *path,
* a device which had the CHANGING_FSID_V2 flag then replace the
* metadata_uuid/fsid values of the fs_devices.
*/
- if (has_metadata_uuid && fs_devices->fsid_change &&
+ if (fs_devices->fsid_change &&
found_transid > fs_devices->latest_generation) {
memcpy(fs_devices->fsid, disk_super->fsid,
BTRFS_FSID_SIZE);
- memcpy(fs_devices->metadata_uuid,
- disk_super->metadata_uuid, BTRFS_FSID_SIZE);
+
+ if (has_metadata_uuid)
+ memcpy(fs_devices->metadata_uuid,
+ disk_super->metadata_uuid,
+ BTRFS_FSID_SIZE);
+ else
+ memcpy(fs_devices->metadata_uuid,
+ disk_super->fsid, BTRFS_FSID_SIZE);
fs_devices->fsid_change = false;
}
@@ -1182,6 +1237,8 @@ again:
&device->dev_state)) {
if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
&device->dev_state) &&
+ !test_bit(BTRFS_DEV_STATE_MISSING,
+ &device->dev_state) &&
(!latest_dev ||
device->generation > latest_dev->generation)) {
latest_dev = device;
@@ -1375,8 +1432,14 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int ret;
lockdep_assert_held(&uuid_mutex);
+ /*
+ * The device_list_mutex cannot be taken here because opening the
+ * underlying device may take further locks, such as bd_mutex.
+ *
+ * We also don't need the lock here as this is called during mount and
+ * exclusion is provided by uuid_mutex.
+ */
- mutex_lock(&fs_devices->device_list_mutex);
if (fs_devices->opened) {
fs_devices->opened++;
ret = 0;
@@ -1384,7 +1447,6 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
list_sort(NULL, &fs_devices->devices, devid_cmp);
ret = open_fs_devices(fs_devices, flags, holder);
}
- mutex_unlock(&fs_devices->device_list_mutex);
return ret;
}
@@ -4079,7 +4141,11 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
}
}
- num_devices = btrfs_num_devices(fs_info);
+ /*
+ * rw_devices will not change at the moment; device add/delete/replace
+ * are excluded by EXCL_OP
+ */
+ num_devices = fs_info->fs_devices->rw_devices;
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
if (num_devices > 1)
@@ -6649,8 +6715,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
else
generate_random_uuid(dev->uuid);
- btrfs_init_work(&dev->work, btrfs_submit_helper,
- pending_bios_fn, NULL, NULL);
+ btrfs_init_work(&dev->work, pending_bios_fn, NULL, NULL);
return dev;
}
@@ -7180,7 +7245,14 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* otherwise we don't need it.
*/
mutex_lock(&uuid_mutex);
- mutex_lock(&fs_info->chunk_mutex);
+
+ /*
+ * It is possible for mount and umount to race in such a way that
+ * we execute this code path, but open_fs_devices failed to clear
+ * total_rw_bytes. We certainly want it cleared before reading the
+ * device items, so clear it here.
+ */
+ fs_info->fs_devices->total_rw_bytes = 0;
/*
* Read all device items, and then all the chunk items. All
@@ -7217,7 +7289,9 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ mutex_lock(&fs_info->chunk_mutex);
ret = read_one_chunk(&found_key, leaf, chunk);
+ mutex_unlock(&fs_info->chunk_mutex);
if (ret)
goto error;
}
@@ -7247,7 +7321,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
}
ret = 0;
error:
- mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&uuid_mutex);
btrfs_free_path(path);
@@ -7509,6 +7582,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
else
btrfs_dev_stat_reset(dev, i);
}
+ btrfs_info(fs_info, "device stats zeroed by %s (%d)",
+ current->comm, task_pid_nr(current));
} else {
for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
if (stats->nr_items > i)
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 136a3eb64604..c0e93ab55501 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -311,7 +311,6 @@ struct btrfs_bio {
u64 map_type; /* get from map_lookup->type */
bio_end_io_t *end_io;
struct bio *orig_bio;
- unsigned long flags;
void *private;
atomic_t error;
int max_errors;
diff --git a/fs/buffer.c b/fs/buffer.c
index e450c55f6434..3a4b88555ca1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1337,6 +1337,17 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
}
EXPORT_SYMBOL(__breadahead);
+void __breadahead_gfp(struct block_device *bdev, sector_t block, unsigned size,
+ gfp_t gfp)
+{
+ struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);
+ if (likely(bh)) {
+ ll_rw_block(REQ_OP_READ, REQ_RAHEAD, 1, &bh);
+ brelse(bh);
+ }
+}
+EXPORT_SYMBOL(__breadahead_gfp);
+
/**
* __bread_gfp() - reads a specified block and returns the bh
* @bdev: the block_device to read from
@@ -2995,11 +3006,9 @@ static void end_bio_bh_io_sync(struct bio *bio)
* errors, this only handles the "we need to be able to
* do IO at the final sector" case.
*/
-void guard_bio_eod(int op, struct bio *bio)
+void guard_bio_eod(struct bio *bio)
{
sector_t maxsector;
- struct bio_vec *bvec = bio_last_bvec_all(bio);
- unsigned truncated_bytes;
struct hd_struct *part;
rcu_read_lock();
@@ -3025,28 +3034,7 @@ void guard_bio_eod(int op, struct bio *bio)
if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
return;
- /* Uhhuh. We've got a bio that straddles the device size! */
- truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);
-
- /*
- * The bio contains more than one segment which spans EOD, just return
- * and let IO layer turn it into an EIO
- */
- if (truncated_bytes > bvec->bv_len)
- return;
-
- /* Truncate the bio.. */
- bio->bi_iter.bi_size -= truncated_bytes;
- bvec->bv_len -= truncated_bytes;
-
- /* ..and clear the end of the buffer for reads */
- if (op == REQ_OP_READ) {
- struct bio_vec bv;
-
- mp_bvec_last_segment(bvec, &bv);
- zero_user(bv.bv_page, bv.bv_offset + bv.bv_len,
- truncated_bytes);
- }
+ bio_truncate(bio, maxsector << 9);
}
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
@@ -3082,15 +3070,15 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
- /* Take care of bh's that straddle the end of the device */
- guard_bio_eod(op, bio);
-
if (buffer_meta(bh))
op_flags |= REQ_META;
if (buffer_prio(bh))
op_flags |= REQ_PRIO;
bio_set_op_attrs(bio, op, op_flags);
+ /* Take care of bh's that straddle the end of the device */
+ guard_bio_eod(bio);
+
if (wbc) {
wbc_init_bio(wbc, bio);
wbc_account_io(wbc, bh->b_page, bh->b_size);
diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 44a3ce1e4ce4..ad057ed2b30b 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -60,9 +60,9 @@ static int cachefiles_read_waiter(wait_queue_entry_t *wait, unsigned mode,
object = container_of(op->op.object, struct cachefiles_object, fscache);
spin_lock(&object->work_lock);
list_add_tail(&monitor->op_link, &op->to_do);
+ fscache_enqueue_retrieval(op);
spin_unlock(&object->work_lock);
- fscache_enqueue_retrieval(op);
fscache_put_retrieval(op);
return 0;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 07ac2de542eb..9e06a42fa672 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1086,6 +1086,11 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+ /* remove from inode's cap rbtree, and clear auth cap */
+ rb_erase(&cap->ci_node, &ci->i_caps);
+ if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
+
/* remove from session list */
spin_lock(&session->s_cap_lock);
if (session->s_cap_iterator == cap) {
@@ -1119,11 +1124,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
spin_unlock(&session->s_cap_lock);
- /* remove from inode list */
- rb_erase(&cap->ci_node, &ci->i_caps);
- if (ci->i_auth_cap == cap)
- ci->i_auth_cap = NULL;
-
if (removed)
ceph_put_cap(mdsc, cap);
@@ -2005,8 +2005,12 @@ retry_locked:
}
/* want more caps from mds? */
- if (want & ~(cap->mds_wanted | cap->issued))
- goto ack;
+ if (want & ~cap->mds_wanted) {
+ if (want & ~(cap->mds_wanted | cap->issued))
+ goto ack;
+ if (!__cap_is_valid(cap))
+ goto ack;
+ }
/* things we might delay */
if ((cap->issued & ~retain) == 0)
@@ -3661,6 +3665,7 @@ retry:
WARN_ON(1);
tsession = NULL;
target = -1;
+ mutex_lock(&session->s_mutex);
}
goto retry;
@@ -3903,7 +3908,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
__ceph_queue_cap_release(session, cap);
spin_unlock(&session->s_cap_lock);
}
- goto done;
+ goto flush_cap_releases;
}
/* these will work even if we don't have a cap yet */
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 1271024a3797..950cb6b20063 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1542,36 +1542,37 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{
int valid = 0;
struct dentry *parent;
- struct inode *dir;
+ struct inode *dir, *inode;
if (flags & LOOKUP_RCU) {
parent = READ_ONCE(dentry->d_parent);
dir = d_inode_rcu(parent);
if (!dir)
return -ECHILD;
+ inode = d_inode_rcu(dentry);
} else {
parent = dget_parent(dentry);
dir = d_inode(parent);
+ inode = d_inode(dentry);
}
dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
- dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
+ dentry, inode, ceph_dentry(dentry)->offset);
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
- dentry, d_inode(dentry));
+ dentry, inode);
valid = 1;
- } else if (d_really_is_positive(dentry) &&
- ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
+ } else if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
valid = 1;
} else {
valid = dentry_lease_is_valid(dentry, flags, dir);
if (valid == -ECHILD)
return valid;
if (valid || dir_lease_is_valid(dir, dentry)) {
- if (d_really_is_positive(dentry))
- valid = ceph_is_any_caps(d_inode(dentry));
+ if (inode)
+ valid = ceph_is_any_caps(inode);
else
valid = 1;
}
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index d3ef7ee429ec..8982e0c2c6a5 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -172,9 +172,16 @@ struct inode *ceph_lookup_inode(struct super_block *sb, u64 ino)
static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
{
struct inode *inode = __lookup_inode(sb, ino);
+ int err;
+
if (IS_ERR(inode))
return ERR_CAST(inode);
- if (inode->i_nlink == 0) {
+ /* We need LINK caps to reliably check i_nlink */
+ err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false);
+ if (err)
+ return ERR_PTR(err);
+ /* -ESTALE if the inode has been unlinked and no file is open */
+ if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) {
iput(inode);
return ERR_PTR(-ESTALE);
}
@@ -315,6 +322,11 @@ static struct dentry *__get_parent(struct super_block *sb,
req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err) {
+ ceph_mdsc_put_request(req);
+ return ERR_PTR(err);
+ }
+
inode = req->r_target_inode;
if (inode)
ihold(inode);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7a57db8e2fa9..2595052d5eef 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -454,6 +454,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_pre_init_acls(dir, &mode, &acls);
if (err < 0)
return err;
+ } else if (!d_in_lookup(dentry)) {
+ /* If it's not being looked up, it's negative */
+ return -ENOENT;
}
/* do the open */
@@ -1381,9 +1384,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+ struct ceph_osd_client *osdc = &fsc->client->osdc;
struct ceph_cap_flush *prealloc_cf;
ssize_t count, written = 0;
int err, want, got;
+ u32 map_flags;
+ u64 pool_flags;
loff_t pos;
loff_t limit = max(i_size_read(inode), fsc->max_file_size);
@@ -1438,8 +1444,12 @@ retry_snap:
goto out;
}
- /* FIXME: not complete since it doesn't account for being at quota */
- if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_FULL)) {
+ down_read(&osdc->lock);
+ map_flags = osdc->osdmap->flags;
+ pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
+ up_read(&osdc->lock);
+ if ((map_flags & CEPH_OSDMAP_FULL) ||
+ (pool_flags & CEPH_POOL_FLAG_FULL)) {
err = -ENOSPC;
goto out;
}
@@ -1529,7 +1539,8 @@ retry_snap:
}
if (written >= 0) {
- if (ceph_osdmap_flag(&fsc->client->osdc, CEPH_OSDMAP_NEARFULL))
+ if ((map_flags & CEPH_OSDMAP_NEARFULL) ||
+ (pool_flags & CEPH_POOL_FLAG_NEARFULL))
iocb->ki_flags |= IOCB_DSYNC;
written = generic_write_sync(iocb, written);
}
@@ -1923,10 +1934,18 @@ static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
if (ceph_test_mount_opt(ceph_inode_to_client(src_inode), NOCOPYFROM))
return -EOPNOTSUPP;
+ /*
+ * Striped file layouts require that we copy partial objects, but the
+ * OSD copy-from operation only supports full-object copies. Limit
+ * this to non-striped file layouts for now.
+ */
if ((src_ci->i_layout.stripe_unit != dst_ci->i_layout.stripe_unit) ||
- (src_ci->i_layout.stripe_count != dst_ci->i_layout.stripe_count) ||
- (src_ci->i_layout.object_size != dst_ci->i_layout.object_size))
+ (src_ci->i_layout.stripe_count != 1) ||
+ (dst_ci->i_layout.stripe_count != 1) ||
+ (src_ci->i_layout.object_size != dst_ci->i_layout.object_size)) {
+ dout("Invalid src/dst files layout\n");
return -EOPNOTSUPP;
+ }
if (len < src_ci->i_layout.object_size)
return -EOPNOTSUPP; /* no remote copy will be done */
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index b1e49fcf7754..99c980896f79 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1414,6 +1414,7 @@ retry_lookup:
dout(" final dn %p\n", dn);
} else if ((req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
req->r_op == CEPH_MDS_OP_MKSNAP) &&
+ test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
struct inode *dir = req->r_parent;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index ef58306df674..6eddc935d8ce 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -694,8 +694,10 @@ void ceph_mdsc_release_request(struct kref *kref)
/* avoid calling iput_final() in mds dispatch threads */
ceph_async_iput(req->r_inode);
}
- if (req->r_parent)
+ if (req->r_parent) {
ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ ceph_async_iput(req->r_parent);
+ }
ceph_async_iput(req->r_target_inode);
if (req->r_dentry)
dput(req->r_dentry);
@@ -2518,8 +2520,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (!(mdsc->fsc->mount_options->flags &
CEPH_MOUNT_OPT_MOUNTWAIT) &&
!ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
- err = -ENOENT;
- pr_info("probably no mds server is up\n");
+ err = -EHOSTUNREACH;
goto finish;
}
}
@@ -2638,8 +2639,10 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
/* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */
if (req->r_inode)
ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
- if (req->r_parent)
+ if (req->r_parent) {
ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN);
+ ihold(req->r_parent);
+ }
if (req->r_old_dentry_dir)
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
@@ -3032,8 +3035,7 @@ static void handle_session(struct ceph_mds_session *session,
void *end = p + msg->front.iov_len;
struct ceph_mds_session_head *h;
u32 op;
- u64 seq;
- unsigned long features = 0;
+ u64 seq, features = 0;
int wake = 0;
/* decode */
@@ -3051,9 +3053,10 @@ static void handle_session(struct ceph_mds_session *session,
goto bad;
/* version >= 3, feature bits */
ceph_decode_32_safe(&p, end, len, bad);
- ceph_decode_need(&p, end, len, bad);
- memcpy(&features, p, min_t(size_t, len, sizeof(features)));
- p += len;
+ if (len) {
+ ceph_decode_64_safe(&p, end, features, bad);
+ p += len - sizeof(features);
+ }
}
mutex_lock(&mdsc->mutex);
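The decode change above replaces a memcpy() into an unsigned long, whose width differs between 32-bit and 64-bit kernels, with an explicit 64-bit read. A hedged userspace sketch of the same framing, with byte-order conversion and the kernel decode helpers intentionally omitted:

#include <stdint.h>
#include <string.h>

/*
 * A 32-bit length followed by len bytes of feature bits; only the
 * first 64 bits are consumed and the remainder is skipped. Returns 0
 * on success, -1 on a short or malformed buffer.
 */
static int decode_features(const uint8_t *p, size_t avail, uint64_t *features)
{
        uint32_t len;

        if (avail < sizeof(len))
                return -1;
        memcpy(&len, p, sizeof(len));
        p += sizeof(len);
        avail -= sizeof(len);

        *features = 0;
        if (len) {
                if (len < sizeof(*features) || avail < len)
                        return -1;
                memcpy(features, p, sizeof(*features));
                /* p would then advance by len, as in the code above */
        }
        return 0;
}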
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index d629fc857450..8534920aefa7 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -148,8 +148,8 @@ static struct inode *lookup_quotarealm_inode(struct ceph_mds_client *mdsc,
}
in = ceph_lookup_inode(sb, realm->ino);
if (IS_ERR(in)) {
- pr_warn("Can't lookup inode %llx (err: %ld)\n",
- realm->ino, PTR_ERR(in));
+ dout("Can't lookup inode %llx (err: %ld)\n",
+ realm->ino, PTR_ERR(in));
qri->timeout = jiffies + msecs_to_jiffies(60 * 1000); /* XXX */
} else {
qri->timeout = 0;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 213bc1475e91..e630762175a4 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -1152,5 +1152,6 @@ void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
pr_err("snapid map %llx -> %x still in use\n",
sm->snap, sm->dev);
}
+ kfree(sm);
}
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 74ca5970397f..e7b2f6038428 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -106,7 +106,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
-
static int ceph_sync_fs(struct super_block *sb, int wait)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
@@ -213,6 +212,26 @@ static match_table_t fsopt_tokens = {
{-1, NULL}
};
+/*
+ * Remove adjacent slashes and then the trailing slash, unless it is
+ * the only remaining character.
+ *
+ * E.g. "//dir1////dir2///" --> "/dir1/dir2", "///" --> "/".
+ */
+static void canonicalize_path(char *path)
+{
+ int i, j = 0;
+
+ for (i = 0; path[i] != '\0'; i++) {
+ if (path[i] != '/' || j < 1 || path[j - 1] != '/')
+ path[j++] = path[i];
+ }
+
+ if (j > 1 && path[j - 1] == '/')
+ j--;
+ path[j] = '\0';
+}
+
static int parse_fsopt_token(char *c, void *private)
{
struct ceph_mount_options *fsopt = private;
@@ -255,6 +274,7 @@ static int parse_fsopt_token(char *c, void *private)
return -ENOMEM;
break;
case Opt_fscache_uniq:
+#ifdef CONFIG_CEPH_FSCACHE
kfree(fsopt->fscache_uniq);
fsopt->fscache_uniq = kstrndup(argstr[0].from,
argstr[0].to-argstr[0].from,
@@ -263,7 +283,10 @@ static int parse_fsopt_token(char *c, void *private)
return -ENOMEM;
fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
break;
- /* misc */
+#else
+ pr_err("fscache support is disabled\n");
+ return -EINVAL;
+#endif
case Opt_wsize:
if (intval < (int)PAGE_SIZE || intval > CEPH_MAX_WRITE_SIZE)
return -EINVAL;
@@ -340,10 +363,15 @@ static int parse_fsopt_token(char *c, void *private)
fsopt->flags &= ~CEPH_MOUNT_OPT_INO32;
break;
case Opt_fscache:
+#ifdef CONFIG_CEPH_FSCACHE
fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
kfree(fsopt->fscache_uniq);
fsopt->fscache_uniq = NULL;
break;
+#else
+ pr_err("fscache support is disabled\n");
+ return -EINVAL;
+#endif
case Opt_nofscache:
fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE;
kfree(fsopt->fscache_uniq);
@@ -424,12 +452,15 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
if (ret)
return ret;
+
ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
if (ret)
return ret;
+
ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
if (ret)
return ret;
+
ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq);
if (ret)
return ret;
@@ -485,13 +516,17 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
*/
dev_name_end = strchr(dev_name, '/');
if (dev_name_end) {
- if (strlen(dev_name_end) > 1) {
- fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
- if (!fsopt->server_path) {
- err = -ENOMEM;
- goto out;
- }
+ /*
+ * The server_path keeps the entire string passed in from userland,
+ * including the leading '/'.
+ */
+ fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
+ if (!fsopt->server_path) {
+ err = -ENOMEM;
+ goto out;
}
+
+ canonicalize_path(fsopt->server_path);
} else {
dev_name_end = dev_name + strlen(dev_name);
}
@@ -814,7 +849,6 @@ static void destroy_caches(void)
ceph_fscache_unregister();
}
-
/*
* ceph_umount_begin - initiate forced umount. Tear down the
* mount, skipping steps that may hang while waiting for server(s).
@@ -901,9 +935,6 @@ out:
return root;
}
-
-
-
/*
* mount: join the ceph cluster, and open root directory.
*/
@@ -917,7 +948,9 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
mutex_lock(&fsc->client->mount_mutex);
if (!fsc->sb->s_root) {
- const char *path;
+ const char *path = fsc->mount_options->server_path ?
+ fsc->mount_options->server_path + 1 : "";
+
err = __ceph_open_session(fsc->client, started);
if (err < 0)
goto out;
@@ -929,13 +962,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
goto out;
}
- if (!fsc->mount_options->server_path) {
- path = "";
- dout("mount opening path \\t\n");
- } else {
- path = fsc->mount_options->server_path + 1;
- dout("mount opening path %s\n", path);
- }
+ dout("mount opening path '%s'\n", path);
err = ceph_fs_debugfs_init(fsc);
if (err < 0)
@@ -1110,6 +1137,11 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
return res;
out_splat:
+ if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) {
+ pr_info("No mds server is up or the cluster is laggy\n");
+ err = -EHOSTUNREACH;
+ }
+
ceph_mdsc_close_sessions(fsc->mdsc);
deactivate_locked_super(sb);
goto out_final;
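
canonicalize_path() from the hunk above can be exercised stand-alone; a small userspace harness reproducing the examples given in its comment:

#include <stdio.h>

static void canonicalize_path(char *path)
{
        int i, j = 0;

        for (i = 0; path[i] != '\0'; i++) {
                if (path[i] != '/' || j < 1 || path[j - 1] != '/')
                        path[j++] = path[i];
        }
        if (j > 1 && path[j - 1] == '/')
                j--;
        path[j] = '\0';
}

int main(void)
{
        char a[] = "//dir1////dir2///";
        char b[] = "///";

        canonicalize_path(a);
        canonicalize_path(b);
        printf("%s\n%s\n", a, b);       /* "/dir1/dir2" and "/" */
        return 0;
}
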
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 38b42d7594b6..1283a5774920 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -90,7 +90,7 @@ struct ceph_mount_options {
char *snapdir_name; /* default ".snap" */
char *mds_namespace; /* default NULL */
- char *server_path; /* default "/" */
+ char *server_path; /* default NULL (means "/") */
char *fscache_uniq; /* default NULL */
};
diff --git a/fs/char_dev.c b/fs/char_dev.c
index d18cad28c1c3..a4eee26c5a50 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -351,7 +351,7 @@ static struct kobject *cdev_get(struct cdev *p)
if (owner && !try_module_get(owner))
return NULL;
- kobj = kobject_get(&p->kobj);
+ kobj = kobject_get_unless_zero(&p->kobj);
if (!kobj)
module_put(owner);
return kobj;
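
cdev_get() now uses kobject_get_unless_zero() so it cannot resurrect a cdev whose refcount has already hit zero and is being torn down. An illustrative sketch of the get-unless-zero idea in C11 atomics (not the kobject implementation):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool get_unless_zero(atomic_int *refs)
{
        int old = atomic_load(refs);

        while (old != 0) {
                if (atomic_compare_exchange_weak(refs, &old, old + 1))
                        return true;    /* took a reference */
        }
        return false;                   /* object already on its way out */
}

int main(void)
{
        atomic_int live = 1, dying = 0;

        printf("%d %d\n", get_unless_zero(&live), get_unless_zero(&dying));
        return 0;
}
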
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index ec933fb0b36e..e6247b124e63 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -258,6 +258,11 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v)
if (!server->rdma)
goto skip_rdma;
+ if (!server->smbd_conn) {
+ seq_printf(m, "\nSMBDirect transport not available");
+ goto skip_rdma;
+ }
+
seq_printf(m, "\nSMBDirect (in hex) protocol version: %x "
"transport status: %x",
server->smbd_conn->protocol,
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 41957b82d796..cc3ada12848d 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -120,17 +120,17 @@ cifs_build_devname(char *nodename, const char *prepath)
/**
- * cifs_compose_mount_options - creates mount options for refferral
+ * cifs_compose_mount_options - creates mount options for referral
* @sb_mountdata: parent/root DFS mount options (template)
* @fullpath: full path in UNC format
- * @ref: server's referral
+ * @ref: optional server's referral
* @devname: optional pointer for saving device name
*
* creates mount options for submount based on template options sb_mountdata
* and replacing unc,ip,prefixpath options with ones we've got from ref_unc.
*
* Returns: pointer to new mount options or ERR_PTR.
- * Caller is responcible for freeing retunrned value if it is not error.
+ * Caller is responsible for freeing returned value if it is not error.
*/
char *cifs_compose_mount_options(const char *sb_mountdata,
const char *fullpath,
@@ -150,18 +150,27 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
if (sb_mountdata == NULL)
return ERR_PTR(-EINVAL);
- if (strlen(fullpath) - ref->path_consumed) {
- prepath = fullpath + ref->path_consumed;
- /* skip initial delimiter */
- if (*prepath == '/' || *prepath == '\\')
- prepath++;
- }
+ if (ref) {
+ if (strlen(fullpath) - ref->path_consumed) {
+ prepath = fullpath + ref->path_consumed;
+ /* skip initial delimiter */
+ if (*prepath == '/' || *prepath == '\\')
+ prepath++;
+ }
- name = cifs_build_devname(ref->node_name, prepath);
- if (IS_ERR(name)) {
- rc = PTR_ERR(name);
- name = NULL;
- goto compose_mount_options_err;
+ name = cifs_build_devname(ref->node_name, prepath);
+ if (IS_ERR(name)) {
+ rc = PTR_ERR(name);
+ name = NULL;
+ goto compose_mount_options_err;
+ }
+ } else {
+ name = cifs_build_devname((char *)fullpath, NULL);
+ if (IS_ERR(name)) {
+ rc = PTR_ERR(name);
+ name = NULL;
+ goto compose_mount_options_err;
+ }
}
rc = dns_resolve_server_name_to_ip(name, &srvIP);
@@ -225,6 +234,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
if (devname)
*devname = name;
+ else
+ kfree(name);
/*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/
/*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/
@@ -241,23 +252,23 @@ compose_mount_options_err:
}
/**
- * cifs_dfs_do_refmount - mounts specified path using provided refferal
+ * cifs_dfs_do_mount - mounts specified path using DFS full path
+ *
+ * Always pass down @fullpath to smb3_do_mount() so we can use the root server
+ * to perform failover in case we failed to connect to the first target in the
+ * referral.
+ *
* @cifs_sb: parent/root superblock
* @fullpath: full path in UNC format
- * @ref: server's referral
*/
-static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
- struct cifs_sb_info *cifs_sb,
- const char *fullpath, const struct dfs_info3_param *ref)
+static struct vfsmount *cifs_dfs_do_mount(struct dentry *mntpt,
+ struct cifs_sb_info *cifs_sb,
+ const char *fullpath)
{
struct vfsmount *mnt;
char *mountdata;
char *devname;
- /*
- * Always pass down the DFS full path to smb3_do_mount() so we
- * can use it later for failover.
- */
devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
if (!devname)
return ERR_PTR(-ENOMEM);
@@ -266,7 +277,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
/* strip first '\' from fullpath */
mountdata = cifs_compose_mount_options(cifs_sb->mountdata,
- fullpath + 1, ref, NULL);
+ fullpath + 1, NULL, NULL);
if (IS_ERR(mountdata)) {
kfree(devname);
return (struct vfsmount *)mountdata;
@@ -278,28 +289,16 @@ static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt,
return mnt;
}
-static void dump_referral(const struct dfs_info3_param *ref)
-{
- cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name);
- cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name);
- cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n",
- ref->flags, ref->server_type);
- cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n",
- ref->ref_flag, ref->path_consumed);
-}
-
/*
* Create a vfsmount that we can automount
*/
static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
{
- struct dfs_info3_param referral = {0};
struct cifs_sb_info *cifs_sb;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
char *full_path, *root_path;
unsigned int xid;
- int len;
int rc;
struct vfsmount *mnt;
@@ -325,6 +324,8 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
if (full_path == NULL)
goto cdda_exit;
+ convert_delimiter(full_path, '\\');
+
cifs_dbg(FYI, "%s: full_path: %s\n", __func__, full_path);
if (!cifs_sb_master_tlink(cifs_sb)) {
@@ -357,7 +358,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
if (!rc) {
rc = dfs_cache_find(xid, ses, cifs_sb->local_nls,
cifs_remap(cifs_sb), full_path + 1,
- &referral, NULL);
+ NULL, NULL);
}
free_xid(xid);
@@ -366,26 +367,16 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt)
mnt = ERR_PTR(rc);
goto free_root_path;
}
-
- dump_referral(&referral);
-
- len = strlen(referral.node_name);
- if (len < 2) {
- cifs_dbg(VFS, "%s: Net Address path too short: %s\n",
- __func__, referral.node_name);
- mnt = ERR_PTR(-EINVAL);
- goto free_dfs_ref;
- }
/*
- * cifs_mount() will retry every available node server in case
- * of failures.
+ * OK - we were able to get and cache a referral for @full_path.
+ *
+ * Now, pass it down to cifs_mount() and it will retry every available
+ * node server in case of failures - no need to do it here.
*/
- mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, &referral);
- cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__,
- referral.node_name, mnt);
+ mnt = cifs_dfs_do_mount(mntpt, cifs_sb, full_path);
+ cifs_dbg(FYI, "%s: cifs_dfs_do_mount:%s , mnt:%p\n", __func__,
+ full_path + 1, mnt);
-free_dfs_ref:
- free_dfs_info_param(&referral);
free_root_path:
kfree(root_path);
free_full_path:
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1d377b7f2860..130bdca9e568 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -603,7 +603,7 @@ static void access_flags_to_mode(__le32 ace_flags, int type, umode_t *pmode,
((flags & FILE_EXEC_RIGHTS) == FILE_EXEC_RIGHTS))
*pmode |= (S_IXUGO & (*pbits_to_set));
- cifs_dbg(NOISY, "access flags 0x%x mode now 0x%x\n", flags, *pmode);
+ cifs_dbg(NOISY, "access flags 0x%x mode now %04o\n", flags, *pmode);
return;
}
@@ -632,7 +632,7 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use,
if (mode & S_IXUGO)
*pace_flags |= SET_FILE_EXEC_RIGHTS;
- cifs_dbg(NOISY, "mode: 0x%x, access flags now 0x%x\n",
+ cifs_dbg(NOISY, "mode: %04o, access flags now 0x%x\n",
mode, *pace_flags);
return;
}
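
Both format-string changes above switch mode bits from hex to zero-padded octal; a two-line demonstration of why the hex form was misleading:

#include <stdio.h>

int main(void)
{
        unsigned int mode = 0644;

        printf("hex: 0x%x  octal: %04o\n", mode, mode); /* 0x1a4 vs 0644 */
        return 0;
}
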
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index bd34ea0d27e9..dd19556c835f 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -376,7 +376,7 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
seq_puts(s, "ntlm");
break;
case Kerberos:
- seq_printf(s, "krb5,cruid=%u", from_kuid_munged(&init_user_ns,ses->cred_uid));
+ seq_puts(s, "krb5");
break;
case RawNTLMSSP:
seq_puts(s, "ntlmssp");
@@ -389,6 +389,10 @@ cifs_show_security(struct seq_file *s, struct cifs_ses *ses)
if (ses->sign)
seq_puts(s, "i");
+
+ if (ses->sectype == Kerberos)
+ seq_printf(s, ",cruid=%u",
+ from_kuid_munged(&init_user_ns, ses->cred_uid));
}
static void
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 85aa1bc930f1..6ccff44dbd4b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1376,6 +1376,11 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
struct cifsInodeInfo {
bool can_cache_brlcks;
struct list_head llist; /* locks held by this inode */
+ /*
+ * NOTE: Some code paths call down_read(lock_sem) twice, so
+ * we must always use cifs_down_write() instead of down_write()
+ * for this semaphore to avoid deadlocks.
+ */
struct rw_semaphore lock_sem; /* protect the fields above */
/* BB add in lists for dirty pages i.e. write caching info for oplock */
struct list_head openFileList;
@@ -1504,6 +1509,7 @@ struct mid_q_entry {
struct TCP_Server_Info *server; /* server corresponding to this mid */
__u64 mid; /* multiplex id */
__u16 credits; /* number of credits consumed by this mid */
+ __u16 credits_received; /* number of credits from the response */
__u32 pid; /* process id */
__u32 sequence_number; /* for CIFS signing */
unsigned long when_alloc; /* when mid was created */
@@ -1515,6 +1521,7 @@ struct mid_q_entry {
mid_callback_t *callback; /* call completion callback */
mid_handle_t *handle; /* call handle mid callback */
void *callback_data; /* general purpose pointer for callback */
+ struct task_struct *creator;
void *resp_buf; /* pointer to received SMB header */
unsigned int resp_buf_size;
int mid_state; /* wish this were enum but can not pass to wait_event */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index e23234207fc2..6d7c692ce2ba 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -166,6 +166,7 @@ extern int cifs_unlock_range(struct cifsFileInfo *cfile,
struct file_lock *flock, const unsigned int xid);
extern int cifs_push_mandatory_locks(struct cifsFileInfo *cfile);
+extern void cifs_down_write(struct rw_semaphore *sem);
extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid,
struct file *file,
struct tcon_link *tlink,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 1fbd92843a73..0f40811a481b 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2148,8 +2148,8 @@ cifs_writev_requeue(struct cifs_writedata *wdata)
}
}
+ kref_put(&wdata2->refcount, cifs_writedata_release);
if (rc) {
- kref_put(&wdata2->refcount, cifs_writedata_release);
if (is_retryable_error(rc))
continue;
i += nr_pages;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index bd8c00635ea4..8924f1427472 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -360,8 +360,10 @@ static int reconn_set_ipaddr(struct TCP_Server_Info *server)
return rc;
}
+ spin_lock(&cifs_tcp_ses_lock);
rc = cifs_convert_address((struct sockaddr *)&server->dstaddr, ipaddr,
strlen(ipaddr));
+ spin_unlock(&cifs_tcp_ses_lock);
kfree(ipaddr);
return !rc ? -1 : 0;
@@ -376,7 +378,7 @@ static inline int reconn_set_ipaddr(struct TCP_Server_Info *server)
#ifdef CONFIG_CIFS_DFS_UPCALL
struct super_cb_data {
struct TCP_Server_Info *server;
- struct cifs_sb_info *cifs_sb;
+ struct super_block *sb;
};
/* These functions must be called with server->srv_mutex held */
@@ -387,25 +389,39 @@ static void super_cb(struct super_block *sb, void *arg)
struct cifs_sb_info *cifs_sb;
struct cifs_tcon *tcon;
- if (d->cifs_sb)
+ if (d->sb)
return;
cifs_sb = CIFS_SB(sb);
tcon = cifs_sb_master_tcon(cifs_sb);
if (tcon->ses->server == d->server)
- d->cifs_sb = cifs_sb;
+ d->sb = sb;
}
-static inline struct cifs_sb_info *
-find_super_by_tcp(struct TCP_Server_Info *server)
+static struct super_block *get_tcp_super(struct TCP_Server_Info *server)
{
struct super_cb_data d = {
.server = server,
- .cifs_sb = NULL,
+ .sb = NULL,
};
iterate_supers_type(&cifs_fs_type, super_cb, &d);
- return d.cifs_sb ? d.cifs_sb : ERR_PTR(-ENOENT);
+
+ if (unlikely(!d.sb))
+ return ERR_PTR(-ENOENT);
+ /*
+ * Grab an active reference in order to prevent automounts (DFS links)
+ * from expiring and then freeing up our cifs superblock pointer while
+ * we're doing failover.
+ */
+ cifs_sb_active(d.sb);
+ return d.sb;
+}
+
+static inline void put_tcp_super(struct super_block *sb)
+{
+ if (!IS_ERR_OR_NULL(sb))
+ cifs_sb_deactive(sb);
}
static void reconn_inval_dfs_target(struct TCP_Server_Info *server,
@@ -469,6 +485,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
struct mid_q_entry *mid_entry;
struct list_head retry_list;
#ifdef CONFIG_CIFS_DFS_UPCALL
+ struct super_block *sb = NULL;
struct cifs_sb_info *cifs_sb = NULL;
struct dfs_cache_tgt_list tgt_list = {0};
struct dfs_cache_tgt_iterator *tgt_it = NULL;
@@ -478,13 +495,15 @@ cifs_reconnect(struct TCP_Server_Info *server)
server->nr_targets = 1;
#ifdef CONFIG_CIFS_DFS_UPCALL
spin_unlock(&GlobalMid_Lock);
- cifs_sb = find_super_by_tcp(server);
- if (IS_ERR(cifs_sb)) {
- rc = PTR_ERR(cifs_sb);
+ sb = get_tcp_super(server);
+ if (IS_ERR(sb)) {
+ rc = PTR_ERR(sb);
cifs_dbg(FYI, "%s: will not do DFS failover: rc = %d\n",
__func__, rc);
- cifs_sb = NULL;
+ sb = NULL;
} else {
+ cifs_sb = CIFS_SB(sb);
+
rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list, &tgt_it);
if (rc && (rc != -EOPNOTSUPP)) {
cifs_dbg(VFS, "%s: no target servers for DFS failover\n",
@@ -501,6 +520,10 @@ cifs_reconnect(struct TCP_Server_Info *server)
/* the demux thread will exit normally
next time through the loop */
spin_unlock(&GlobalMid_Lock);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ dfs_cache_free_tgts(&tgt_list);
+ put_tcp_super(sb);
+#endif
return rc;
} else
server->tcpStatus = CifsNeedReconnect;
@@ -553,9 +576,11 @@ cifs_reconnect(struct TCP_Server_Info *server)
spin_lock(&GlobalMid_Lock);
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
+ kref_get(&mid_entry->refcount);
if (mid_entry->mid_state == MID_REQUEST_SUBMITTED)
mid_entry->mid_state = MID_RETRY_NEEDED;
list_move(&mid_entry->qhead, &retry_list);
+ mid_entry->mid_flags |= MID_DELETED;
}
spin_unlock(&GlobalMid_Lock);
mutex_unlock(&server->srv_mutex);
@@ -565,6 +590,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
list_del_init(&mid_entry->qhead);
mid_entry->callback(mid_entry);
+ cifs_mid_q_entry_release(mid_entry);
}
if (cifs_rdma_enabled(server)) {
@@ -577,26 +603,26 @@ cifs_reconnect(struct TCP_Server_Info *server)
try_to_freeze();
mutex_lock(&server->srv_mutex);
+#ifdef CONFIG_CIFS_DFS_UPCALL
/*
* Set up next DFS target server (if any) for reconnect. If DFS
* feature is disabled, then we will retry last server we
* connected to before.
*/
+ reconn_inval_dfs_target(server, cifs_sb, &tgt_list, &tgt_it);
+#endif
+ rc = reconn_set_ipaddr(server);
+ if (rc) {
+ cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n",
+ __func__, rc);
+ }
+
if (cifs_rdma_enabled(server))
rc = smbd_reconnect(server);
else
rc = generic_ip_connect(server);
if (rc) {
cifs_dbg(FYI, "reconnect error %d\n", rc);
-#ifdef CONFIG_CIFS_DFS_UPCALL
- reconn_inval_dfs_target(server, cifs_sb, &tgt_list,
- &tgt_it);
-#endif
- rc = reconn_set_ipaddr(server);
- if (rc) {
- cifs_dbg(FYI, "%s: failed to resolve hostname: %d\n",
- __func__, rc);
- }
mutex_unlock(&server->srv_mutex);
msleep(3000);
} else {
@@ -624,7 +650,10 @@ cifs_reconnect(struct TCP_Server_Info *server)
__func__, rc);
}
dfs_cache_free_tgts(&tgt_list);
+
}
+
+ put_tcp_super(sb);
#endif
if (server->tcpStatus == CifsNeedNegotiate)
mod_delayed_work(cifsiod_wq, &server->echo, 0);
@@ -884,11 +913,27 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
if (mid->mid_flags & MID_DELETED)
printk_once(KERN_WARNING
"trying to dequeue a deleted mid\n");
- else
+ else {
list_del_init(&mid->qhead);
+ mid->mid_flags |= MID_DELETED;
+ }
spin_unlock(&GlobalMid_Lock);
}
+static unsigned int
+smb2_get_credits_from_hdr(char *buffer, struct TCP_Server_Info *server)
+{
+ struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer;
+
+ /*
+ * SMB1 does not use credits.
+ */
+ if (server->vals->header_preamble_size)
+ return 0;
+
+ return le16_to_cpu(shdr->CreditRequest);
+}
+
static void
handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
char *buf, int malformed)
@@ -896,6 +941,7 @@ handle_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server,
if (server->ops->check_trans2 &&
server->ops->check_trans2(mid, server, buf, malformed))
return;
+ mid->credits_received = smb2_get_credits_from_hdr(buf, server);
mid->resp_buf = buf;
mid->large_buf = server->large_buf;
/* Was previous buf put in mpx struct for multi-rsp? */
@@ -955,8 +1001,10 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
list_for_each_safe(tmp, tmp2, &server->pending_mid_q) {
mid_entry = list_entry(tmp, struct mid_q_entry, qhead);
cifs_dbg(FYI, "Clearing mid 0x%llx\n", mid_entry->mid);
+ kref_get(&mid_entry->refcount);
mid_entry->mid_state = MID_SHUTDOWN;
list_move(&mid_entry->qhead, &dispose_list);
+ mid_entry->mid_flags |= MID_DELETED;
}
spin_unlock(&GlobalMid_Lock);
@@ -966,6 +1014,7 @@ static void clean_demultiplex_info(struct TCP_Server_Info *server)
cifs_dbg(FYI, "Callback mid 0x%llx\n", mid_entry->mid);
list_del_init(&mid_entry->qhead);
mid_entry->callback(mid_entry);
+ cifs_mid_q_entry_release(mid_entry);
}
/* 1/8th of sec is more than enough time for them to exit */
msleep(125);
@@ -1202,12 +1251,6 @@ next_pdu:
for (i = 0; i < num_mids; i++) {
if (mids[i] != NULL) {
mids[i]->resp_buf_size = server->pdu_size;
- if ((mids[i]->mid_flags & MID_WAIT_CANCELLED) &&
- mids[i]->mid_state == MID_RESPONSE_RECEIVED &&
- server->ops->handle_cancelled_mid)
- server->ops->handle_cancelled_mid(
- mids[i]->resp_buf,
- server);
if (!mids[i]->multiRsp || mids[i]->multiEnd)
mids[i]->callback(mids[i]);
@@ -3248,6 +3291,10 @@ cifs_find_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &ses->tcon_list) {
tcon = list_entry(tmp, struct cifs_tcon, tcon_list);
+#ifdef CONFIG_CIFS_DFS_UPCALL
+ if (tcon->dfs_path)
+ continue;
+#endif
if (!match_tcon(tcon, volume_info))
continue;
++tcon->tc_count;
@@ -3532,8 +3579,10 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data)
{
struct cifs_sb_info *old = CIFS_SB(sb);
struct cifs_sb_info *new = mnt_data->cifs_sb;
- bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
- bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH;
+ bool old_set = (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) &&
+ old->prepath;
+ bool new_set = (new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) &&
+ new->prepath;
if (old_set && new_set && !strcmp(new->prepath, old->prepath))
return 1;
@@ -3964,7 +4013,7 @@ int cifs_setup_cifs_sb(struct smb_vol *pvolume_info,
cifs_sb->mnt_gid = pvolume_info->linux_gid;
cifs_sb->mnt_file_mode = pvolume_info->file_mode;
cifs_sb->mnt_dir_mode = pvolume_info->dir_mode;
- cifs_dbg(FYI, "file mode: 0x%hx dir mode: 0x%hx\n",
+ cifs_dbg(FYI, "file mode: %04ho dir mode: %04ho\n",
cifs_sb->mnt_file_mode, cifs_sb->mnt_dir_mode);
cifs_sb->actimeo = pvolume_info->actimeo;
@@ -4582,6 +4631,17 @@ static int is_path_remote(struct cifs_sb_info *cifs_sb, struct smb_vol *vol,
}
#ifdef CONFIG_CIFS_DFS_UPCALL
+static inline void set_root_tcon(struct cifs_sb_info *cifs_sb,
+ struct cifs_tcon *tcon,
+ struct cifs_tcon **root)
+{
+ spin_lock(&cifs_tcp_ses_lock);
+ tcon->tc_count++;
+ tcon->remap = cifs_remap(cifs_sb);
+ spin_unlock(&cifs_tcp_ses_lock);
+ *root = tcon;
+}
+
int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
{
int rc = 0;
@@ -4683,18 +4743,10 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
/* Cache out resolved root server */
(void)dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb),
root_path + 1, NULL, NULL);
- /*
- * Save root tcon for additional DFS requests to update or create a new
- * DFS cache entry, or even perform DFS failover.
- */
- spin_lock(&cifs_tcp_ses_lock);
- tcon->tc_count++;
- tcon->dfs_path = root_path;
+ kfree(root_path);
root_path = NULL;
- tcon->remap = cifs_remap(cifs_sb);
- spin_unlock(&cifs_tcp_ses_lock);
- root_tcon = tcon;
+ set_root_tcon(cifs_sb, tcon, &root_tcon);
for (count = 1; ;) {
if (!rc && tcon) {
@@ -4731,6 +4783,15 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *vol)
mount_put_conns(cifs_sb, xid, server, ses, tcon);
rc = mount_get_conns(vol, cifs_sb, &xid, &server, &ses,
&tcon);
+ /*
+ * Ensure that DFS referrals go through the new root server.
+ */
+ if (!rc && tcon &&
+ (tcon->share_flags & (SHI1005_FLAGS_DFS |
+ SHI1005_FLAGS_DFS_ROOT))) {
+ cifs_put_tcon(root_tcon);
+ set_root_tcon(cifs_sb, tcon, &root_tcon);
+ }
}
if (rc) {
if (rc == -EACCES || rc == -EOPNOTSUPP)
@@ -5110,9 +5171,15 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
vol_info->nocase = master_tcon->nocase;
vol_info->nohandlecache = master_tcon->nohandlecache;
vol_info->local_lease = master_tcon->local_lease;
+ vol_info->no_lease = master_tcon->no_lease;
+ vol_info->resilient = master_tcon->use_resilient;
+ vol_info->persistent = master_tcon->use_persistent;
+ vol_info->handle_timeout = master_tcon->handle_timeout;
vol_info->no_linux_ext = !master_tcon->unix_ext;
+ vol_info->linux_ext = master_tcon->posix_extensions;
vol_info->sectype = master_tcon->ses->sectype;
vol_info->sign = master_tcon->ses->sign;
+ vol_info->seal = master_tcon->seal;
rc = cifs_set_vol_auth(vol_info, master_tcon->ses);
if (rc) {
@@ -5138,10 +5205,6 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid)
goto out;
}
- /* if new SMB3.11 POSIX extensions are supported do not remap / and \ */
- if (tcon->posix_extensions)
- cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS;
-
if (cap_unix(ses))
reset_cifs_unix_caps(0, tcon, NULL, vol_info);
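
Several hunks above replace the raw cifs_sb pointer with a pinned superblock obtained through get_tcp_super()/put_tcp_super(). A hedged userspace analogue of that get/put pairing, where the putter tolerates NULL so cleanup paths can call it unconditionally (stub types, illustrative only):

#include <stdio.h>

struct sb { int active; };

static struct sb *get_sb(struct sb *found)
{
        if (!found)
                return NULL;            /* no super matched the TCP session */
        found->active++;                /* cifs_sb_active(): pin for failover */
        return found;
}

static void put_sb(struct sb *sb)
{
        if (!sb)                        /* mirrors the IS_ERR_OR_NULL() check */
                return;
        sb->active--;                   /* cifs_sb_deactive() */
}

int main(void)
{
        struct sb s = { .active = 1 };
        struct sb *pinned = get_sb(&s);

        /* ... DFS failover work would run here ... */
        put_sb(pinned);
        put_sb(NULL);                   /* unconditional cleanup stays safe */
        printf("active=%d\n", s.active);
        return 0;
}
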
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index e3e1c13df439..24e2e22103a1 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -1317,10 +1317,9 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi,
int rc;
struct dfs_info3_param ref = {0};
char *mdata = NULL, *devname = NULL;
- bool is_smb3 = tcon->ses->server->vals->header_preamble_size == 0;
struct TCP_Server_Info *server;
struct cifs_ses *ses;
- struct smb_vol vol;
+ struct smb_vol vol = {NULL};
rpath = get_dfs_root(path);
if (IS_ERR(rpath))
@@ -1344,7 +1343,7 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi,
goto out;
}
- rc = cifs_setup_volume_info(&vol, mdata, devname, is_smb3);
+ rc = cifs_setup_volume_info(&vol, mdata, devname, false);
kfree(devname);
if (rc) {
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 53b5bdf465e0..bf7644bf0605 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -561,7 +561,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
if (server->ops->close)
server->ops->close(xid, tcon, &fid);
cifs_del_pending_open(&open);
- fput(file);
rc = -ENOMEM;
}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 53dbb6e0d390..076105389a63 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -281,6 +281,13 @@ cifs_has_mand_locks(struct cifsInodeInfo *cinode)
return has_locks;
}
+void
+cifs_down_write(struct rw_semaphore *sem)
+{
+ while (!down_write_trylock(sem))
+ msleep(10);
+}
+
struct cifsFileInfo *
cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
struct tcon_link *tlink, __u32 oplock)
@@ -306,9 +313,6 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
INIT_LIST_HEAD(&fdlocks->locks);
fdlocks->cfile = cfile;
cfile->llist = fdlocks;
- down_write(&cinode->lock_sem);
- list_add(&fdlocks->llist, &cinode->llist);
- up_write(&cinode->lock_sem);
cfile->count = 1;
cfile->pid = current->tgid;
@@ -332,6 +336,10 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
oplock = 0;
}
+ cifs_down_write(&cinode->lock_sem);
+ list_add(&fdlocks->llist, &cinode->llist);
+ up_write(&cinode->lock_sem);
+
spin_lock(&tcon->open_file_lock);
if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock)
oplock = fid->pending_open->oplock;
@@ -464,7 +472,7 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler)
* Delete any outstanding lock records. We'll lose them when the file
* is closed anyway.
*/
- down_write(&cifsi->lock_sem);
+ cifs_down_write(&cifsi->lock_sem);
list_for_each_entry_safe(li, tmp, &cifs_file->llist->locks, llist) {
list_del(&li->llist);
cifs_del_lock_waiters(li);
@@ -721,6 +729,13 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
if (backup_cred(cifs_sb))
create_options |= CREATE_OPEN_BACKUP_INTENT;
+ /* O_SYNC also includes the O_DSYNC bit, so the following check picks up either */
+ if (cfile->f_flags & O_SYNC)
+ create_options |= CREATE_WRITE_THROUGH;
+
+ if (cfile->f_flags & O_DIRECT)
+ create_options |= CREATE_NO_BUFFER;
+
if (server->ops->get_lease_key)
server->ops->get_lease_key(inode, &cfile->fid);
@@ -1027,7 +1042,7 @@ static void
cifs_lock_add(struct cifsFileInfo *cfile, struct cifsLockInfo *lock)
{
struct cifsInodeInfo *cinode = CIFS_I(d_inode(cfile->dentry));
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_add_tail(&lock->llist, &cfile->llist->locks);
up_write(&cinode->lock_sem);
}
@@ -1049,7 +1064,7 @@ cifs_lock_add_if(struct cifsFileInfo *cfile, struct cifsLockInfo *lock,
try_again:
exist = false;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
lock->type, lock->flags, &conf_lock,
@@ -1072,7 +1087,7 @@ try_again:
(lock->blist.next == &lock->blist));
if (!rc)
goto try_again;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_del_init(&lock->blist);
}
@@ -1125,7 +1140,7 @@ cifs_posix_lock_set(struct file *file, struct file_lock *flock)
return rc;
try_again:
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
if (!cinode->can_cache_brlcks) {
up_write(&cinode->lock_sem);
return rc;
@@ -1134,7 +1149,8 @@ try_again:
rc = posix_lock_file(file, flock, NULL);
up_write(&cinode->lock_sem);
if (rc == FILE_LOCK_DEFERRED) {
- rc = wait_event_interruptible(flock->fl_wait, !flock->fl_blocker);
+ rc = wait_event_interruptible(flock->fl_wait,
+ list_empty(&flock->fl_blocked_member));
if (!rc)
goto try_again;
locks_delete_block(flock);
@@ -1331,7 +1347,7 @@ cifs_push_locks(struct cifsFileInfo *cfile)
int rc = 0;
/* we are going to update can_cache_brlcks here - need a write access */
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
if (!cinode->can_cache_brlcks) {
up_write(&cinode->lock_sem);
return rc;
@@ -1522,7 +1538,7 @@ cifs_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
if (!buf)
return -ENOMEM;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
for (i = 0; i < 2; i++) {
cur = buf;
num = 0;
@@ -3671,7 +3687,7 @@ again:
if (rc == -ENODATA)
rc = 0;
- ctx->rc = (rc == 0) ? ctx->total_len : rc;
+ ctx->rc = (rc == 0) ? (ssize_t)ctx->total_len : rc;
mutex_unlock(&ctx->aio_mutex);
@@ -3890,7 +3906,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset)
* than it negotiated since it will refuse the read
* then.
*/
- if ((tcon->ses) && !(tcon->ses->capabilities &
+ if (!(tcon->ses->capabilities &
tcon->ses->server->vals->cap_large_files)) {
current_read_size = min_t(uint,
current_read_size, CIFSMaxBufSize);
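
cifs_down_write() polls with down_write_trylock() plus a short sleep, so a writer never queues behind readers on a lock_sem that some paths read-acquire twice. A userspace analogue using POSIX rwlocks (illustrative; assumes the same ~10ms backoff):

#include <pthread.h>
#include <unistd.h>

static void down_write_polling(pthread_rwlock_t *sem)
{
        while (pthread_rwlock_trywrlock(sem) != 0)
                usleep(10 * 1000);      /* back off ~10ms, like msleep(10) */
}
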
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f2042d1a1cc4..97d826a26478 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -163,7 +163,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
spin_lock(&inode->i_lock);
/* we do not want atime to be less than mtime, it broke some apps */
- if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime))
+ if (timespec64_compare(&fattr->cf_atime, &fattr->cf_mtime) < 0)
inode->i_atime = fattr->cf_mtime;
else
inode->i_atime = fattr->cf_atime;
@@ -1579,7 +1579,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, umode_t mode)
struct TCP_Server_Info *server;
char *full_path;
- cifs_dbg(FYI, "In cifs_mkdir, mode = 0x%hx inode = 0x%p\n",
+ cifs_dbg(FYI, "In cifs_mkdir, mode = %04ho inode = 0x%p\n",
mode, inode);
cifs_sb = CIFS_SB(inode->i_sb);
@@ -2003,6 +2003,7 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
struct inode *inode = d_inode(dentry);
struct super_block *sb = dentry->d_sb;
char *full_path = NULL;
+ int count = 0;
if (inode == NULL)
return -ENOENT;
@@ -2024,15 +2025,18 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
full_path, inode, inode->i_count.counter,
dentry, cifs_get_time(dentry), jiffies);
+again:
if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
else
rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
xid, NULL);
-
+ if (rc == -EAGAIN && count++ < 10)
+ goto again;
out:
kfree(full_path);
free_xid(xid);
+
return rc;
}
@@ -2252,6 +2256,15 @@ set_size_out:
if (rc == 0) {
cifsInode->server_eof = attrs->ia_size;
cifs_setsize(inode, attrs->ia_size);
+
+ /*
+ * The man page of truncate says if the size changed,
+ * then the st_ctime and st_mtime fields for the file
+ * are updated.
+ */
+ attrs->ia_ctime = attrs->ia_mtime = current_time(inode);
+ attrs->ia_valid |= ATTR_CTIME | ATTR_MTIME;
+
cifs_truncate_page(inode->i_mapping, inode->i_size);
}
@@ -2442,25 +2455,26 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
/*
* Attempt to flush data before changing attributes. We need to do
- * this for ATTR_SIZE and ATTR_MTIME for sure, and if we change the
- * ownership or mode then we may also need to do this. Here, we take
- * the safe way out and just do the flush on all setattr requests. If
- * the flush returns error, store it to report later and continue.
+ * this for ATTR_SIZE and ATTR_MTIME. If the flush of the data
+ * returns error, store it to report later and continue.
*
* BB: This should be smarter. Why bother flushing pages that
* will be truncated anyway? Also, should we error out here if
- * the flush returns error?
+ * the flush returns error? Do we need to check for ATTR_MTIME_SET flag?
*/
- rc = filemap_write_and_wait(inode->i_mapping);
- if (is_interrupt_error(rc)) {
- rc = -ERESTARTSYS;
- goto cifs_setattr_exit;
+ if (attrs->ia_valid & (ATTR_MTIME | ATTR_SIZE | ATTR_CTIME)) {
+ rc = filemap_write_and_wait(inode->i_mapping);
+ if (is_interrupt_error(rc)) {
+ rc = -ERESTARTSYS;
+ goto cifs_setattr_exit;
+ }
+ mapping_set_error(inode->i_mapping, rc);
}
- mapping_set_error(inode->i_mapping, rc);
rc = 0;
- if (attrs->ia_valid & ATTR_MTIME) {
+ if ((attrs->ia_valid & ATTR_MTIME) &&
+ !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) {
rc = cifs_get_writable_file(cifsInode, false, &wfile);
if (!rc) {
tcon = tlink_tcon(wfile->tlink);
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index ed92958e842d..657f409d4de0 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -117,10 +117,6 @@ static const struct smb_to_posix_error mapping_table_ERRSRV[] = {
{0, 0}
};
-static const struct smb_to_posix_error mapping_table_ERRHRD[] = {
- {0, 0}
-};
-
/*
* Convert a string containing text IPv4 or IPv6 address to binary form.
*
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 54bffb2a1786..060fbe1e005b 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -67,7 +67,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
goto out;
- if (oparms->tcon->use_resilient) {
+ if (oparms->tcon->use_resilient) {
/* default timeout is 0, servers pick default (120 seconds) */
nr_ioctl_req.Timeout =
cpu_to_le32(oparms->tcon->handle_timeout);
@@ -139,7 +139,7 @@ smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock,
cur = buf;
- down_write(&cinode->lock_sem);
+ cifs_down_write(&cinode->lock_sem);
list_for_each_entry_safe(li, tmp, &cfile->llist->locks, llist) {
if (flock->fl_start > li->offset ||
(flock->fl_start + length) <
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index e311f58dc1c8..2fc96f7923ee 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -509,15 +509,31 @@ cifs_ses_oplock_break(struct work_struct *work)
kfree(lw);
}
+static void
+smb2_queue_pending_open_break(struct tcon_link *tlink, __u8 *lease_key,
+ __le32 new_lease_state)
+{
+ struct smb2_lease_break_work *lw;
+
+ lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL);
+ if (!lw) {
+ cifs_put_tlink(tlink);
+ return;
+ }
+
+ INIT_WORK(&lw->lease_break, cifs_ses_oplock_break);
+ lw->tlink = tlink;
+ lw->lease_state = new_lease_state;
+ memcpy(lw->lease_key, lease_key, SMB2_LEASE_KEY_SIZE);
+ queue_work(cifsiod_wq, &lw->lease_break);
+}
+
static bool
-smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
- struct smb2_lease_break_work *lw)
+smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp)
{
- bool found;
__u8 lease_state;
struct list_head *tmp;
struct cifsFileInfo *cfile;
- struct cifs_pending_open *open;
struct cifsInodeInfo *cinode;
int ack_req = le32_to_cpu(rsp->Flags &
SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED);
@@ -556,22 +572,29 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
&cinode->flags);
cifs_queue_oplock_break(cfile);
- kfree(lw);
return true;
}
- found = false;
+ return false;
+}
+
+static struct cifs_pending_open *
+smb2_tcon_find_pending_open_lease(struct cifs_tcon *tcon,
+ struct smb2_lease_break *rsp)
+{
+ __u8 lease_state = le32_to_cpu(rsp->NewLeaseState);
+ int ack_req = le32_to_cpu(rsp->Flags &
+ SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED);
+ struct cifs_pending_open *open;
+ struct cifs_pending_open *found = NULL;
+
list_for_each_entry(open, &tcon->pending_opens, olist) {
if (memcmp(open->lease_key, rsp->LeaseKey,
SMB2_LEASE_KEY_SIZE))
continue;
if (!found && ack_req) {
- found = true;
- memcpy(lw->lease_key, open->lease_key,
- SMB2_LEASE_KEY_SIZE);
- lw->tlink = cifs_get_tlink(open->tlink);
- queue_work(cifsiod_wq, &lw->lease_break);
+ found = open;
}
cifs_dbg(FYI, "found in the pending open list\n");
@@ -592,14 +615,7 @@ smb2_is_valid_lease_break(char *buffer)
struct TCP_Server_Info *server;
struct cifs_ses *ses;
struct cifs_tcon *tcon;
- struct smb2_lease_break_work *lw;
-
- lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL);
- if (!lw)
- return false;
-
- INIT_WORK(&lw->lease_break, cifs_ses_oplock_break);
- lw->lease_state = rsp->NewLeaseState;
+ struct cifs_pending_open *open;
cifs_dbg(FYI, "Checking for lease break\n");
@@ -617,9 +633,25 @@ smb2_is_valid_lease_break(char *buffer)
spin_lock(&tcon->open_file_lock);
cifs_stats_inc(
&tcon->stats.cifs_stats.num_oplock_brks);
- if (smb2_tcon_has_lease(tcon, rsp, lw)) {
+ if (smb2_tcon_has_lease(tcon, rsp)) {
+ spin_unlock(&tcon->open_file_lock);
+ spin_unlock(&cifs_tcp_ses_lock);
+ return true;
+ }
+ open = smb2_tcon_find_pending_open_lease(tcon,
+ rsp);
+ if (open) {
+ __u8 lease_key[SMB2_LEASE_KEY_SIZE];
+ struct tcon_link *tlink;
+
+ tlink = cifs_get_tlink(open->tlink);
+ memcpy(lease_key, open->lease_key,
+ SMB2_LEASE_KEY_SIZE);
spin_unlock(&tcon->open_file_lock);
spin_unlock(&cifs_tcp_ses_lock);
+ smb2_queue_pending_open_break(tlink,
+ lease_key,
+ rsp->NewLeaseState);
return true;
}
spin_unlock(&tcon->open_file_lock);
@@ -639,7 +671,6 @@ smb2_is_valid_lease_break(char *buffer)
}
}
spin_unlock(&cifs_tcp_ses_lock);
- kfree(lw);
cifs_dbg(FYI, "Can not process lease break - no lease matched\n");
return false;
}
@@ -673,10 +704,10 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
spin_lock(&cifs_tcp_ses_lock);
list_for_each(tmp, &server->smb_ses_list) {
ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+
list_for_each(tmp1, &ses->tcon_list) {
tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
- cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
spin_lock(&tcon->open_file_lock);
list_for_each(tmp2, &tcon->openFileList) {
cfile = list_entry(tmp2, struct cifsFileInfo,
@@ -688,6 +719,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
continue;
cifs_dbg(FYI, "file id match, oplock break\n");
+ cifs_stats_inc(
+ &tcon->stats.cifs_stats.num_oplock_brks);
cinode = CIFS_I(d_inode(cfile->dentry));
spin_lock(&cfile->file_info_lock);
if (!CIFS_CACHE_WRITE(cinode) &&
@@ -720,9 +753,6 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
return true;
}
spin_unlock(&tcon->open_file_lock);
- spin_unlock(&cifs_tcp_ses_lock);
- cifs_dbg(FYI, "No matching file for oplock break\n");
- return true;
}
}
spin_unlock(&cifs_tcp_ses_lock);
@@ -744,36 +774,67 @@ smb2_cancelled_close_fid(struct work_struct *work)
kfree(cancelled);
}
+/* Caller should already have an extra reference to @tcon */
+static int
+__smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
+ __u64 volatile_fid)
+{
+ struct close_cancelled_open *cancelled;
+
+ cancelled = kzalloc(sizeof(*cancelled), GFP_ATOMIC);
+ if (!cancelled)
+ return -ENOMEM;
+
+ cancelled->fid.persistent_fid = persistent_fid;
+ cancelled->fid.volatile_fid = volatile_fid;
+ cancelled->tcon = tcon;
+ INIT_WORK(&cancelled->work, smb2_cancelled_close_fid);
+ WARN_ON(queue_work(cifsiod_wq, &cancelled->work) == false);
+
+ return 0;
+}
+
+int
+smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
+ __u64 volatile_fid)
+{
+ int rc;
+
+ cifs_dbg(FYI, "%s: tc_count=%d\n", __func__, tcon->tc_count);
+ spin_lock(&cifs_tcp_ses_lock);
+ tcon->tc_count++;
+ spin_unlock(&cifs_tcp_ses_lock);
+
+ rc = __smb2_handle_cancelled_close(tcon, persistent_fid, volatile_fid);
+ if (rc)
+ cifs_put_tcon(tcon);
+
+ return rc;
+}
+
int
smb2_handle_cancelled_mid(char *buffer, struct TCP_Server_Info *server)
{
struct smb2_sync_hdr *sync_hdr = (struct smb2_sync_hdr *)buffer;
struct smb2_create_rsp *rsp = (struct smb2_create_rsp *)buffer;
struct cifs_tcon *tcon;
- struct close_cancelled_open *cancelled;
+ int rc;
if (sync_hdr->Command != SMB2_CREATE ||
sync_hdr->Status != STATUS_SUCCESS)
return 0;
- cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL);
- if (!cancelled)
- return -ENOMEM;
-
tcon = smb2_find_smb_tcon(server, sync_hdr->SessionId,
sync_hdr->TreeId);
- if (!tcon) {
- kfree(cancelled);
+ if (!tcon)
return -ENOENT;
- }
- cancelled->fid.persistent_fid = rsp->PersistentFileId;
- cancelled->fid.volatile_fid = rsp->VolatileFileId;
- cancelled->tcon = tcon;
- INIT_WORK(&cancelled->work, smb2_cancelled_close_fid);
- queue_work(cifsiod_wq, &cancelled->work);
+ rc = __smb2_handle_cancelled_close(tcon, rsp->PersistentFileId,
+ rsp->VolatileFileId);
+ if (rc)
+ cifs_put_tcon(tcon);
- return 0;
+ return rc;
}
/**
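
smb2_handle_cancelled_close() pins the tcon before deferring the close to a work item and drops the pin only if the deferral itself fails; otherwise the work item releases it when done. A compilable sketch of that pin-then-defer pattern (all helpers are stubs, not kernel APIs):

#include <stdint.h>
#include <stdio.h>

struct tcon { int tc_count; };

static void take_ref(struct tcon *t) { t->tc_count++; }
static void put_ref(struct tcon *t)  { t->tc_count--; }

/* Stand-in for queue_work(); assume the deferral itself can fail. */
static int queue_close_work(struct tcon *t, uint64_t pfid, uint64_t vfid)
{
        (void)t; (void)pfid; (void)vfid;
        return 0;
}

static int defer_cancelled_close(struct tcon *t, uint64_t pfid, uint64_t vfid)
{
        int rc;

        take_ref(t);                    /* work item now owns a reference */
        rc = queue_close_work(t, pfid, vfid);
        if (rc)
                put_ref(t);             /* deferral failed: drop the pin */
        return rc;
}

int main(void)
{
        struct tcon tc = { .tc_count = 1 };

        defer_cancelled_close(&tc, 1, 2);
        printf("tc_count=%d\n", tc.tc_count);   /* 2 until the work runs */
        return 0;
}
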
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 0011e6bdaa9a..a71efe297244 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -151,13 +151,7 @@ smb2_get_credits_field(struct TCP_Server_Info *server, const int optype)
static unsigned int
smb2_get_credits(struct mid_q_entry *mid)
{
- struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)mid->resp_buf;
-
- if (mid->mid_state == MID_RESPONSE_RECEIVED
- || mid->mid_state == MID_RESPONSE_MALFORMED)
- return le16_to_cpu(shdr->CreditRequest);
-
- return 0;
+ return mid->credits_received;
}
static int
@@ -668,6 +662,11 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
+ if (!server->ops->new_lease_key)
+ return -EIO;
+
+ server->ops->new_lease_key(pfid);
+
memset(rqst, 0, sizeof(rqst));
resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER;
memset(rsp_iov, 0, sizeof(rsp_iov));
@@ -735,6 +734,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
/* close extra handle outside of crit sec */
SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid);
}
+ rc = 0;
goto oshr_free;
}
@@ -1084,7 +1084,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
void *data[1];
struct smb2_file_full_ea_info *ea = NULL;
struct kvec close_iov[1];
- int rc;
+ struct smb2_query_info_rsp *rsp;
+ int rc, used_len = 0;
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -1107,6 +1108,38 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
cifs_sb);
if (rc == -ENODATA)
goto sea_exit;
+ } else {
+ /* If we are adding an attribute we should first check
+ * if there will be enough space available to store
+ * the new EA. If not we should not add it since we
+ * would not be able to even read the EAs back.
+ */
+ rc = smb2_query_info_compound(xid, tcon, utf16_path,
+ FILE_READ_EA,
+ FILE_FULL_EA_INFORMATION,
+ SMB2_O_INFO_FILE,
+ CIFSMaxBufSize -
+ MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE,
+ &rsp_iov[1], &resp_buftype[1], cifs_sb);
+ if (rc == 0) {
+ rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
+ used_len = le32_to_cpu(rsp->OutputBufferLength);
+ }
+ free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+ resp_buftype[1] = CIFS_NO_BUFFER;
+ memset(&rsp_iov[1], 0, sizeof(rsp_iov[1]));
+ rc = 0;
+
+ /* Use a fudge factor of 256 bytes in case we collide
+ * with a different set_EAs command.
+ */
+ if (CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE - 256 <
+ used_len + ea_name_len + ea_value_len + 1) {
+ rc = -ENOSPC;
+ goto sea_exit;
+ }
}
}
@@ -1455,7 +1488,9 @@ smb2_ioctl_query_info(const unsigned int xid,
COMPOUND_FID, COMPOUND_FID,
qi.info_type, true, buffer,
qi.output_buffer_length,
- CIFSMaxBufSize);
+ CIFSMaxBufSize -
+ MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE);
}
} else if (qi.flags == PASSTHRU_QUERY_INFO) {
memset(&qi_iov, 0, sizeof(qi_iov));
@@ -2546,7 +2581,10 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
rc = SMB2_ioctl_init(tcon, &rqst[1], fid.persistent_fid,
fid.volatile_fid, FSCTL_GET_REPARSE_POINT,
- true /* is_fctl */, NULL, 0, CIFSMaxBufSize);
+ true /* is_fctl */, NULL, 0,
+ CIFSMaxBufSize -
+ MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE);
if (rc)
goto querty_exit;
@@ -2850,6 +2888,11 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid,
ses->Suid, offset, len);
+ /*
+ * We zero the range through ioctl, so we need to remove the page caches
+ * first, otherwise the data may be inconsistent with the server.
+ */
+ truncate_pagecache_range(inode, offset, offset + len - 1);
/* if file not oplocked can't be sure whether asking to extend size */
if (!CIFS_CACHE_READ(cifsi))
@@ -2918,6 +2961,12 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
return rc;
}
+ /*
+ * We implement the punch hole through ioctl, so we need to remove the page
+ * caches first, otherwise the data may be inconsistent with the server.
+ */
+ truncate_pagecache_range(inode, offset, offset + len - 1);
+
cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len);
fsctl_buf.FileOffset = cpu_to_le64(offset);
@@ -3140,7 +3189,7 @@ static int smb3_fiemap(struct cifs_tcon *tcon,
if (rc)
goto out;
- if (out_data_len < sizeof(struct file_allocated_range_buffer)) {
+ if (out_data_len && out_data_len < sizeof(struct file_allocated_range_buffer)) {
rc = -EINVAL;
goto out;
}
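
The new EA path first measures how much EA space is already used and refuses the add when the result could no longer be read back within the compounded query budget. A back-of-envelope version of the inequality (every constant below is an assumed placeholder, not necessarily the kernel's value):

#include <stdio.h>

int main(void)
{
        unsigned int max_buf = 16384;   /* assumed CIFSMaxBufSize */
        unsigned int create_rsp = 1024; /* assumed create rsp reserve */
        unsigned int close_rsp = 1024;  /* assumed close rsp reserve */
        unsigned int used = 14000;      /* EA bytes already on the file */
        unsigned int name_len = 10, value_len = 600;

        unsigned int budget = max_buf - create_rsp - close_rsp - 256;

        if (budget < used + name_len + value_len + 1)
                puts("-ENOSPC: adding this EA would make EAs unreadable");
        else
                puts("ok to add");
        return 0;
}
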
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index a221536db0de..78f4cd690dc2 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -252,7 +252,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
if (tcon == NULL)
return 0;
- if (smb2_command == SMB2_TREE_CONNECT || smb2_command == SMB2_IOCTL)
+ if (smb2_command == SMB2_TREE_CONNECT)
return 0;
if (tcon->tidStatus == CifsExiting) {
@@ -312,7 +312,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
if (server->tcpStatus != CifsNeedReconnect)
break;
- if (--retries)
+ if (retries && --retries)
continue;
/*
@@ -350,9 +350,14 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon)
}
rc = cifs_negotiate_protocol(0, tcon->ses);
- if (!rc && tcon->ses->need_reconnect)
+ if (!rc && tcon->ses->need_reconnect) {
rc = cifs_setup_session(0, tcon->ses, nls_codepage);
-
+ if ((rc == -EACCES) && !tcon->retry) {
+ rc = -EHOSTDOWN;
+ mutex_unlock(&tcon->ses->session_mutex);
+ goto failed;
+ }
+ }
if (rc || !tcon->need_reconnect) {
mutex_unlock(&tcon->ses->session_mutex);
goto out;
@@ -397,6 +402,7 @@ out:
case SMB2_SET_INFO:
rc = -EAGAIN;
}
+failed:
unload_nls(nls_codepage);
return rc;
}
@@ -426,16 +432,9 @@ fill_small_buf(__le16 smb2_command, struct cifs_tcon *tcon, void *buf,
* SMB information in the SMB header. If the return code is zero, this
* function must have filled in request_buf pointer.
*/
-static int
-smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
- void **request_buf, unsigned int *total_len)
+static int __smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
+ void **request_buf, unsigned int *total_len)
{
- int rc;
-
- rc = smb2_reconnect(smb2_command, tcon);
- if (rc)
- return rc;
-
/* BB eventually switch this to SMB2 specific small buf size */
if (smb2_command == SMB2_SET_INFO)
*request_buf = cifs_buf_get();
@@ -456,7 +455,31 @@ smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
cifs_stats_inc(&tcon->num_smbs_sent);
}
- return rc;
+ return 0;
+}
+
+static int smb2_plain_req_init(__le16 smb2_command, struct cifs_tcon *tcon,
+ void **request_buf, unsigned int *total_len)
+{
+ int rc;
+
+ rc = smb2_reconnect(smb2_command, tcon);
+ if (rc)
+ return rc;
+
+ return __smb2_plain_req_init(smb2_command, tcon, request_buf,
+ total_len);
+}
+
+static int smb2_ioctl_req_init(u32 opcode, struct cifs_tcon *tcon,
+ void **request_buf, unsigned int *total_len)
+{
+ /* Skip reconnect only for FSCTL_VALIDATE_NEGOTIATE_INFO IOCTLs */
+ if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO) {
+ return __smb2_plain_req_init(SMB2_IOCTL, tcon, request_buf,
+ total_len);
+ }
+ return smb2_plain_req_init(SMB2_IOCTL, tcon, request_buf, total_len);
}
/* For explanation of negotiate contexts see MS-SMB2 section 2.2.3.1 */
@@ -1275,6 +1298,8 @@ SMB2_auth_kerberos(struct SMB2_sess_data *sess_data)
spnego_key = cifs_get_spnego_key(ses);
if (IS_ERR(spnego_key)) {
rc = PTR_ERR(spnego_key);
+ if (rc == -ENOKEY)
+ cifs_dbg(VFS, "Verify user has a krb5 ticket and keyutils is installed\n");
spnego_key = NULL;
goto out;
}
@@ -2547,7 +2572,7 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
int rc;
char *in_data_buf;
- rc = smb2_plain_req_init(SMB2_IOCTL, tcon, (void **) &req, &total_len);
+ rc = smb2_ioctl_req_init(opcode, tcon, (void **) &req, &total_len);
if (rc)
return rc;
@@ -2610,7 +2635,9 @@ SMB2_ioctl_init(struct cifs_tcon *tcon, struct smb_rqst *rqst,
* response size smaller.
*/
req->MaxOutputResponse = cpu_to_le32(max_response_size);
-
+ req->sync_hdr.CreditCharge =
+ cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size),
+ SMB2_MAX_BUFFER_SIZE));
if (is_fsctl)
req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
else
@@ -2852,7 +2879,21 @@ int
SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid)
{
- return SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0);
+ int rc;
+ int tmp_rc;
+
+ rc = SMB2_close_flags(xid, tcon, persistent_fid, volatile_fid, 0);
+
+ /* retry close in a worker thread if this one is interrupted */
+ if (rc == -EINTR) {
+ tmp_rc = smb2_handle_cancelled_close(tcon, persistent_fid,
+ volatile_fid);
+ if (tmp_rc)
+ cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n",
+ persistent_fid, tmp_rc);
+ }
+
+ return rc;
}
int
@@ -3645,6 +3686,9 @@ smb2_writev_callback(struct mid_q_entry *mid)
wdata->cfile->fid.persistent_fid,
tcon->tid, tcon->ses->Suid, wdata->offset,
wdata->bytes, wdata->result);
+ if (wdata->result == -ENOSPC)
+ printk_once(KERN_WARNING "Out of space writing to %s\n",
+ tcon->treeName);
} else
trace_smb3_write_done(0 /* no xid */,
wdata->cfile->fid.persistent_fid,
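
The ioctl path now sets CreditCharge from the larger of the input payload and the maximum response, in SMB2_MAX_BUFFER_SIZE units. A quick arithmetic check of that formula (the 64KiB unit is assumed):

#include <stdio.h>

#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
        unsigned int unit = 65536;              /* assumed SMB2_MAX_BUFFER_SIZE */
        unsigned int indatalen = 24;            /* tiny FSCTL input */
        unsigned int max_response = 1048576;    /* e.g. a 1MiB response budget */
        unsigned int bigger = indatalen > max_response ? indatalen : max_response;

        printf("credit charge: %u\n", DIV_ROUND_UP(bigger, unit)); /* 16 */
        return 0;
}
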
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 858353d20c39..06e4bfac68cc 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -816,6 +816,7 @@ struct create_durable_handle_reconnect_v2 {
struct create_context ccontext;
__u8 Name[8];
struct durable_reconnect_context_v2 dcontext;
+ __u8 Pad[4];
} __packed;
/* See MS-SMB2 2.2.13.2.5 */
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 52df125e9189..2e07c2917947 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -203,6 +203,9 @@ extern int SMB2_set_compression(const unsigned int xid, struct cifs_tcon *tcon,
extern int SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
const u64 persistent_fid, const u64 volatile_fid,
const __u8 oplock_level);
+extern int smb2_handle_cancelled_close(struct cifs_tcon *tcon,
+ __u64 persistent_fid,
+ __u64 volatile_fid);
extern int smb2_handle_cancelled_mid(char *buffer,
struct TCP_Server_Info *server);
void smb2_cancelled_close_fid(struct work_struct *work);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index d1181572758b..b9ccef97406d 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -597,6 +597,8 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
* The default is for the mid to be synchronous, so the
* default callback just wakes up the current task.
*/
+ get_task_struct(current);
+ temp->creator = current;
temp->callback = cifs_wake_up_task;
temp->callback_data = current;
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index cd07e5301d42..8d1dcf842f28 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -1069,7 +1069,7 @@ static int smbd_post_send_data(
if (n_vec > SMBDIRECT_MAX_SGE) {
cifs_dbg(VFS, "Can't fit data to SGL, n_vec=%d\n", n_vec);
- return -ENOMEM;
+ return -EINVAL;
}
sg_init_table(sgl, n_vec);
@@ -1476,6 +1476,7 @@ void smbd_destroy(struct TCP_Server_Info *server)
info->transport_status = SMBD_DESTROYED;
destroy_workqueue(info->workqueue);
+ log_rdma_event(INFO, "rdma session destroyed\n");
kfree(info);
}
@@ -1505,8 +1506,9 @@ create_conn:
log_rdma_event(INFO, "creating rdma session\n");
server->smbd_conn = smbd_get_connection(
server, (struct sockaddr *) &server->dstaddr);
- log_rdma_event(INFO, "created rdma session info=%p\n",
- server->smbd_conn);
+
+ if (server->smbd_conn)
+ cifs_dbg(VFS, "RDMA transport re-established\n");
return server->smbd_conn ? 0 : -ENOENT;
}
@@ -1968,7 +1970,7 @@ read_rfc1002_done:
if (info->transport_status != SMBD_CONNECTED) {
log_read(ERR, "disconnected\n");
- return 0;
+ return -ECONNABORTED;
}
goto again;
@@ -2267,12 +2269,7 @@ static void smbd_mr_recovery_work(struct work_struct *work)
int rc;
list_for_each_entry(smbdirect_mr, &info->mr_list, list) {
- if (smbdirect_mr->state == MR_INVALIDATED)
- ib_dma_unmap_sg(
- info->id->device, smbdirect_mr->sgl,
- smbdirect_mr->sgl_count,
- smbdirect_mr->dir);
- else if (smbdirect_mr->state == MR_ERROR) {
+ if (smbdirect_mr->state == MR_ERROR) {
/* recover this MR entry */
rc = ib_dereg_mr(smbdirect_mr->mr);
@@ -2600,11 +2597,20 @@ int smbd_deregister_mr(struct smbd_mr *smbdirect_mr)
*/
smbdirect_mr->state = MR_INVALIDATED;
- /*
- * Schedule the work to do MR recovery for future I/Os
- * MR recovery is slow and we don't want it to block the current I/O
- */
- queue_work(info->workqueue, &info->mr_recovery_work);
+ if (smbdirect_mr->state == MR_INVALIDATED) {
+ ib_dma_unmap_sg(
+ info->id->device, smbdirect_mr->sgl,
+ smbdirect_mr->sgl_count,
+ smbdirect_mr->dir);
+ smbdirect_mr->state = MR_READY;
+ if (atomic_inc_return(&info->mr_ready_count) == 1)
+ wake_up_interruptible(&info->wait_mr);
+ } else
+ /*
+ * Schedule the work to do MR recovery for future I/Os; MR
+ * recovery is slow and we don't want it to block the current I/O.
+ */
+ queue_work(info->workqueue, &info->mr_recovery_work);
done:
if (atomic_dec_and_test(&info->mr_used_count))
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 60661b3f983a..44079b1a0456 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -76,6 +76,8 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
* The default is for the mid to be synchronous, so the
* default callback just wakes up the current task.
*/
+ get_task_struct(current);
+ temp->creator = current;
temp->callback = cifs_wake_up_task;
temp->callback_data = current;
@@ -86,29 +88,21 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server)
static void _cifs_mid_q_entry_release(struct kref *refcount)
{
- struct mid_q_entry *mid = container_of(refcount, struct mid_q_entry,
- refcount);
-
- mempool_free(mid, cifs_mid_poolp);
-}
-
-void cifs_mid_q_entry_release(struct mid_q_entry *midEntry)
-{
- spin_lock(&GlobalMid_Lock);
- kref_put(&midEntry->refcount, _cifs_mid_q_entry_release);
- spin_unlock(&GlobalMid_Lock);
-}
-
-void
-DeleteMidQEntry(struct mid_q_entry *midEntry)
-{
+ struct mid_q_entry *midEntry =
+ container_of(refcount, struct mid_q_entry, refcount);
#ifdef CONFIG_CIFS_STATS2
__le16 command = midEntry->server->vals->lock_cmd;
__u16 smb_cmd = le16_to_cpu(midEntry->command);
unsigned long now;
unsigned long roundtrip_time;
- struct TCP_Server_Info *server = midEntry->server;
#endif
+ struct TCP_Server_Info *server = midEntry->server;
+
+ if (midEntry->resp_buf && (midEntry->mid_flags & MID_WAIT_CANCELLED) &&
+ midEntry->mid_state == MID_RESPONSE_RECEIVED &&
+ server->ops->handle_cancelled_mid)
+ server->ops->handle_cancelled_mid(midEntry->resp_buf, server);
+
midEntry->mid_state = MID_FREE;
atomic_dec(&midCount);
if (midEntry->large_buf)
@@ -166,6 +160,20 @@ DeleteMidQEntry(struct mid_q_entry *midEntry)
}
}
#endif
+ put_task_struct(midEntry->creator);
+
+ mempool_free(midEntry, cifs_mid_poolp);
+}
+
+void cifs_mid_q_entry_release(struct mid_q_entry *midEntry)
+{
+ spin_lock(&GlobalMid_Lock);
+ kref_put(&midEntry->refcount, _cifs_mid_q_entry_release);
+ spin_unlock(&GlobalMid_Lock);
+}
+
+void DeleteMidQEntry(struct mid_q_entry *midEntry)
+{
cifs_mid_q_entry_release(midEntry);
}
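The restructured release path above is the standard kref idiom: the release callback receives the embedded kref, recovers the enclosing object with container_of(), and kref_put() guarantees the release runs exactly once. A minimal sketch of the idiom with hypothetical mid_like names (not cifs code):

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct mid_like {
	struct kref refcount;
	/* ... payload ... */
};

static struct mid_like *mid_like_alloc(void)
{
	struct mid_like *m = kzalloc(sizeof(*m), GFP_NOFS);

	if (m)
		kref_init(&m->refcount);	/* refcount starts at 1 */
	return m;
}

static void mid_like_release(struct kref *ref)
{
	/* recover the enclosing object from its embedded kref */
	struct mid_like *m = container_of(ref, struct mid_like, refcount);

	kfree(m);
}

static void mid_like_put(struct mid_like *m)
{
	/* the release callback runs when the last reference drops */
	kref_put(&m->refcount, mid_like_release);
}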
@@ -173,8 +181,10 @@ void
cifs_delete_mid(struct mid_q_entry *mid)
{
spin_lock(&GlobalMid_Lock);
- list_del_init(&mid->qhead);
- mid->mid_flags |= MID_DELETED;
+ if (!(mid->mid_flags & MID_DELETED)) {
+ list_del_init(&mid->qhead);
+ mid->mid_flags |= MID_DELETED;
+ }
spin_unlock(&GlobalMid_Lock);
DeleteMidQEntry(mid);
@@ -318,8 +328,11 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
int val = 1;
__be32 rfc1002_marker;
- if (cifs_rdma_enabled(server) && server->smbd_conn) {
- rc = smbd_send(server, num_rqst, rqst);
+ if (cifs_rdma_enabled(server)) {
+ /* return -EAGAIN when connecting or reconnecting */
+ rc = -EAGAIN;
+ if (server->smbd_conn)
+ rc = smbd_send(server, num_rqst, rqst);
goto smbd_done;
}
@@ -453,7 +466,7 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
struct smb_rqst *rqst, int flags)
{
struct kvec iov;
- struct smb2_transform_hdr tr_hdr;
+ struct smb2_transform_hdr *tr_hdr;
struct smb_rqst cur_rqst[MAX_COMPOUND];
int rc;
@@ -463,28 +476,34 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
if (num_rqst > MAX_COMPOUND - 1)
return -ENOMEM;
- memset(&cur_rqst[0], 0, sizeof(cur_rqst));
- memset(&iov, 0, sizeof(iov));
- memset(&tr_hdr, 0, sizeof(tr_hdr));
-
- iov.iov_base = &tr_hdr;
- iov.iov_len = sizeof(tr_hdr);
- cur_rqst[0].rq_iov = &iov;
- cur_rqst[0].rq_nvec = 1;
-
if (!server->ops->init_transform_rq) {
cifs_dbg(VFS, "Encryption requested but transform callback "
"is missing\n");
return -EIO;
}
+ tr_hdr = kmalloc(sizeof(*tr_hdr), GFP_NOFS);
+ if (!tr_hdr)
+ return -ENOMEM;
+
+ memset(&cur_rqst[0], 0, sizeof(cur_rqst));
+ memset(&iov, 0, sizeof(iov));
+ memset(tr_hdr, 0, sizeof(*tr_hdr));
+
+ iov.iov_base = tr_hdr;
+ iov.iov_len = sizeof(*tr_hdr);
+ cur_rqst[0].rq_iov = &iov;
+ cur_rqst[0].rq_nvec = 1;
+
rc = server->ops->init_transform_rq(server, num_rqst + 1,
&cur_rqst[0], rqst);
if (rc)
- return rc;
+ goto out;
rc = __smb_send_rqst(server, num_rqst + 1, &cur_rqst[0]);
smb3_free_compound_rqst(num_rqst, &cur_rqst[1]);
+out:
+ kfree(tr_hdr);
return rc;
}
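The hunk above moves the transform header from the stack to the heap, and the kmalloc'd copy is released on every exit path via the new out label. A generic sketch of that allocate-then-single-exit shape (hypothetical names; GFP_NOFS as in the patch):

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

struct big_hdr {
	u8 bytes[128];	/* stand-in for a transform header */
};

static int send_with_heap_hdr(void)
{
	struct big_hdr *hdr;
	int rc = 0;

	/* keep buffers visible to the transmit path off the stack */
	hdr = kmalloc(sizeof(*hdr), GFP_NOFS);
	if (!hdr)
		return -ENOMEM;
	memset(hdr, 0, sizeof(*hdr));

	/* ... build and transmit the request, setting rc on failure ... */

	kfree(hdr);	/* single exit point: freed on every path */
	return rc;
}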
@@ -509,7 +528,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits,
const int timeout, const int flags,
unsigned int *instance)
{
- int rc;
+ long rc;
int *credits;
int optype;
long int t;
@@ -868,7 +887,10 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server)
rc = -EHOSTDOWN;
break;
default:
- list_del_init(&mid->qhead);
+ if (!(mid->mid_flags & MID_DELETED)) {
+ list_del_init(&mid->qhead);
+ mid->mid_flags |= MID_DELETED;
+ }
cifs_dbg(VFS, "%s: invalid mid state mid=%llu state=%d\n",
__func__, mid->mid, mid->mid_state);
rc = -EIO;
@@ -1109,8 +1131,8 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
midQ[i]->mid, le16_to_cpu(midQ[i]->command));
send_cancel(ses->server, &rqst[i], midQ[i]);
spin_lock(&GlobalMid_Lock);
+ midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
if (midQ[i]->mid_state == MID_REQUEST_SUBMITTED) {
- midQ[i]->mid_flags |= MID_WAIT_CANCELLED;
midQ[i]->callback = cifs_cancelled_callback;
cancelled_mid[i] = true;
credits[i].value = 0;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a7ec2d3dff92..e0226b2138d6 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1032,10 +1032,11 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
#endif
case FICLONE:
+ goto do_ioctl;
case FICLONERANGE:
case FIDEDUPERANGE:
case FS_IOC_FIEMAP:
- goto do_ioctl;
+ goto found_handler;
case FIBMAP:
case FIGETBSZ:
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index 91eac6c55e07..f3881e4caedd 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -143,11 +143,42 @@ int configfs_symlink(struct inode *dir, struct dentry *dentry, const char *symna
!type->ct_item_ops->allow_link)
goto out_put;
+ /*
+ * This is really sick. What they wanted was a hybrid of
+ * link(2) and symlink(2) - they wanted the target resolved
+ * at syscall time (as link(2) would've done), be a directory
+ * (which link(2) would've refused to do) *AND* be a deep
+ * fucking magic, making the target busy from rmdir POV.
+ * symlink(2) is nothing of that sort, and the locking it
+ * gets matches the normal symlink(2) semantics. Without
+ * attempts to resolve the target (which might very well
+ * not even exist yet) done prior to locking the parent
+ * directory. This perversion, OTOH, needs to resolve
+ * the target, which would lead to obvious deadlocks if
+ * attempted with any directories locked.
+ *
+ * Unfortunately, that garbage is userland ABI and we should've
+ * said "no" back in 2005. Too late now, so we get to
+ * play very ugly games with locking.
+ *
+ * Try *ANYTHING* of that sort in new code, and you will
+ * really regret it. Just ask yourself - what could a BOFH
+ * do to me and do I want to find it out first-hand?
+ *
+ * AV, a thoroughly annoyed bastard.
+ */
+ inode_unlock(dir);
ret = get_target(symname, &path, &target_item, dentry->d_sb);
+ inode_lock(dir);
if (ret)
goto out_put;
- ret = type->ct_item_ops->allow_link(parent_item, target_item);
+ if (dentry->d_inode || d_unhashed(dentry))
+ ret = -EEXIST;
+ else
+ ret = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+ if (!ret)
+ ret = type->ct_item_ops->allow_link(parent_item, target_item);
if (!ret) {
mutex_lock(&configfs_symlink_mutex);
ret = create_link(parent_item, target_item, dentry);
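The dance above - unlock the parent, resolve the target, relock, then revalidate - is the only safe ordering once the target lookup may itself take directory locks. After relocking, nothing observed before the unlock can be trusted, so both the dentry state and the permission check are redone. A condensed sketch of that revalidate-after-relock pattern (hypothetical helper, not the configfs code):

#include <linux/dcache.h>
#include <linux/fs.h>

static int op_with_unlock_window(struct inode *dir, struct dentry *dentry)
{
	inode_unlock(dir);
	/* ... slow work that must not hold the directory lock ... */
	inode_lock(dir);

	/* the world may have changed while the lock was dropped */
	if (dentry->d_inode || d_unhashed(dentry))
		return -EEXIST;
	return inode_permission(dir, MAY_WRITE | MAY_EXEC);
}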
diff --git a/fs/coredump.c b/fs/coredump.c
index e42e17e55bfd..b24396625848 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -753,6 +753,14 @@ void do_coredump(const kernel_siginfo_t *siginfo)
if (displaced)
put_files_struct(displaced);
if (!dump_interrupted()) {
+ /*
+ * umh disabled with CONFIG_STATIC_USERMODEHELPER_PATH="" would
+ * have this set to NULL.
+ */
+ if (!cprm.file) {
+ pr_info("Core dump to |%s disabled\n", cn.corename);
+ goto close_fail;
+ }
file_start_write(cprm.file);
core_dumped = binfmt->core_dump(&cprm);
file_end_write(cprm.file);
diff --git a/fs/dax.c b/fs/dax.c
index 36538f5ecc31..aaa16945ef8a 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1208,6 +1208,9 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
lockdep_assert_held(&inode->i_rwsem);
}
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ flags |= IOMAP_NOWAIT;
+
while (iov_iter_count(iter)) {
ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops,
iter, dax_iomap_actor);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index ddd708b09fa1..6cfe9c175977 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -147,8 +147,13 @@ static int open_proxy_open(struct inode *inode, struct file *filp)
return r == -EIO ? -ENOENT : r;
real_fops = debugfs_real_fops(filp);
- real_fops = fops_get(real_fops);
- if (!real_fops) {
+ if (!fops_get(real_fops)) {
+#ifdef CONFIG_MODULES
+ if (real_fops->owner &&
+ real_fops->owner->state == MODULE_STATE_GOING)
+ goto out;
+#endif
+
/* Huh? Module did not clean up after itself at exit? */
WARN(1, "debugfs file owner did not clean up at exit: %pd",
dentry);
@@ -272,8 +277,13 @@ static int full_proxy_open(struct inode *inode, struct file *filp)
return r == -EIO ? -ENOENT : r;
real_fops = debugfs_real_fops(filp);
- real_fops = fops_get(real_fops);
- if (!real_fops) {
+ if (!fops_get(real_fops)) {
+#ifdef CONFIG_MODULES
+ if (real_fops->owner &&
+ real_fops->owner->state == MODULE_STATE_GOING)
+ goto out;
+#endif
+
/* Huh? Module did not cleanup after itself at exit? */
WARN(1, "debugfs file owner did not clean up at exit: %pd",
dentry);
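fops_get() fails when try_module_get() on the owner fails, which is exactly what happens while the owning module is on its way out; the new branch turns that window into a quiet failure instead of a WARN. A rough sketch of the proxy-open shape (hypothetical and simplified, not the debugfs code):

#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/module.h>

static int proxy_open_sketch(struct file *filp,
			     const struct file_operations *real_fops)
{
	if (!fops_get(real_fops)) {
#ifdef CONFIG_MODULES
		/* owner is exiting: fail quietly instead of WARNing */
		if (real_fops->owner &&
		    real_fops->owner->state == MODULE_STATE_GOING)
			return -ENOENT;
#endif
		WARN(1, "file owner did not clean up at exit\n");
		return -ENXIO;
	}
	filp->f_op = real_fops;	/* proceed with the real operations */
	return 0;
}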
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index da1173a0b274..db09bed48153 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -97,7 +97,6 @@ do { \
__LINE__, __FILE__, #x, jiffies); \
{do} \
printk("\n"); \
- BUG(); \
panic("DLM: Record message above and reboot.\n"); \
} \
}
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index 4c2c85a223ac..3dd6e0189d39 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -631,6 +631,9 @@ static int new_lockspace(const char *name, const char *cluster,
wait_event(ls->ls_recover_lock_wait,
test_bit(LSFL_RECOVER_LOCK, &ls->ls_flags));
+ /* let kobject handle freeing of ls if there's an error */
+ do_unreg = 1;
+
ls->ls_kobj.kset = dlm_kset;
error = kobject_init_and_add(&ls->ls_kobj, &dlm_ktype, NULL,
"%s", ls->ls_name);
@@ -638,9 +641,6 @@ static int new_lockspace(const char *name, const char *cluster,
goto out_recoverd;
kobject_uevent(&ls->ls_kobj, KOBJ_ADD);
- /* let kobject handle freeing of ls if there's an error */
- do_unreg = 1;
-
/* This uevent triggers dlm_controld in userspace to add us to the
group of nodes that are members of this lockspace (managed by the
cluster infrastructure.) Once it's done that, it tells us who the
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index d31b6c72b476..dc1a1d5d825b 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -35,11 +35,11 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
spin_unlock(&inode->i_lock);
spin_unlock(&sb->s_inode_list_lock);
- cond_resched();
invalidate_mapping_pages(inode->i_mapping, 0, -1);
iput(toput_inode);
toput_inode = inode;
+ cond_resched();
spin_lock(&sb->s_inode_list_lock);
}
spin_unlock(&sb->s_inode_list_lock);
diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c
index 54bfa71ffdad..9a85905ccf98 100644
--- a/fs/ecryptfs/crypto.c
+++ b/fs/ecryptfs/crypto.c
@@ -310,8 +310,10 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat,
struct extent_crypt_result ecr;
int rc = 0;
- BUG_ON(!crypt_stat || !crypt_stat->tfm
- || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED));
+ if (!crypt_stat || !crypt_stat->tfm
+ || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED))
+ return -EINVAL;
+
if (unlikely(ecryptfs_verbosity > 0)) {
ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n",
crypt_stat->key_size);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1e994d780f37..8d27fd208f67 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -128,13 +128,20 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
struct inode *inode)
{
struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- struct inode *lower_dir_inode = ecryptfs_inode_to_lower(dir);
struct dentry *lower_dir_dentry;
+ struct inode *lower_dir_inode;
int rc;
- dget(lower_dentry);
- lower_dir_dentry = lock_parent(lower_dentry);
- rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
+ lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
+ lower_dir_inode = d_inode(lower_dir_dentry);
+ inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT);
+ dget(lower_dentry); // don't even try to make the lower negative
+ if (lower_dentry->d_parent != lower_dir_dentry)
+ rc = -EINVAL;
+ else if (d_unhashed(lower_dentry))
+ rc = -EINVAL;
+ else
+ rc = vfs_unlink(lower_dir_inode, lower_dentry, NULL);
if (rc) {
printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc);
goto out_unlock;
@@ -142,10 +149,11 @@ static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry,
fsstack_copy_attr_times(dir, lower_dir_inode);
set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink);
inode->i_ctime = dir->i_ctime;
- d_drop(dentry);
out_unlock:
- unlock_dir(lower_dir_dentry);
dput(lower_dentry);
+ inode_unlock(lower_dir_inode);
+ if (!rc)
+ d_drop(dentry);
return rc;
}
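The rewritten unlink locks the lower directory explicitly and then revalidates the child before acting on it: the child must still hang off that parent and must still be hashed, otherwise a rename or unlink raced with us. The reusable shape, as a sketch with hypothetical names:

#include <linux/dcache.h>
#include <linux/fs.h>

static int unlink_lower_sketch(struct dentry *lower_dentry,
			       struct dentry *lower_dir_dentry)
{
	struct inode *lower_dir = d_inode(lower_dir_dentry);
	int rc;

	inode_lock_nested(lower_dir, I_MUTEX_PARENT);
	dget(lower_dentry);	/* keep the lower dentry pinned */
	if (lower_dentry->d_parent != lower_dir_dentry ||
	    d_unhashed(lower_dentry))
		rc = -EINVAL;	/* raced with rename/unlink below us */
	else
		rc = vfs_unlink(lower_dir, lower_dentry, NULL);
	dput(lower_dentry);
	inode_unlock(lower_dir);
	return rc;
}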
@@ -311,9 +319,9 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode)
static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
struct dentry *lower_dentry)
{
- struct inode *inode, *lower_inode = d_inode(lower_dentry);
+ struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent);
+ struct inode *inode, *lower_inode;
struct ecryptfs_dentry_info *dentry_info;
- struct vfsmount *lower_mnt;
int rc = 0;
dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL);
@@ -322,16 +330,23 @@ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry,
return ERR_PTR(-ENOMEM);
}
- lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent));
fsstack_copy_attr_atime(d_inode(dentry->d_parent),
- d_inode(lower_dentry->d_parent));
+ d_inode(path->dentry));
BUG_ON(!d_count(lower_dentry));
ecryptfs_set_dentry_private(dentry, dentry_info);
- dentry_info->lower_path.mnt = lower_mnt;
+ dentry_info->lower_path.mnt = mntget(path->mnt);
dentry_info->lower_path.dentry = lower_dentry;
- if (d_really_is_negative(lower_dentry)) {
+ /*
+ * A negative dentry can go positive under us here - its parent is
+ * not locked. That's OK, and it could happen just as we return from
+ * ecryptfs_lookup() anyway. We just need to be careful to fetch
+ * ->d_inode only once - it's not stable here.
+ */
+ lower_inode = READ_ONCE(lower_dentry->d_inode);
+
+ if (!lower_inode) {
/* We want to add because we couldn't find in lower */
d_add(dentry, NULL);
return NULL;
@@ -512,22 +527,30 @@ static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry)
{
struct dentry *lower_dentry;
struct dentry *lower_dir_dentry;
+ struct inode *lower_dir_inode;
int rc;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
- dget(dentry);
- lower_dir_dentry = lock_parent(lower_dentry);
- dget(lower_dentry);
- rc = vfs_rmdir(d_inode(lower_dir_dentry), lower_dentry);
- dput(lower_dentry);
- if (!rc && d_really_is_positive(dentry))
+ lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent);
+ lower_dir_inode = d_inode(lower_dir_dentry);
+
+ inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT);
+ dget(lower_dentry); // don't even try to make the lower negative
+ if (lower_dentry->d_parent != lower_dir_dentry)
+ rc = -EINVAL;
+ else if (d_unhashed(lower_dentry))
+ rc = -EINVAL;
+ else
+ rc = vfs_rmdir(lower_dir_inode, lower_dentry);
+ if (!rc) {
clear_nlink(d_inode(dentry));
- fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
- set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
- unlock_dir(lower_dir_dentry);
+ fsstack_copy_attr_times(dir, lower_dir_inode);
+ set_nlink(dir, lower_dir_inode->i_nlink);
+ }
+ dput(lower_dentry);
+ inode_unlock(lower_dir_inode);
if (!rc)
d_drop(dentry);
- dput(dentry);
return rc;
}
@@ -565,20 +588,22 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct dentry *lower_new_dentry;
struct dentry *lower_old_dir_dentry;
struct dentry *lower_new_dir_dentry;
- struct dentry *trap = NULL;
+ struct dentry *trap;
struct inode *target_inode;
if (flags)
return -EINVAL;
+ lower_old_dir_dentry = ecryptfs_dentry_to_lower(old_dentry->d_parent);
+ lower_new_dir_dentry = ecryptfs_dentry_to_lower(new_dentry->d_parent);
+
lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry);
lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry);
- dget(lower_old_dentry);
- dget(lower_new_dentry);
- lower_old_dir_dentry = dget_parent(lower_old_dentry);
- lower_new_dir_dentry = dget_parent(lower_new_dentry);
+
target_inode = d_inode(new_dentry);
+
trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ dget(lower_new_dentry);
rc = -EINVAL;
if (lower_old_dentry->d_parent != lower_old_dir_dentry)
goto out_lock;
@@ -606,11 +631,8 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (new_dir != old_dir)
fsstack_copy_attr_all(old_dir, d_inode(lower_old_dir_dentry));
out_lock:
- unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
- dput(lower_new_dir_dentry);
- dput(lower_old_dir_dentry);
dput(lower_new_dentry);
- dput(lower_old_dentry);
+ unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
return rc;
}
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index 9536e592e25a..7cd6db598e9f 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -1303,7 +1303,7 @@ parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat,
printk(KERN_WARNING "Tag 1 packet contains key larger "
"than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n");
rc = -EINVAL;
- goto out;
+ goto out_free;
}
memcpy((*new_auth_tok)->session_key.encrypted_key,
&data[(*packet_size)], (body_size - (ECRYPTFS_SIG_SIZE + 2)));
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index d668e60b85b5..c05ca39aa449 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -379,6 +379,7 @@ int __init ecryptfs_init_messaging(void)
* ecryptfs_message_buf_len),
GFP_KERNEL);
if (!ecryptfs_msg_ctx_arr) {
+ kfree(ecryptfs_daemon_hash);
rc = -ENOMEM;
goto out;
}
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8aa0ea8c55e8..78e41c7c3d05 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -24,6 +24,8 @@
#include <linux/seq_file.h>
#include <linux/idr.h>
+DEFINE_PER_CPU(int, eventfd_wake_count);
+
static DEFINE_IDA(eventfd_ida);
struct eventfd_ctx {
@@ -60,12 +62,25 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
unsigned long flags;
+ /*
+ * Deadlock or stack overflow issues can happen if we recurse here
+ * through waitqueue wakeup handlers. If the caller uses potentially
+ * nested waitqueues with custom wakeup handlers, then it should
+ * check eventfd_signal_count() before calling this function. If
+ * it returns true, the eventfd_signal() call should be deferred to a
+ * safe context.
+ */
+ if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+ return 0;
+
spin_lock_irqsave(&ctx->wqh.lock, flags);
+ this_cpu_inc(eventfd_wake_count);
if (ULLONG_MAX - ctx->count < n)
n = ULLONG_MAX - ctx->count;
ctx->count += n;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN);
+ this_cpu_dec(eventfd_wake_count);
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
return n;
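The per-CPU counter above is a recursion guard: eventfd_signal() bumps it around the wakeup, and a nested call on the same CPU sees a nonzero depth and bails out. The pattern in isolation (hypothetical names; like eventfd, it assumes the increment and decrement happen with interrupts disabled so the task cannot migrate between CPUs):

#include <linux/bug.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(int, guard_depth);

static int guarded_signal(void)
{
	/* refuse to recurse through our own wakeup path on this CPU */
	if (WARN_ON_ONCE(this_cpu_read(guard_depth)))
		return 0;

	this_cpu_inc(guard_depth);
	/* ... wake waiters; their handlers may call back into us ... */
	this_cpu_dec(guard_depth);
	return 1;
}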
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 4c74c768ae43..70ecf02cdca2 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1827,7 +1827,6 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
{
int res = 0, eavail, timed_out = 0;
u64 slack = 0;
- bool waiter = false;
wait_queue_entry_t wait;
ktime_t expires, *to = NULL;
@@ -1872,21 +1871,23 @@ fetch_events:
*/
ep_reset_busy_poll_napi_id(ep);
- /*
- * We don't have any available event to return to the caller. We need
- * to sleep here, and we will be woken by ep_poll_callback() when events
- * become available.
- */
- if (!waiter) {
- waiter = true;
- init_waitqueue_entry(&wait, current);
-
- spin_lock_irq(&ep->wq.lock);
+ do {
+ /*
+ * Internally init_wait() uses autoremove_wake_function(),
+ * so the wait entry is removed from the wait queue on each
+ * wakeup. Why is that important? With several waiters,
+ * each new wakeup hits the next waiter, giving it the
+ * chance to harvest new events; otherwise a wakeup can be
+ * lost. This is also good performance-wise: on the normal
+ * wakeup path there is no need to call __remove_wait_queue()
+ * explicitly, so ep->lock is not taken, which would stall
+ * event delivery.
+ */
+ init_wait(&wait);
+ write_lock_irq(&ep->lock);
__add_wait_queue_exclusive(&ep->wq, &wait);
- spin_unlock_irq(&ep->wq.lock);
- }
+ write_unlock_irq(&ep->lock);
- for (;;) {
/*
* We don't want to sleep if the ep_poll_callback() sends us
* a wakeup in between. That's why we set the task state
@@ -1916,10 +1917,20 @@ fetch_events:
timed_out = 1;
break;
}
- }
+
+ /* We were woken up, thus go and try to harvest some events */
+ eavail = 1;
+
+ } while (0);
__set_current_state(TASK_RUNNING);
+ if (!list_empty_careful(&wait.entry)) {
+ write_lock_irq(&ep->lock);
+ __remove_wait_queue(&ep->wq, &wait);
+ write_unlock_irq(&ep->lock);
+ }
+
send_events:
/*
* Try to transfer events to user space. In case we get 0 events and
@@ -1930,12 +1941,6 @@ send_events:
!(res = ep_send_events(ep, events, maxevents)) && !timed_out)
goto fetch_events;
- if (waiter) {
- spin_lock_irq(&ep->wq.lock);
- __remove_wait_queue(&ep->wq, &wait);
- spin_unlock_irq(&ep->wq.lock);
- }
-
return res;
}
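Because init_wait() defaults to autoremove_wake_function(), a woken entry unlinks itself, so the next wakeup reaches the next exclusive waiter instead of being swallowed. A minimal stand-alone wait loop with the same autoremove property (generic sketch, not the epoll code):

#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/wait.h>

static int wait_for_flag(wait_queue_head_t *wq, int *flag)
{
	DEFINE_WAIT(wait);	/* uses autoremove_wake_function() */
	int ret = 0;

	for (;;) {
		prepare_to_wait_exclusive(wq, &wait, TASK_INTERRUPTIBLE);
		if (READ_ONCE(*flag))
			break;
		if (signal_pending(current)) {
			ret = -EINTR;
			break;
		}
		schedule();
		/* woken: autoremove already unlinked us from wq */
	}
	finish_wait(wq, &wait);	/* no-op if already unlinked */
	return ret;
}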
@@ -2313,19 +2318,17 @@ SYSCALL_DEFINE6(epoll_pwait, int, epfd, struct epoll_event __user *, events,
size_t, sigsetsize)
{
int error;
- sigset_t ksigmask, sigsaved;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
- error = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ error = set_user_sigmask(sigmask, sigsetsize);
if (error)
return error;
error = do_epoll_wait(epfd, events, maxevents, timeout);
-
- restore_user_sigmask(sigmask, &sigsaved, error == -EINTR);
+ restore_saved_sigmask_unless(error == -EINTR);
return error;
}
@@ -2338,19 +2341,17 @@ COMPAT_SYSCALL_DEFINE6(epoll_pwait, int, epfd,
compat_size_t, sigsetsize)
{
long err;
- sigset_t ksigmask, sigsaved;
/*
* If the caller wants a certain signal mask to be set during the wait,
* we apply it here.
*/
- err = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ err = set_compat_user_sigmask(sigmask, sigsetsize);
if (err)
return err;
err = do_epoll_wait(epfd, events, maxevents, timeout);
-
- restore_user_sigmask(sigmask, &sigsaved, err == -EINTR);
+ restore_saved_sigmask_unless(err == -EINTR);
return err;
}
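Both epoll_pwait flavours now follow the shared sigmask convention: set_user_sigmask() saves the old mask in the task itself, and restore_saved_sigmask_unless() restores it unless the wait was interrupted, in which case the restore is deferred to signal delivery so the handler runs under the temporary mask. Sketch of the calling convention (hypothetical syscall body):

#include <linux/sched/signal.h>
#include <linux/signal.h>

static long pwait_style_wait(const sigset_t __user *sigmask,
			     size_t sigsetsize)
{
	long err;

	err = set_user_sigmask(sigmask, sigsetsize);
	if (err)
		return err;

	err = -EINTR;	/* ... do the interruptible wait here ... */

	/* keep the temporary mask in place if we were interrupted */
	restore_saved_sigmask_unless(err == -EINTR);
	return err;
}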
diff --git a/fs/exec.c b/fs/exec.c
index ed8f4957a4a6..fa3faa50b592 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1274,6 +1274,8 @@ int flush_old_exec(struct linux_binprm * bprm)
*/
set_mm_exe_file(bprm->mm, bprm->file);
+ would_dump(bprm, bprm->file);
+
/*
* Release all of the old mmap stuff
*/
@@ -1383,7 +1385,7 @@ void setup_new_exec(struct linux_binprm * bprm)
/* An exec changes our domain. We are no longer part of the thread
group */
- current->self_exec_id++;
+ WRITE_ONCE(current->self_exec_id, current->self_exec_id + 1);
flush_signal_handlers(current, 0);
}
EXPORT_SYMBOL(setup_new_exec);
@@ -1817,8 +1819,6 @@ static int __do_execve_file(int fd, struct filename *filename,
if (retval < 0)
goto out;
- would_dump(bprm, bprm->file);
-
retval = exec_binprm(bprm);
if (retval < 0)
goto out;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index f0e549783caf..ba6de72a3e34 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -519,26 +519,33 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
* inode is actually connected to the parent.
*/
err = exportfs_get_name(mnt, target_dir, nbuf, result);
- if (!err) {
- inode_lock(target_dir->d_inode);
- nresult = lookup_one_len(nbuf, target_dir,
- strlen(nbuf));
- inode_unlock(target_dir->d_inode);
- if (!IS_ERR(nresult)) {
- if (nresult->d_inode) {
- dput(result);
- result = nresult;
- } else
- dput(nresult);
- }
+ if (err) {
+ dput(target_dir);
+ goto err_result;
}
+ inode_lock(target_dir->d_inode);
+ nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf));
+ if (!IS_ERR(nresult)) {
+ if (unlikely(nresult->d_inode != result->d_inode)) {
+ dput(nresult);
+ nresult = ERR_PTR(-ESTALE);
+ }
+ }
+ inode_unlock(target_dir->d_inode);
/*
* At this point we are done with the parent, but it's pinned
* by the child dentry anyway.
*/
dput(target_dir);
+ if (IS_ERR(nresult)) {
+ err = PTR_ERR(nresult);
+ goto err_result;
+ }
+ dput(result);
+ result = nresult;
+
/*
* And finally make sure the dentry is actually acceptable
* to NFSD.
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index a0c5ea91fcd4..8c29d4d6bfd5 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -80,6 +80,7 @@ static void ext2_release_inode(struct super_block *sb, int group, int dir)
if (dir)
le16_add_cpu(&desc->bg_used_dirs_count, -1);
spin_unlock(sb_bgl_lock(EXT2_SB(sb), group));
+ percpu_counter_inc(&EXT2_SB(sb)->s_freeinodes_counter);
if (dir)
percpu_counter_dec(&EXT2_SB(sb)->s_dirs_counter);
mark_buffer_dirty(bh);
@@ -529,7 +530,7 @@ got:
goto fail;
}
- percpu_counter_add(&sbi->s_freeinodes_counter, -1);
+ percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
percpu_counter_inc(&sbi->s_dirs_counter);
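The ext2 fix keeps s_freeinodes_counter balanced: releasing an inode must add back what allocation subtracted. For reference, the percpu_counter calls in play (generic sketch; the exact count only matters when read with percpu_counter_sum()):

#include <linux/bug.h>
#include <linux/percpu_counter.h>

static struct percpu_counter free_inodes;

static int counter_demo(void)
{
	int err = percpu_counter_init(&free_inodes, 1024, GFP_KERNEL);

	if (err)
		return err;
	percpu_counter_dec(&free_inodes);	/* allocate an inode */
	percpu_counter_inc(&free_inodes);	/* ... and release it */
	/* percpu_counter_sum() folds per-CPU deltas into an exact value */
	WARN_ON(percpu_counter_sum(&free_inodes) != 1024);
	percpu_counter_destroy(&free_inodes);
	return 0;
}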
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index e474127dd255..95fae33f60b6 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -701,10 +701,13 @@ static int ext2_get_blocks(struct inode *inode,
if (!partial) {
count++;
mutex_unlock(&ei->truncate_mutex);
- if (err)
- goto cleanup;
goto got_it;
}
+
+ if (err) {
+ mutex_unlock(&ei->truncate_mutex);
+ goto cleanup;
+ }
}
/*
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 1d7ab73b1014..ecf756630c22 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1082,9 +1082,9 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
if (EXT2_BLOCKS_PER_GROUP(sb) == 0)
goto cantfind_ext2;
- sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
- le32_to_cpu(es->s_first_data_block) - 1)
- / EXT2_BLOCKS_PER_GROUP(sb)) + 1;
+ sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
+ le32_to_cpu(es->s_first_data_block) - 1)
+ / EXT2_BLOCKS_PER_GROUP(sb)) + 1;
db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
EXT2_DESC_PER_BLOCK(sb);
sbi->s_group_desc = kmalloc_array (db_count,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 1e33e0ac8cf1..4dc725d8b0c7 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -56,6 +56,7 @@
#include <linux/buffer_head.h>
#include <linux/init.h>
+#include <linux/printk.h>
#include <linux/slab.h>
#include <linux/mbcache.h>
#include <linux/quotaops.h>
@@ -84,8 +85,8 @@
printk("\n"); \
} while (0)
#else
-# define ea_idebug(f...)
-# define ea_bdebug(f...)
+# define ea_idebug(inode, f...) no_printk(f)
+# define ea_bdebug(bh, f...) no_printk(f)
#endif
static int ext2_xattr_set2(struct inode *, struct buffer_head *,
@@ -840,8 +841,7 @@ ext2_xattr_cache_insert(struct mb_cache *cache, struct buffer_head *bh)
true);
if (error) {
if (error == -EBUSY) {
- ea_bdebug(bh, "already in cache (%d cache entries)",
- atomic_read(&ext2_xattr_cache->c_entry_count));
+ ea_bdebug(bh, "already in cache");
error = 0;
}
} else
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index e5d6ee61ff48..f9645de9d04c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -270,6 +270,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
ext4_group_t ngroups = ext4_get_groups_count(sb);
struct ext4_group_desc *desc;
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ struct buffer_head *bh_p;
if (block_group >= ngroups) {
ext4_error(sb, "block_group >= groups_count - block_group = %u,"
@@ -280,7 +281,14 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
- if (!sbi->s_group_desc[group_desc]) {
+ bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc);
+ /*
+ * sbi_array_rcu_deref returns with rcu unlocked; this is OK since
+ * the pointer being dereferenced won't be dereferenced again. As
+ * the usage in add_new_gdb() shows, the value isn't modified,
+ * just the pointer, so it remains valid.
+ */
+ if (!bh_p) {
ext4_error(sb, "Group descriptor not loaded - "
"block_group = %u, group_desc = %u, desc = %u",
block_group, group_desc, offset);
@@ -288,10 +296,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
}
desc = (struct ext4_group_desc *)(
- (__u8 *)sbi->s_group_desc[group_desc]->b_data +
+ (__u8 *)bh_p->b_data +
offset * EXT4_DESC_SIZE(sb));
if (bh)
- *bh = sbi->s_group_desc[group_desc];
+ *bh = bh_p;
return desc;
}
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index d4d4fdfac1a6..ff8e1205127e 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -203,6 +203,7 @@ static int ext4_protect_reserved_inode(struct super_block *sb,
return PTR_ERR(inode);
num = (inode->i_size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
while (i < num) {
+ cond_resched();
map.m_lblk = i;
map.m_len = num - i;
n = ext4_map_blocks(NULL, inode, &map, 0);
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 92042f073d58..dd8bc705f0e3 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -78,6 +78,11 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
error_msg = "rec_len is too small for name_len";
else if (unlikely(((char *) de - buf) + rlen > size))
error_msg = "directory entry overrun";
+ else if (unlikely(((char *) de - buf) + rlen >
+ size - EXT4_DIR_REC_LEN(1) &&
+ ((char *) de - buf) + rlen != size)) {
+ error_msg = "directory entry too close to block end";
+ }
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
@@ -122,12 +127,14 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
if (err != ERR_BAD_DX_DIR) {
return err;
}
- /*
- * We don't set the inode dirty flag since it's not
- * critical that it get flushed back to the disk.
- */
- ext4_clear_inode_flag(file_inode(file),
- EXT4_INODE_INDEX);
+ /* Can we just clear INDEX flag to ignore htree information? */
+ if (!ext4_has_metadata_csum(sb)) {
+ /*
+ * We don't set the inode dirty flag since it's not
+ * critical that it gets flushed back to the disk.
+ */
+ ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+ }
}
if (ext4_has_inline_data(inode)) {
@@ -666,24 +673,44 @@ static int ext4_d_compare(const struct dentry *dentry, unsigned int len,
const char *str, const struct qstr *name)
{
struct qstr qstr = {.name = str, .len = len };
+ const struct dentry *parent = READ_ONCE(dentry->d_parent);
+ const struct inode *inode = READ_ONCE(parent->d_inode);
+ char strbuf[DNAME_INLINE_LEN];
- if (!IS_CASEFOLDED(dentry->d_parent->d_inode)) {
+ if (!inode || !IS_CASEFOLDED(inode) ||
+ !EXT4_SB(inode->i_sb)->s_encoding) {
if (len != name->len)
return -1;
return memcmp(str, name->name, len);
}
- return ext4_ci_compare(dentry->d_parent->d_inode, name, &qstr);
+ /*
+ * If the dentry name is stored in-line, then it may be concurrently
+ * modified by a rename. If this happens, the VFS will eventually retry
+ * the lookup, so it doesn't matter what ->d_compare() returns.
+ * However, it's unsafe to call utf8_strncasecmp() with an unstable
+ * string. Therefore, we have to copy the name into a temporary buffer.
+ */
+ if (len <= DNAME_INLINE_LEN - 1) {
+ memcpy(strbuf, str, len);
+ strbuf[len] = 0;
+ qstr.name = strbuf;
+ /* prevent compiler from optimizing out the temporary buffer */
+ barrier();
+ }
+
+ return ext4_ci_compare(inode, name, &qstr, false);
}
static int ext4_d_hash(const struct dentry *dentry, struct qstr *str)
{
const struct ext4_sb_info *sbi = EXT4_SB(dentry->d_sb);
const struct unicode_map *um = sbi->s_encoding;
+ const struct inode *inode = READ_ONCE(dentry->d_inode);
unsigned char *norm;
int len, ret = 0;
- if (!IS_CASEFOLDED(dentry->d_inode))
+ if (!inode || !IS_CASEFOLDED(inode) || !um)
return 0;
norm = kmalloc(PATH_MAX, GFP_ATOMIC);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0014b1c5e6be..490e34ad927c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1379,7 +1379,7 @@ struct ext4_sb_info {
loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */
struct buffer_head * s_sbh; /* Buffer containing the super block */
struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */
- struct buffer_head **s_group_desc;
+ struct buffer_head * __rcu *s_group_desc;
unsigned int s_mount_opt;
unsigned int s_mount_opt2;
unsigned int s_mount_flags;
@@ -1441,7 +1441,7 @@ struct ext4_sb_info {
#endif
/* for buddy allocator */
- struct ext4_group_info ***s_group_info;
+ struct ext4_group_info ** __rcu *s_group_info;
struct inode *s_buddy_cache;
spinlock_t s_md_lock;
unsigned short *s_mb_offsets;
@@ -1491,7 +1491,7 @@ struct ext4_sb_info {
unsigned int s_extent_max_zeroout_kb;
unsigned int s_log_groups_per_flex;
- struct flex_groups *s_flex_groups;
+ struct flex_groups * __rcu *s_flex_groups;
ext4_group_t s_flex_groups_allocated;
/* workqueue for reserved extent conversions (buffered io) */
@@ -1531,8 +1531,11 @@ struct ext4_sb_info {
struct ratelimit_state s_warning_ratelimit_state;
struct ratelimit_state s_msg_ratelimit_state;
- /* Barrier between changing inodes' journal flags and writepages ops. */
- struct percpu_rw_semaphore s_journal_flag_rwsem;
+ /*
+ * Barrier between writepages ops and changing any inode's JOURNAL_DATA
+ * or EXTENTS flag.
+ */
+ struct percpu_rw_semaphore s_writepages_rwsem;
struct dax_device *s_daxdev;
};
@@ -2074,6 +2077,23 @@ struct dx_hash_info
/*
+ * Returns: sbi->field[index]
+ * Used to access an array element from the following sbi fields which require
+ * rcu protection to avoid dereferencing an invalid pointer due to reassignment
+ * - s_group_desc
+ * - s_group_info
+ * - s_flex_groups
+ */
+#define sbi_array_rcu_deref(sbi, field, index) \
+({ \
+ typeof(*((sbi)->field)) _v; \
+ rcu_read_lock(); \
+ _v = ((typeof(_v)*)rcu_dereference((sbi)->field))[index]; \
+ rcu_read_unlock(); \
+ _v; \
+})
+
+/*
* Control parameters used by ext4_htree_next_block
*/
#define HASH_NB_ALWAYS 1
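With the macro in place, a caller copies one element out under rcu_read_lock() and uses it afterwards; this is safe because only the enclosing array is ever reallocated and RCU-freed, never the elements themselves. A usage sketch against the fields named above:

/* fetch one per-flex-group stats block, RCU-safely */
static struct flex_groups *get_flex_group(struct ext4_sb_info *sbi,
					  ext4_group_t g)
{
	return sbi_array_rcu_deref(sbi, s_flex_groups, g);
}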
@@ -2085,6 +2105,9 @@ struct ext4_filename {
#ifdef CONFIG_FS_ENCRYPTION
struct fscrypt_str crypto_buf;
#endif
+#ifdef CONFIG_UNICODE
+ struct fscrypt_str cf_name;
+#endif
};
#define fname_name(p) ((p)->disk_name.name)
@@ -2310,6 +2333,12 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb,
struct ext4_group_desc *gdp);
ext4_fsblk_t ext4_inode_to_goal_block(struct inode *);
+#ifdef CONFIG_UNICODE
+extern void ext4_fname_setup_ci_filename(struct inode *dir,
+ const struct qstr *iname,
+ struct fscrypt_str *fname);
+#endif
+
#ifdef CONFIG_FS_ENCRYPTION
static inline void ext4_fname_from_fscrypt_name(struct ext4_filename *dst,
const struct fscrypt_name *src)
@@ -2336,6 +2365,10 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
return err;
ext4_fname_from_fscrypt_name(fname, &name);
+
+#ifdef CONFIG_UNICODE
+ ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
+#endif
return 0;
}
@@ -2351,6 +2384,10 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
return err;
ext4_fname_from_fscrypt_name(fname, &name);
+
+#ifdef CONFIG_UNICODE
+ ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name);
+#endif
return 0;
}
@@ -2364,6 +2401,11 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname)
fname->crypto_buf.name = NULL;
fname->usr_fname = NULL;
fname->disk_name.name = NULL;
+
+#ifdef CONFIG_UNICODE
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
+#endif
}
#else /* !CONFIG_FS_ENCRYPTION */
static inline int ext4_fname_setup_filename(struct inode *dir,
@@ -2374,6 +2416,11 @@ static inline int ext4_fname_setup_filename(struct inode *dir,
fname->usr_fname = iname;
fname->disk_name.name = (unsigned char *) iname->name;
fname->disk_name.len = iname->len;
+
+#ifdef CONFIG_UNICODE
+ ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name);
+#endif
+
return 0;
}
@@ -2384,7 +2431,13 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir,
return ext4_fname_setup_filename(dir, &dentry->d_name, 1, fname);
}
-static inline void ext4_fname_free_filename(struct ext4_filename *fname) { }
+static inline void ext4_fname_free_filename(struct ext4_filename *fname)
+{
+#ifdef CONFIG_UNICODE
+ kfree(fname->cf_name.name);
+ fname->cf_name.name = NULL;
+#endif
+}
#endif /* !CONFIG_FS_ENCRYPTION */
/* dir.c */
@@ -2412,8 +2465,11 @@ void ext4_insert_dentry(struct inode *inode,
struct ext4_filename *fname);
static inline void ext4_update_dx_flag(struct inode *inode)
{
- if (!ext4_has_feature_dir_index(inode->i_sb))
+ if (!ext4_has_feature_dir_index(inode->i_sb)) {
+ /* ext4_iget() should have caught this... */
+ WARN_ON_ONCE(ext4_has_feature_metadata_csum(inode->i_sb));
ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
+ }
}
static const unsigned char ext4_filetype_table[] = {
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
@@ -2599,6 +2655,7 @@ extern int ext4_generic_delete_entry(handle_t *handle,
extern bool ext4_empty_dir(struct inode *inode);
/* resize.c */
+extern void ext4_kvfree_array_rcu(void *to_free);
extern int ext4_group_add(struct super_block *sb,
struct ext4_new_group_data *input);
extern int ext4_group_extend(struct super_block *sb,
@@ -2846,13 +2903,13 @@ static inline
struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
ext4_group_t group)
{
- struct ext4_group_info ***grp_info;
+ struct ext4_group_info **grp_info;
long indexv, indexh;
BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
- grp_info = EXT4_SB(sb)->s_group_info;
indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
- return grp_info[indexv][indexh];
+ grp_info = sbi_array_rcu_deref(EXT4_SB(sb), s_group_info, indexv);
+ return grp_info[indexh];
}
/*
@@ -2902,7 +2959,7 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
!inode_is_locked(inode));
down_write(&EXT4_I(inode)->i_data_sem);
if (newsize > EXT4_I(inode)->i_disksize)
- EXT4_I(inode)->i_disksize = newsize;
+ WRITE_ONCE(EXT4_I(inode)->i_disksize, newsize);
up_write(&EXT4_I(inode)->i_data_sem);
}
@@ -3127,8 +3184,8 @@ extern int ext4_handle_dirty_dirent_node(handle_t *handle,
struct inode *inode,
struct buffer_head *bh);
extern int ext4_ci_compare(const struct inode *parent,
- const struct qstr *name,
- const struct qstr *entry);
+ const struct qstr *fname,
+ const struct qstr *entry, bool quick);
#define S_SHIFT 12
static const unsigned char ext4_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 98bd0e9ee7df..ca78fd709845 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -170,10 +170,13 @@ struct partial_cluster {
(EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
#define EXT_LAST_INDEX(__hdr__) \
(EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
-#define EXT_MAX_EXTENT(__hdr__) \
- (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
+#define EXT_MAX_EXTENT(__hdr__) \
+ ((le16_to_cpu((__hdr__)->eh_max)) ? \
+ ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \
+ : 0)
#define EXT_MAX_INDEX(__hdr__) \
- (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
+ ((le16_to_cpu((__hdr__)->eh_max)) ? \
+ ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) : 0)
static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
{
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7a72cde366f7..cfcdc7f0d19e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -498,6 +498,30 @@ int ext4_ext_check_inode(struct inode *inode)
return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode), 0);
}
+static void ext4_cache_extents(struct inode *inode,
+ struct ext4_extent_header *eh)
+{
+ struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
+ ext4_lblk_t prev = 0;
+ int i;
+
+ for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
+ unsigned int status = EXTENT_STATUS_WRITTEN;
+ ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
+ int len = ext4_ext_get_actual_len(ex);
+
+ if (prev && (prev != lblk))
+ ext4_es_cache_extent(inode, prev, lblk - prev, ~0,
+ EXTENT_STATUS_HOLE);
+
+ if (ext4_ext_is_unwritten(ex))
+ status = EXTENT_STATUS_UNWRITTEN;
+ ext4_es_cache_extent(inode, lblk, len,
+ ext4_ext_pblock(ex), status);
+ prev = lblk + len;
+ }
+}
+
static struct buffer_head *
__read_extent_tree_block(const char *function, unsigned int line,
struct inode *inode, ext4_fsblk_t pblk, int depth,
@@ -532,26 +556,7 @@ __read_extent_tree_block(const char *function, unsigned int line,
*/
if (!(flags & EXT4_EX_NOCACHE) && depth == 0) {
struct ext4_extent_header *eh = ext_block_hdr(bh);
- struct ext4_extent *ex = EXT_FIRST_EXTENT(eh);
- ext4_lblk_t prev = 0;
- int i;
-
- for (i = le16_to_cpu(eh->eh_entries); i > 0; i--, ex++) {
- unsigned int status = EXTENT_STATUS_WRITTEN;
- ext4_lblk_t lblk = le32_to_cpu(ex->ee_block);
- int len = ext4_ext_get_actual_len(ex);
-
- if (prev && (prev != lblk))
- ext4_es_cache_extent(inode, prev,
- lblk - prev, ~0,
- EXTENT_STATUS_HOLE);
-
- if (ext4_ext_is_unwritten(ex))
- status = EXTENT_STATUS_UNWRITTEN;
- ext4_es_cache_extent(inode, lblk, len,
- ext4_ext_pblock(ex), status);
- prev = lblk + len;
- }
+ ext4_cache_extents(inode, eh);
}
return bh;
errout:
@@ -899,6 +904,8 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block,
path[0].p_bh = NULL;
i = depth;
+ if (!(flags & EXT4_EX_NOCACHE) && depth == 0)
+ ext4_cache_extents(inode, eh);
/* walk through the tree */
while (i) {
ext_debug("depth %d: num %d, max %d\n",
@@ -2957,7 +2964,7 @@ again:
* in use to avoid freeing it when removing blocks.
*/
if (sbi->s_cluster_ratio > 1) {
- pblk = ext4_ext_pblock(ex) + end - ee_block + 2;
+ pblk = ext4_ext_pblock(ex) + end - ee_block + 1;
partial.pclu = EXT4_B2C(sbi, pblk);
partial.state = nofree;
}
@@ -3503,8 +3510,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
(unsigned long long)map->m_lblk, map_len);
sbi = EXT4_SB(inode->i_sb);
- eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
- inode->i_sb->s_blocksize_bits;
+ eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
+ >> inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map_len)
eof_block = map->m_lblk + map_len;
@@ -3759,8 +3766,8 @@ static int ext4_split_convert_extents(handle_t *handle,
__func__, inode->i_ino,
(unsigned long long)map->m_lblk, map->m_len);
- eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
- inode->i_sb->s_blocksize_bits;
+ eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1)
+ >> inode->i_sb->s_blocksize_bits;
if (eof_block < map->m_lblk + map->m_len)
eof_block = map->m_lblk + map->m_len;
/*
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f4a24a46245e..52d155b4e733 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -40,9 +40,10 @@ static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
- if (!inode_trylock_shared(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
return -EAGAIN;
+ } else {
inode_lock_shared(inode);
}
/*
@@ -190,9 +191,10 @@ ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(iocb->ki_filp);
ssize_t ret;
- if (!inode_trylock(inode)) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock(inode))
return -EAGAIN;
+ } else {
inode_lock(inode);
}
ret = ext4_write_checks(iocb, from);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 5508baa11bb6..8a28d47bd502 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -44,30 +44,28 @@
*/
static int ext4_sync_parent(struct inode *inode)
{
- struct dentry *dentry = NULL;
- struct inode *next;
+ struct dentry *dentry, *next;
int ret = 0;
if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY))
return 0;
- inode = igrab(inode);
+ dentry = d_find_any_alias(inode);
+ if (!dentry)
+ return 0;
while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
- dentry = d_find_any_alias(inode);
- if (!dentry)
- break;
- next = igrab(d_inode(dentry->d_parent));
+
+ next = dget_parent(dentry);
dput(dentry);
- if (!next)
- break;
- iput(inode);
- inode = next;
+ dentry = next;
+ inode = dentry->d_inode;
+
/*
* The directory inode may have gone through rmdir by now. But
* the inode itself and its blocks are still allocated (we hold
- * a reference to the inode so it didn't go through
- * ext4_evict_inode()) and so we are safe to flush metadata
- * blocks and the inode.
+ * a reference to the inode via its dentry), so it didn't go
+ * through ext4_evict_inode(), and so we are safe to flush
+ * metadata blocks and the inode.
*/
ret = sync_mapping_buffers(inode->i_mapping);
if (ret)
@@ -76,7 +74,7 @@ static int ext4_sync_parent(struct inode *inode)
if (ret)
break;
}
- iput(inode);
+ dput(dentry);
return ret;
}
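ext4_sync_parent() now walks upwards with dget_parent(), which takes a stable reference to ->d_parent, instead of juggling igrab()/iput() on inodes. The bare walking pattern, as a sketch:

#include <linux/dcache.h>

/* visit every ancestor, always holding a reference on the current one */
static void walk_parents(struct dentry *dentry)
{
	dentry = dget(dentry);
	while (!IS_ROOT(dentry)) {
		struct dentry *parent = dget_parent(dentry);

		dput(dentry);
		dentry = parent;
		/* ... inspect dentry->d_inode here ... */
	}
	dput(dentry);
}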
diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c
index d358bfcb6b3f..3e133793a5a3 100644
--- a/fs/ext4/hash.c
+++ b/fs/ext4/hash.c
@@ -280,7 +280,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len,
unsigned char *buff;
struct qstr qstr = {.name = name, .len = len };
- if (len && IS_CASEFOLDED(dir)) {
+ if (len && IS_CASEFOLDED(dir) && um) {
buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL);
if (!buff)
return -ENOMEM;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 764ff4c56233..c5590ffc44b5 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -330,11 +330,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
percpu_counter_inc(&sbi->s_freeinodes_counter);
if (sbi->s_log_groups_per_flex) {
- ext4_group_t f = ext4_flex_group(sbi, block_group);
+ struct flex_groups *fg;
- atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ fg = sbi_array_rcu_deref(sbi, s_flex_groups,
+ ext4_flex_group(sbi, block_group));
+ atomic_inc(&fg->free_inodes);
if (is_directory)
- atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+ atomic_dec(&fg->used_dirs);
}
BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
@@ -370,12 +372,13 @@ static void get_orlov_stats(struct super_block *sb, ext4_group_t g,
int flex_size, struct orlov_stats *stats)
{
struct ext4_group_desc *desc;
- struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
if (flex_size > 1) {
- stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
- stats->free_clusters = atomic64_read(&flex_group[g].free_clusters);
- stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
+ struct flex_groups *fg = sbi_array_rcu_deref(EXT4_SB(sb),
+ s_flex_groups, g);
+ stats->free_inodes = atomic_read(&fg->free_inodes);
+ stats->free_clusters = atomic64_read(&fg->free_clusters);
+ stats->used_dirs = atomic_read(&fg->used_dirs);
return;
}
@@ -662,7 +665,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
* block has been written back to disk. (Yes, these values are
* somewhat arbitrary...)
*/
-#define RECENTCY_MIN 5
+#define RECENTCY_MIN 60
#define RECENTCY_DIRTY 300
static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino)
@@ -1056,7 +1059,8 @@ got:
if (sbi->s_log_groups_per_flex) {
ext4_group_t f = ext4_flex_group(sbi, group);
- atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+ atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
+ f)->used_dirs);
}
}
if (ext4_has_group_desc_csum(sb)) {
@@ -1079,7 +1083,8 @@ got:
if (sbi->s_log_groups_per_flex) {
flex_group = ext4_flex_group(sbi, group);
- atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
+ atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_inodes);
}
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a084e84610f8..7335cff90a7d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -196,7 +196,12 @@ void ext4_evict_inode(struct inode *inode)
{
handle_t *handle;
int err;
- int extra_credits = 3;
+ /*
+ * Credits for final inode cleanup and freeing:
+ * sb + inode (ext4_orphan_del()), block bitmap, group descriptor
+ * (xattr block freeing), bitmap, group descriptor (inode freeing)
+ */
+ int extra_credits = 6;
struct ext4_xattr_inode_array *ea_inode_array = NULL;
trace_ext4_evict_inode(inode);
@@ -252,8 +257,12 @@ void ext4_evict_inode(struct inode *inode)
if (!IS_NOQUOTA(inode))
extra_credits += EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb);
+ /*
+ * Block bitmap, group descriptor, and inode are accounted in both
+ * ext4_blocks_for_truncate() and extra_credits. So subtract 3.
+ */
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
- ext4_blocks_for_truncate(inode)+extra_credits);
+ ext4_blocks_for_truncate(inode) + extra_credits - 3);
if (IS_ERR(handle)) {
ext4_std_error(inode->i_sb, PTR_ERR(handle));
/*
@@ -2145,7 +2154,7 @@ static int ext4_writepage(struct page *page,
bool keep_towrite = false;
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) {
- ext4_invalidatepage(page, 0, PAGE_SIZE);
+ inode->i_mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE);
unlock_page(page);
return -EIO;
}
@@ -2582,7 +2591,7 @@ update_disksize:
* truncate are avoided by checking i_size under i_data_sem.
*/
disksize = ((loff_t)mpd->first_page) << PAGE_SHIFT;
- if (disksize > EXT4_I(inode)->i_disksize) {
+ if (disksize > READ_ONCE(EXT4_I(inode)->i_disksize)) {
int err2;
loff_t i_size;
@@ -2743,7 +2752,7 @@ static int ext4_writepages(struct address_space *mapping,
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- percpu_down_read(&sbi->s_journal_flag_rwsem);
+ percpu_down_read(&sbi->s_writepages_rwsem);
trace_ext4_writepages(inode, wbc);
/*
@@ -2964,7 +2973,7 @@ unplug:
out_writepages:
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
- percpu_up_read(&sbi->s_journal_flag_rwsem);
+ percpu_up_read(&sbi->s_writepages_rwsem);
return ret;
}
@@ -2979,13 +2988,13 @@ static int ext4_dax_writepages(struct address_space *mapping,
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO;
- percpu_down_read(&sbi->s_journal_flag_rwsem);
+ percpu_down_read(&sbi->s_writepages_rwsem);
trace_ext4_writepages(inode, wbc);
ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
trace_ext4_writepages_result(inode, wbc, ret,
nr_to_write - wbc->nr_to_write);
- percpu_up_read(&sbi->s_journal_flag_rwsem);
+ percpu_up_read(&sbi->s_writepages_rwsem);
return ret;
}
@@ -3559,8 +3568,14 @@ retry:
return ret;
}
+ /*
+ * Writes that span EOF might trigger an I/O size update on completion,
+ * so consider them to be dirty for the purposes of O_DSYNC, even if
+ * there are no other metadata changes being made or pending here.
+ */
iomap->flags = 0;
- if (ext4_inode_datasync_dirty(inode))
+ if (ext4_inode_datasync_dirty(inode) ||
+ offset + length > i_size_read(inode))
iomap->flags |= IOMAP_F_DIRTY;
iomap->bdev = inode->i_sb->s_bdev;
iomap->dax_dev = sbi->s_daxdev;
@@ -3857,13 +3872,24 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter)
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
ssize_t ret;
+ loff_t offset = iocb->ki_pos;
+ loff_t size = i_size_read(inode);
+
+ if (offset >= size)
+ return 0;
/*
* Shared inode_lock is enough for us - it protects against concurrent
* writes & truncates and since we take care of writing back page cache,
* we are protected against page writeback as well.
*/
- inode_lock_shared(inode);
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!inode_trylock_shared(inode))
+ return -EAGAIN;
+ } else {
+ inode_lock_shared(inode);
+ }
+
ret = filemap_write_and_wait_range(mapping, iocb->ki_pos,
iocb->ki_pos + count - 1);
if (ret)
@@ -4692,7 +4718,7 @@ make_io:
if (end > table)
end = table;
while (b <= end)
- sb_breadahead(sb, b++);
+ sb_breadahead_unmovable(sb, b++);
}
/*
@@ -4979,6 +5005,18 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
+ /*
+ * If dir_index is not enabled but there's a dir with the INDEX flag
+ * set, we'd normally treat htree data as empty space. But with
+ * metadata checksumming that corrupts the checksums, so forbid it.
+ */
+ if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) {
+ ext4_error_inode(inode, function, line, 0,
+ "iget: Dir with htree data on filesystem without dir_index feature.");
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
ei->i_disksize = inode->i_size;
#ifdef CONFIG_QUOTA
ei->i_reserved_quota = 0;
@@ -5116,6 +5154,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
"iget: bogus i_mode (%o)", inode->i_mode);
goto bad_inode;
}
+ if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
+ ext4_error_inode(inode, function, line, 0,
+ "casefold flag without casefold feature");
brelse(iloc.bh);
unlock_new_inode(inode);
@@ -5132,7 +5173,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
struct ext4_inode_info *ei)
{
struct inode *inode = &(ei->vfs_inode);
- u64 i_blocks = inode->i_blocks;
+ u64 i_blocks = READ_ONCE(inode->i_blocks);
struct super_block *sb = inode->i_sb;
if (i_blocks <= ~0U) {
@@ -5475,11 +5516,15 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
offset = inode->i_size & (PAGE_SIZE - 1);
/*
- * All buffers in the last page remain valid? Then there's nothing to
- * do. We do the check mainly to optimize the common PAGE_SIZE ==
- * blocksize case
+ * If the page is fully truncated, we don't need to wait for any commit
+ * (and we even should not as __ext4_journalled_invalidatepage() may
+ * strip all buffers from the page but keep the page dirty, which can
+ * then confuse e.g. a concurrent ext4_writepage() seeing a dirty page
+ * without buffers). Also we don't need to wait for any commit if all
+ * buffers in the page remain valid. This is most beneficial for the
+ * common case of blocksize == PAGESIZE.
*/
- if (offset > PAGE_SIZE - i_blocksize(inode))
+ if (!offset || offset > (PAGE_SIZE - i_blocksize(inode)))
return;
while (1) {
page = find_lock_page(inode->i_mapping,
@@ -5929,8 +5974,23 @@ static int __ext4_expand_extra_isize(struct inode *inode,
{
struct ext4_inode *raw_inode;
struct ext4_xattr_ibody_header *header;
+ unsigned int inode_size = EXT4_INODE_SIZE(inode->i_sb);
+ struct ext4_inode_info *ei = EXT4_I(inode);
int error;
+ /* this was checked at iget time, but double check for good measure */
+ if ((EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > inode_size) ||
+ (ei->i_extra_isize & 3)) {
+ EXT4_ERROR_INODE(inode, "bad extra_isize %u (inode size %u)",
+ ei->i_extra_isize,
+ EXT4_INODE_SIZE(inode->i_sb));
+ return -EFSCORRUPTED;
+ }
+ if ((new_extra_isize < ei->i_extra_isize) ||
+ (new_extra_isize < 4) ||
+ (new_extra_isize > inode_size - EXT4_GOOD_OLD_INODE_SIZE))
+ return -EINVAL; /* Should never happen */
+
raw_inode = ext4_raw_inode(iloc);
header = IHDR(inode, raw_inode);
@@ -6024,7 +6084,7 @@ int ext4_expand_extra_isize(struct inode *inode,
error = ext4_journal_get_write_access(handle, iloc->bh);
if (error) {
brelse(iloc->bh);
- goto out_stop;
+ goto out_unlock;
}
error = __ext4_expand_extra_isize(inode, new_extra_isize, iloc,
@@ -6034,8 +6094,8 @@ int ext4_expand_extra_isize(struct inode *inode,
if (!error)
error = rc;
+out_unlock:
ext4_write_unlock_xattr(inode, &no_expand);
-out_stop:
ext4_journal_stop(handle);
return error;
}
@@ -6150,7 +6210,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
}
}
- percpu_down_write(&sbi->s_journal_flag_rwsem);
+ percpu_down_write(&sbi->s_writepages_rwsem);
jbd2_journal_lock_updates(journal);
/*
@@ -6167,7 +6227,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
err = jbd2_journal_flush(journal);
if (err < 0) {
jbd2_journal_unlock_updates(journal);
- percpu_up_write(&sbi->s_journal_flag_rwsem);
+ percpu_up_write(&sbi->s_writepages_rwsem);
return err;
}
ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
@@ -6175,7 +6235,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
ext4_set_aops(inode);
jbd2_journal_unlock_updates(journal);
- percpu_up_write(&sbi->s_journal_flag_rwsem);
+ percpu_up_write(&sbi->s_writepages_rwsem);
if (val)
up_write(&EXT4_I(inode)->i_mmap_sem);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 99ba720dbb7a..baf0cd18f17c 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1936,7 +1936,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
int free;
free = e4b->bd_info->bb_free;
- BUG_ON(free <= 0);
+ if (WARN_ON(free <= 0))
+ return;
i = e4b->bd_info->bb_first_free;
@@ -1959,7 +1960,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
}
mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
- BUG_ON(ex.fe_len <= 0);
+ if (WARN_ON(ex.fe_len <= 0))
+ break;
if (free < ex.fe_len) {
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
@@ -2356,7 +2358,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
unsigned size;
- struct ext4_group_info ***new_groupinfo;
+ struct ext4_group_info ***old_groupinfo, ***new_groupinfo;
size = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >>
EXT4_DESC_PER_BLOCK_BITS(sb);
@@ -2369,13 +2371,16 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups)
ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group");
return -ENOMEM;
}
- if (sbi->s_group_info) {
- memcpy(new_groupinfo, sbi->s_group_info,
+ rcu_read_lock();
+ old_groupinfo = rcu_dereference(sbi->s_group_info);
+ if (old_groupinfo)
+ memcpy(new_groupinfo, old_groupinfo,
sbi->s_group_info_size * sizeof(*sbi->s_group_info));
- kvfree(sbi->s_group_info);
- }
- sbi->s_group_info = new_groupinfo;
+ rcu_read_unlock();
+ rcu_assign_pointer(sbi->s_group_info, new_groupinfo);
sbi->s_group_info_size = size / sizeof(*sbi->s_group_info);
+ if (old_groupinfo)
+ ext4_kvfree_array_rcu(old_groupinfo);
ext4_debug("allocated s_groupinfo array for %d meta_bg's\n",
sbi->s_group_info_size);
return 0;
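These hunks make sbi->s_group_info an RCU-protected pointer: the grown array is published with rcu_assign_pointer() and the old one is reclaimed through ext4_kvfree_array_rcu() (added in the resize.c hunk further down), so lockless readers never see a torn or freed array. The sbi_array_rcu_deref() helper used by the later hunks lives in ext4.h, which this diff does not show; it is assumed to expand to roughly:

#define sbi_array_rcu_deref(sbi, field, index)				   \
({									   \
	typeof(*((sbi)->field)) _v;					   \
	rcu_read_lock();						   \
	_v = ((typeof((sbi)->field))rcu_dereference((sbi)->field))[index]; \
	rcu_read_unlock();						   \
	_v;								   \
})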
@@ -2387,6 +2392,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
{
int i;
int metalen = 0;
+ int idx = group >> EXT4_DESC_PER_BLOCK_BITS(sb);
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_info **meta_group_info;
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
@@ -2405,12 +2411,12 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
"for a buddy group");
goto exit_meta_group_info;
}
- sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] =
- meta_group_info;
+ rcu_read_lock();
+ rcu_dereference(sbi->s_group_info)[idx] = meta_group_info;
+ rcu_read_unlock();
}
- meta_group_info =
- sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)];
+ meta_group_info = sbi_array_rcu_deref(sbi, s_group_info, idx);
i = group & (EXT4_DESC_PER_BLOCK(sb) - 1);
meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS);
@@ -2458,8 +2464,13 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
exit_group_info:
/* If a meta_group_info table has been allocated, release it now */
if (group % EXT4_DESC_PER_BLOCK(sb) == 0) {
- kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]);
- sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = NULL;
+ struct ext4_group_info ***group_info;
+
+ rcu_read_lock();
+ group_info = rcu_dereference(sbi->s_group_info);
+ kfree(group_info[idx]);
+ group_info[idx] = NULL;
+ rcu_read_unlock();
}
exit_meta_group_info:
return -ENOMEM;
@@ -2472,6 +2483,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
struct ext4_sb_info *sbi = EXT4_SB(sb);
int err;
struct ext4_group_desc *desc;
+ struct ext4_group_info ***group_info;
struct kmem_cache *cachep;
err = ext4_mb_alloc_groupinfo(sb, ngroups);
@@ -2507,11 +2519,16 @@ err_freebuddy:
while (i-- > 0)
kmem_cache_free(cachep, ext4_get_group_info(sb, i));
i = sbi->s_group_info_size;
+ rcu_read_lock();
+ group_info = rcu_dereference(sbi->s_group_info);
while (i-- > 0)
- kfree(sbi->s_group_info[i]);
+ kfree(group_info[i]);
+ rcu_read_unlock();
iput(sbi->s_buddy_cache);
err_freesgi:
- kvfree(sbi->s_group_info);
+ rcu_read_lock();
+ kvfree(rcu_dereference(sbi->s_group_info));
+ rcu_read_unlock();
return -ENOMEM;
}
@@ -2700,7 +2717,7 @@ int ext4_mb_release(struct super_block *sb)
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
int num_meta_group_infos;
- struct ext4_group_info *grinfo;
+ struct ext4_group_info *grinfo, ***group_info;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits);
@@ -2719,9 +2736,12 @@ int ext4_mb_release(struct super_block *sb)
num_meta_group_infos = (ngroups +
EXT4_DESC_PER_BLOCK(sb) - 1) >>
EXT4_DESC_PER_BLOCK_BITS(sb);
+ rcu_read_lock();
+ group_info = rcu_dereference(sbi->s_group_info);
for (i = 0; i < num_meta_group_infos; i++)
- kfree(sbi->s_group_info[i]);
- kvfree(sbi->s_group_info);
+ kfree(group_info[i]);
+ kvfree(group_info);
+ rcu_read_unlock();
}
kfree(sbi->s_mb_offsets);
kfree(sbi->s_mb_maxs);
@@ -3020,7 +3040,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
ext4_group_t flex_group = ext4_flex_group(sbi,
ac->ac_b_ex.fe_group);
atomic64_sub(ac->ac_b_ex.fe_len,
- &sbi->s_flex_groups[flex_group].free_clusters);
+ &sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_clusters);
}
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -4913,7 +4934,8 @@ do_more:
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
atomic64_add(count_clusters,
- &sbi->s_flex_groups[flex_group].free_clusters);
+ &sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_clusters);
}
/*
@@ -5070,7 +5092,8 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb,
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
atomic64_add(clusters_freed,
- &sbi->s_flex_groups[flex_group].free_clusters);
+ &sbi_array_rcu_deref(sbi, s_flex_groups,
+ flex_group)->free_clusters);
}
ext4_mb_unload_buddy(&e4b);
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index b1e4d359f73b..be4ee3dcc5cf 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -427,6 +427,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode)
int ext4_ext_migrate(struct inode *inode)
{
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
handle_t *handle;
int retval = 0, i;
__le32 *i_data;
@@ -451,6 +452,8 @@ int ext4_ext_migrate(struct inode *inode)
*/
return retval;
+ percpu_down_write(&sbi->s_writepages_rwsem);
+
/*
* Worst case we can touch the allocation bitmaps, a bgd
* block, and a block to link in the orphan list. We do need
@@ -461,7 +464,7 @@ int ext4_ext_migrate(struct inode *inode)
if (IS_ERR(handle)) {
retval = PTR_ERR(handle);
- return retval;
+ goto out_unlock;
}
goal = (((inode->i_ino - 1) / EXT4_INODES_PER_GROUP(inode->i_sb)) *
EXT4_INODES_PER_GROUP(inode->i_sb)) + 1;
@@ -472,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
if (IS_ERR(tmp_inode)) {
retval = PTR_ERR(tmp_inode);
ext4_journal_stop(handle);
- return retval;
+ goto out_unlock;
}
i_size_write(tmp_inode, i_size_read(inode));
/*
@@ -514,7 +517,7 @@ int ext4_ext_migrate(struct inode *inode)
*/
ext4_orphan_del(NULL, tmp_inode);
retval = PTR_ERR(handle);
- goto out;
+ goto out_tmp_inode;
}
ei = EXT4_I(inode);
@@ -595,10 +598,11 @@ err_out:
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
ext4_journal_stop(handle);
-out:
+out_tmp_inode:
unlock_new_inode(tmp_inode);
iput(tmp_inode);
-
+out_unlock:
+ percpu_up_write(&sbi->s_writepages_rwsem);
return retval;
}
@@ -608,7 +612,8 @@ out:
int ext4_ind_migrate(struct inode *inode)
{
struct ext4_extent_header *eh;
- struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct ext4_super_block *es = sbi->s_es;
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_extent *ex;
unsigned int i, len;
@@ -632,9 +637,13 @@ int ext4_ind_migrate(struct inode *inode)
if (test_opt(inode->i_sb, DELALLOC))
ext4_alloc_da_blocks(inode);
+ percpu_down_write(&sbi->s_writepages_rwsem);
+
handle = ext4_journal_start(inode, EXT4_HT_MIGRATE, 1);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ goto out_unlock;
+ }
down_write(&EXT4_I(inode)->i_data_sem);
ret = ext4_ext_check_inode(inode);
@@ -669,5 +678,7 @@ int ext4_ind_migrate(struct inode *inode)
errout:
ext4_journal_stop(handle);
up_write(&EXT4_I(inode)->i_data_sem);
+out_unlock:
+ percpu_up_write(&sbi->s_writepages_rwsem);
return ret;
}
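Both migrate paths now hold sbi->s_writepages_rwsem (the rwsem renamed from s_journal_flag_rwsem in the inode.c and super.c hunks) across the whole conversion, so writeback can never observe an inode whose block-mapping format is mid-change. The pairing is the plain percpu-rwsem pattern, sketched here under the assumption that ext4_writepages() takes the read side:

	/* writer (migration): excludes all concurrent writepages */
	percpu_down_write(&sbi->s_writepages_rwsem);
	/* ... convert extents <-> indirect block mapping ... */
	percpu_up_write(&sbi->s_writepages_rwsem);

	/* assumed reader (ext4_writepages): cheap, scales per-CPU */
	percpu_down_read(&sbi->s_writepages_rwsem);
	/* ... write back pages using the current mapping format ... */
	percpu_up_read(&sbi->s_writepages_rwsem);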
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 2305b4374fd3..9d00e0dd2ba9 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -120,10 +120,10 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp,
{
__ext4_warning(sb, function, line, "%s", msg);
__ext4_warning(sb, function, line,
- "MMP failure info: last update time: %llu, last update "
- "node: %s, last update device: %s",
- (long long unsigned int) le64_to_cpu(mmp->mmp_time),
- mmp->mmp_nodename, mmp->mmp_bdevname);
+ "MMP failure info: last update time: %llu, last update node: %.*s, last update device: %.*s",
+ (unsigned long long)le64_to_cpu(mmp->mmp_time),
+ (int)sizeof(mmp->mmp_nodename), mmp->mmp_nodename,
+ (int)sizeof(mmp->mmp_bdevname), mmp->mmp_bdevname);
}
/*
@@ -154,6 +154,7 @@ static int kmmpd(void *data)
mmp_check_interval = max(EXT4_MMP_CHECK_MULT * mmp_update_interval,
EXT4_MMP_MIN_CHECK_INTERVAL);
mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval);
+ BUILD_BUG_ON(sizeof(mmp->mmp_bdevname) < BDEVNAME_SIZE);
bdevname(bh->b_bdev, mmp->mmp_bdevname);
memcpy(mmp->mmp_nodename, init_utsname()->nodename,
@@ -375,7 +376,8 @@ skip:
/*
* Start a kernel thread to update the MMP block periodically.
*/
- EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%s",
+ EXT4_SB(sb)->s_mmp_tsk = kthread_run(kmmpd, mmpd_data, "kmmpd-%.*s",
+ (int)sizeof(mmp->mmp_bdevname),
bdevname(bh->b_bdev,
mmp->mmp_bdevname));
if (IS_ERR(EXT4_SB(sb)->s_mmp_tsk)) {
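Both mmp.c changes lean on printf's "%.*s" form, where the int precision argument caps how many bytes are read; that matters because mmp_nodename and mmp_bdevname are fixed-size on-disk fields that need not be NUL-terminated. A standalone illustration with a hypothetical buffer:

	char bdev[8] = { 'v', 'd', 'a', '1', '!', '!', '!', '!' };	/* no NUL */

	/* reads at most sizeof(bdev) bytes, stops earlier at a NUL if present */
	printk(KERN_INFO "device: %.*s\n", (int)sizeof(bdev), bdev);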
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 771fe02f317d..8e1d5912556a 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1272,19 +1272,24 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block)
#ifdef CONFIG_UNICODE
/*
* Test whether a case-insensitive directory entry matches the filename
- * being searched for.
+ * being searched for. If quick is set, assume the name being looked up
+ * is already in the casefolded form.
*
* Returns: 0 if the directory entry matches, more than 0 if it
* doesn't match or less than zero on error.
*/
int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
- const struct qstr *entry)
+ const struct qstr *entry, bool quick)
{
const struct ext4_sb_info *sbi = EXT4_SB(parent->i_sb);
const struct unicode_map *um = sbi->s_encoding;
int ret;
- ret = utf8_strncasecmp(um, name, entry);
+ if (quick)
+ ret = utf8_strncasecmp_folded(um, name, entry);
+ else
+ ret = utf8_strncasecmp(um, name, entry);
+
if (ret < 0) {
/* Handle invalid character sequence as either an error
* or as an opaque byte sequence.
@@ -1300,6 +1305,32 @@ int ext4_ci_compare(const struct inode *parent, const struct qstr *name,
return ret;
}
+
+void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname,
+ struct fscrypt_str *cf_name)
+{
+ int len;
+
+ if (!IS_CASEFOLDED(dir) || !EXT4_SB(dir->i_sb)->s_encoding) {
+ cf_name->name = NULL;
+ return;
+ }
+
+ cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS);
+ if (!cf_name->name)
+ return;
+
+ len = utf8_casefold(EXT4_SB(dir->i_sb)->s_encoding,
+ iname, cf_name->name,
+ EXT4_NAME_LEN);
+ if (len <= 0) {
+ kfree(cf_name->name);
+ cf_name->name = NULL;
+ return;
+ }
+ cf_name->len = (unsigned) len;
+
+}
#endif
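ext4_fname_setup_ci_filename() folds the user-supplied name once, up front, so the per-entry ext4_match() below can take the quick (already-folded) comparison instead of refolding for every directory entry scanned. A hedged sketch of the intended call site, assumed to sit in the filename-setup path:

	/* hypothetical caller: fold once per lookup, not once per entry */
	ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name);

	/* cf_name.name == NULL simply means: fall back to the slow path,
	 * folding each on-disk entry inside ext4_ci_compare(..., false) */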
/*
@@ -1326,8 +1357,15 @@ static inline bool ext4_match(const struct inode *parent,
#endif
#ifdef CONFIG_UNICODE
- if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent))
- return (ext4_ci_compare(parent, fname->usr_fname, &entry) == 0);
+ if (EXT4_SB(parent->i_sb)->s_encoding && IS_CASEFOLDED(parent)) {
+ if (fname->cf_name.name) {
+ struct qstr cf = {.name = fname->cf_name.name,
+ .len = fname->cf_name.len};
+ return !ext4_ci_compare(parent, &cf, &entry, true);
+ }
+ return !ext4_ci_compare(parent, fname->usr_fname, &entry,
+ false);
+ }
#endif
return fscrypt_match_name(&f, de->name, de->name_len);
@@ -1353,8 +1391,8 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size,
ext4_match(dir, fname, de)) {
/* found a match - just to be sure, do
* a full check */
- if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data,
- bh->b_size, offset))
+ if (ext4_check_dir_entry(dir, NULL, de, bh, search_buf,
+ buf_size, offset))
return -1;
*res_dir = de;
return 1;
@@ -1468,6 +1506,7 @@ restart:
/*
* We deal with the read-ahead logic here.
*/
+ cond_resched();
if (ra_ptr >= ra_max) {
/* Refill the readahead buffer */
ra_ptr = 0;
@@ -1814,7 +1853,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
blocksize, hinfo, map);
map -= count;
dx_sort_map(map, count);
- /* Split the existing block in the middle, size-wise */
+ /* Ensure that neither split block is over half full */
size = 0;
move = 0;
for (i = count-1; i >= 0; i--) {
@@ -1824,8 +1863,18 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
size += map[i].size;
move++;
}
- /* map index at which we will split */
- split = count - move;
+ /*
+ * map index at which we will split
+ *
+ * If the sum of active entries didn't exceed half the block size, just
+ * split it in half by count; each resulting block will have at least
+ * half the space free.
+ */
+ if (i > 0)
+ split = count - move;
+ else
+ split = count/2;
+
hash2 = map[split].hash;
continued = hash2 == map[split - 1].hash;
dxtrace(printk(KERN_INFO "Split block %lu at %x, %i/%i\n",
@@ -2154,7 +2203,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
#ifdef CONFIG_UNICODE
if (ext4_has_strict_mode(sbi) && IS_CASEFOLDED(dir) &&
- utf8_validate(sbi->s_encoding, &dentry->d_name))
+ sbi->s_encoding && utf8_validate(sbi->s_encoding, &dentry->d_name))
return -EINVAL;
#endif
@@ -2176,6 +2225,13 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
retval = ext4_dx_add_entry(handle, &fname, dir, inode);
if (!retval || (retval != ERR_BAD_DX_DIR))
goto out;
+ /* Can we just ignore htree data? */
+ if (ext4_has_metadata_csum(sb)) {
+ EXT4_ERROR_INODE(dir,
+ "Directory has corrupted htree index.");
+ retval = -EFSCORRUPTED;
+ goto out;
+ }
ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
dx_fallback++;
ext4_mark_inode_dirty(handle, dir);
@@ -2427,7 +2483,7 @@ int ext4_generic_delete_entry(handle_t *handle,
de = (struct ext4_dir_entry_2 *)entry_buf;
while (i < buf_size - csum_size) {
if (ext4_check_dir_entry(dir, NULL, de, bh,
- bh->b_data, bh->b_size, i))
+ entry_buf, buf_size, i))
return -EFSCORRUPTED;
if (de == de_del) {
if (pde)
@@ -2784,7 +2840,7 @@ bool ext4_empty_dir(struct inode *inode)
{
unsigned int offset;
struct buffer_head *bh;
- struct ext4_dir_entry_2 *de, *de1;
+ struct ext4_dir_entry_2 *de;
struct super_block *sb;
if (ext4_has_inline_data(inode)) {
@@ -2809,19 +2865,25 @@ bool ext4_empty_dir(struct inode *inode)
return true;
de = (struct ext4_dir_entry_2 *) bh->b_data;
- de1 = ext4_next_entry(de, sb->s_blocksize);
- if (le32_to_cpu(de->inode) != inode->i_ino ||
- le32_to_cpu(de1->inode) == 0 ||
- strcmp(".", de->name) || strcmp("..", de1->name)) {
- ext4_warning_inode(inode, "directory missing '.' and/or '..'");
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
+ 0) ||
+ le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
+ ext4_warning_inode(inode, "directory missing '.'");
+ brelse(bh);
+ return true;
+ }
+ offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+ de = ext4_next_entry(de, sb->s_blocksize);
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
+ offset) ||
+ le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ ext4_warning_inode(inode, "directory missing '..'");
brelse(bh);
return true;
}
- offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
- ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
- de = ext4_next_entry(de1, sb->s_blocksize);
+ offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
while (offset < inode->i_size) {
- if ((void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+ if (!(offset & (sb->s_blocksize - 1))) {
unsigned int lblock;
brelse(bh);
lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
@@ -2832,12 +2894,11 @@ bool ext4_empty_dir(struct inode *inode)
}
if (IS_ERR(bh))
return true;
- de = (struct ext4_dir_entry_2 *) bh->b_data;
}
+ de = (struct ext4_dir_entry_2 *) (bh->b_data +
+ (offset & (sb->s_blocksize - 1)));
if (ext4_check_dir_entry(inode, NULL, de, bh,
bh->b_data, bh->b_size, offset)) {
- de = (struct ext4_dir_entry_2 *)(bh->b_data +
- sb->s_blocksize);
offset = (offset | (sb->s_blocksize - 1)) + 1;
continue;
}
@@ -2846,7 +2907,6 @@ bool ext4_empty_dir(struct inode *inode)
return false;
}
offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
- de = ext4_next_entry(de, sb->s_blocksize);
}
brelse(bh);
return true;
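The rewritten walk is driven purely by the byte offset into the directory and revalidates every entry with ext4_check_dir_entry(), so a corrupted rec_len can no longer march 'de' past the end of the buffer. Recovering the position from the offset is just:

	lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);		/* block index   */
	de = (struct ext4_dir_entry_2 *)(bh->b_data +
			(offset & (sb->s_blocksize - 1)));	/* offset within */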
@@ -3158,18 +3218,17 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
- if (inode->i_nlink == 0) {
- ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
- dentry->d_name.len, dentry->d_name.name);
- set_nlink(inode, 1);
- }
retval = ext4_delete_entry(handle, dir, de, bh);
if (retval)
goto end_unlink;
dir->i_ctime = dir->i_mtime = current_time(dir);
ext4_update_dx_flag(dir);
ext4_mark_inode_dirty(handle, dir);
- drop_nlink(inode);
+ if (inode->i_nlink == 0)
+ ext4_warning_inode(inode, "Deleting file '%.*s' with no links",
+ dentry->d_name.len, dentry->d_name.name);
+ else
+ drop_nlink(inode);
if (!inode->i_nlink)
ext4_orphan_add(handle, inode);
inode->i_ctime = current_time(inode);
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4690618a92e9..6ef6f3134cc3 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -478,17 +478,26 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
if (IS_ENCRYPTED(inode) && S_ISREG(inode->i_mode) && nr_to_submit) {
gfp_t gfp_flags = GFP_NOFS;
+ /*
+ * Since bounce page allocation uses a mempool, we can only use
+ * a waiting mask (i.e. request guaranteed allocation) on the
+ * first page of the bio. Otherwise it can deadlock.
+ */
+ if (io->io_bio)
+ gfp_flags = GFP_NOWAIT | __GFP_NOWARN;
retry_encrypt:
data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
page->index, gfp_flags);
if (IS_ERR(data_page)) {
ret = PTR_ERR(data_page);
- if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) {
- if (io->io_bio) {
+ if (ret == -ENOMEM &&
+ (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
+ gfp_flags = GFP_NOFS;
+ if (io->io_bio)
ext4_io_submit(io);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
- }
- gfp_flags |= __GFP_NOFAIL;
+ else
+ gfp_flags |= __GFP_NOFAIL;
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
goto retry_encrypt;
}
data_page = NULL;
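The policy this encodes: bounce pages come from a mempool, so a blocking allocation is only safe when no bio is being built (pages already queued in the bio may be pinning the pool, and waiting on it would deadlock). A hypothetical helper summarizing the choices made above:

static gfp_t bounce_gfp(struct ext4_io_submit *io, bool retrying)
{
	if (io->io_bio && !retrying)
		return GFP_NOWAIT | __GFP_NOWARN; /* never block with an open bio */
	if (io->io_bio)
		return GFP_NOFS;	/* retry only after the bio is submitted */
	return retrying ? GFP_NOFS | __GFP_NOFAIL : GFP_NOFS;
}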
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c0e9aef376a7..080e25f6ef56 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -17,6 +17,33 @@
#include "ext4_jbd2.h"
+struct ext4_rcu_ptr {
+ struct rcu_head rcu;
+ void *ptr;
+};
+
+static void ext4_rcu_ptr_callback(struct rcu_head *head)
+{
+ struct ext4_rcu_ptr *ptr;
+
+ ptr = container_of(head, struct ext4_rcu_ptr, rcu);
+ kvfree(ptr->ptr);
+ kfree(ptr);
+}
+
+void ext4_kvfree_array_rcu(void *to_free)
+{
+ struct ext4_rcu_ptr *ptr = kzalloc(sizeof(*ptr), GFP_KERNEL);
+
+ if (ptr) {
+ ptr->ptr = to_free;
+ call_rcu(&ptr->rcu, ext4_rcu_ptr_callback);
+ return;
+ }
+ synchronize_rcu();
+ kvfree(to_free);
+}
+
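ext4_kvfree_array_rcu() defers the kvfree() until after an RCU grace period, falling back to a synchronous synchronize_rcu() when even the small tracking struct cannot be allocated. Either way, a reader following the usual discipline (as the super.c hunks below do) never touches freed memory:

	rcu_read_lock();
	group_desc = rcu_dereference(sbi->s_group_desc);
	gdb_bh = group_desc[gdb_num];	/* safe until rcu_read_unlock() */
	rcu_read_unlock();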
int ext4_resize_begin(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -560,8 +587,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
brelse(gdb);
goto out;
}
- memcpy(gdb->b_data, sbi->s_group_desc[j]->b_data,
- gdb->b_size);
+ memcpy(gdb->b_data, sbi_array_rcu_deref(sbi,
+ s_group_desc, j)->b_data, gdb->b_size);
set_buffer_uptodate(gdb);
err = ext4_handle_dirty_metadata(handle, NULL, gdb);
@@ -879,13 +906,15 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
}
brelse(dind);
- o_group_desc = EXT4_SB(sb)->s_group_desc;
+ rcu_read_lock();
+ o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc);
memcpy(n_group_desc, o_group_desc,
EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+ rcu_read_unlock();
n_group_desc[gdb_num] = gdb_bh;
- EXT4_SB(sb)->s_group_desc = n_group_desc;
+ rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
EXT4_SB(sb)->s_gdb_count++;
- kvfree(o_group_desc);
+ ext4_kvfree_array_rcu(o_group_desc);
le16_add_cpu(&es->s_reserved_gdt_blocks, -1);
err = ext4_handle_dirty_super(handle, sb);
@@ -929,9 +958,11 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
return err;
}
- o_group_desc = EXT4_SB(sb)->s_group_desc;
+ rcu_read_lock();
+ o_group_desc = rcu_dereference(EXT4_SB(sb)->s_group_desc);
memcpy(n_group_desc, o_group_desc,
EXT4_SB(sb)->s_gdb_count * sizeof(struct buffer_head *));
+ rcu_read_unlock();
n_group_desc[gdb_num] = gdb_bh;
BUFFER_TRACE(gdb_bh, "get_write_access");
@@ -942,9 +973,9 @@ static int add_new_gdb_meta_bg(struct super_block *sb,
return err;
}
- EXT4_SB(sb)->s_group_desc = n_group_desc;
+ rcu_assign_pointer(EXT4_SB(sb)->s_group_desc, n_group_desc);
EXT4_SB(sb)->s_gdb_count++;
- kvfree(o_group_desc);
+ ext4_kvfree_array_rcu(o_group_desc);
return err;
}
@@ -1210,7 +1241,8 @@ static int ext4_add_new_descs(handle_t *handle, struct super_block *sb,
* use non-sparse filesystems anymore. This is already checked above.
*/
if (gdb_off) {
- gdb_bh = sbi->s_group_desc[gdb_num];
+ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
+ gdb_num);
BUFFER_TRACE(gdb_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, gdb_bh);
@@ -1292,7 +1324,7 @@ static int ext4_setup_new_descs(handle_t *handle, struct super_block *sb,
/*
* get_write_access() has been called on gdb_bh by ext4_add_new_desc().
*/
- gdb_bh = sbi->s_group_desc[gdb_num];
+ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc, gdb_num);
/* Update group descriptor block for new group */
gdp = (struct ext4_group_desc *)(gdb_bh->b_data +
gdb_off * EXT4_DESC_SIZE(sb));
@@ -1420,11 +1452,14 @@ static void ext4_update_super(struct super_block *sb,
percpu_counter_read(&sbi->s_freeclusters_counter));
if (ext4_has_feature_flex_bg(sb) && sbi->s_log_groups_per_flex) {
ext4_group_t flex_group;
+ struct flex_groups *fg;
+
flex_group = ext4_flex_group(sbi, group_data[0].group);
+ fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
atomic64_add(EXT4_NUM_B2C(sbi, free_blocks),
- &sbi->s_flex_groups[flex_group].free_clusters);
+ &fg->free_clusters);
atomic_add(EXT4_INODES_PER_GROUP(sb) * flex_gd->count,
- &sbi->s_flex_groups[flex_group].free_inodes);
+ &fg->free_inodes);
}
/*
@@ -1519,7 +1554,8 @@ exit_journal:
for (; gdb_num <= gdb_num_end; gdb_num++) {
struct buffer_head *gdb_bh;
- gdb_bh = sbi->s_group_desc[gdb_num];
+ gdb_bh = sbi_array_rcu_deref(sbi, s_group_desc,
+ gdb_num);
if (old_gdb == gdb_bh->b_blocknr)
continue;
update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4079605d437a..19f2e7c77be8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -365,12 +365,12 @@ static void __save_error_info(struct super_block *sb, const char *func,
return;
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
ext4_update_tstamp(es, s_last_error_time);
- strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
+ strscpy_pad(es->s_last_error_func, func, sizeof(es->s_last_error_func));
es->s_last_error_line = cpu_to_le32(line);
if (!es->s_first_error_time) {
es->s_first_error_time = es->s_last_error_time;
es->s_first_error_time_hi = es->s_last_error_time_hi;
- strncpy(es->s_first_error_func, func,
+ strscpy_pad(es->s_first_error_func, func,
sizeof(es->s_first_error_func));
es->s_first_error_line = cpu_to_le32(line);
es->s_first_error_ino = es->s_last_error_ino;
@@ -389,7 +389,8 @@ static void save_error_info(struct super_block *sb, const char *func,
unsigned int line)
{
__save_error_info(sb, func, line);
- ext4_commit_super(sb, 1);
+ if (!bdev_read_only(sb->s_bdev))
+ ext4_commit_super(sb, 1);
}
/*
@@ -970,6 +971,8 @@ static void ext4_put_super(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
+ struct buffer_head **group_desc;
+ struct flex_groups **flex_groups;
int aborted = 0;
int i, err;
@@ -1000,15 +1003,23 @@ static void ext4_put_super(struct super_block *sb)
if (!sb_rdonly(sb))
ext4_commit_super(sb, 1);
+ rcu_read_lock();
+ group_desc = rcu_dereference(sbi->s_group_desc);
for (i = 0; i < sbi->s_gdb_count; i++)
- brelse(sbi->s_group_desc[i]);
- kvfree(sbi->s_group_desc);
- kvfree(sbi->s_flex_groups);
+ brelse(group_desc[i]);
+ kvfree(group_desc);
+ flex_groups = rcu_dereference(sbi->s_flex_groups);
+ if (flex_groups) {
+ for (i = 0; i < sbi->s_flex_groups_allocated; i++)
+ kvfree(flex_groups[i]);
+ kvfree(flex_groups);
+ }
+ rcu_read_unlock();
percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
- percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
+ percpu_free_rwsem(&sbi->s_writepages_rwsem);
#ifdef CONFIG_QUOTA
for (i = 0; i < EXT4_MAXQUOTAS; i++)
kfree(get_qf_name(sb, sbi, i));
@@ -1876,6 +1887,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
sbi->s_commit_interval = HZ * arg;
} else if (token == Opt_debug_want_extra_isize) {
+ if ((arg & 1) ||
+ (arg < 4) ||
+ (arg > (sbi->s_inode_size - EXT4_GOOD_OLD_INODE_SIZE))) {
+ ext4_msg(sb, KERN_ERR,
+ "Invalid want_extra_isize %d", arg);
+ return -1;
+ }
sbi->s_want_extra_isize = arg;
} else if (token == Opt_max_batch_time) {
sbi->s_max_batch_time = arg;
@@ -2005,6 +2023,16 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
#endif
} else if (token == Opt_dax) {
#ifdef CONFIG_FS_DAX
+ if (is_remount && test_opt(sb, DAX)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "both data=journal and dax");
+ return -1;
+ }
+ if (is_remount && !(sbi->s_mount_opt & EXT4_MOUNT_DAX)) {
+ ext4_msg(sb, KERN_ERR, "can't change "
+ "dax mount option while remounting");
+ return -1;
+ }
ext4_msg(sb, KERN_WARNING,
"DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
sbi->s_mount_opt |= m->mount_opt;
@@ -2265,6 +2293,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
ext4_msg(sb, KERN_ERR, "revision level too high, "
"forcing read-only mode");
err = -EROFS;
+ goto done;
}
if (read_only)
goto done;
@@ -2314,8 +2343,8 @@ done:
int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct flex_groups *new_groups;
- int size;
+ struct flex_groups **old_groups, **new_groups;
+ int size, i, j;
if (!sbi->s_log_groups_per_flex)
return 0;
@@ -2324,22 +2353,37 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup)
if (size <= sbi->s_flex_groups_allocated)
return 0;
- size = roundup_pow_of_two(size * sizeof(struct flex_groups));
- new_groups = kvzalloc(size, GFP_KERNEL);
+ new_groups = kvzalloc(roundup_pow_of_two(size *
+ sizeof(*sbi->s_flex_groups)), GFP_KERNEL);
if (!new_groups) {
- ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups",
- size / (int) sizeof(struct flex_groups));
+ ext4_msg(sb, KERN_ERR,
+ "not enough memory for %d flex group pointers", size);
return -ENOMEM;
}
-
- if (sbi->s_flex_groups) {
- memcpy(new_groups, sbi->s_flex_groups,
- (sbi->s_flex_groups_allocated *
- sizeof(struct flex_groups)));
- kvfree(sbi->s_flex_groups);
+ for (i = sbi->s_flex_groups_allocated; i < size; i++) {
+ new_groups[i] = kvzalloc(roundup_pow_of_two(
+ sizeof(struct flex_groups)),
+ GFP_KERNEL);
+ if (!new_groups[i]) {
+ for (j = sbi->s_flex_groups_allocated; j < i; j++)
+ kvfree(new_groups[j]);
+ kvfree(new_groups);
+ ext4_msg(sb, KERN_ERR,
+ "not enough memory for %d flex groups", size);
+ return -ENOMEM;
+ }
}
- sbi->s_flex_groups = new_groups;
- sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups);
+ rcu_read_lock();
+ old_groups = rcu_dereference(sbi->s_flex_groups);
+ if (old_groups)
+ memcpy(new_groups, old_groups,
+ (sbi->s_flex_groups_allocated *
+ sizeof(struct flex_groups *)));
+ rcu_read_unlock();
+ rcu_assign_pointer(sbi->s_flex_groups, new_groups);
+ sbi->s_flex_groups_allocated = size;
+ if (old_groups)
+ ext4_kvfree_array_rcu(old_groups);
return 0;
}
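s_flex_groups also becomes two-level: an RCU-published array of pointers to individually allocated struct flex_groups. Growing the filesystem then reallocates only the pointer array, so the element a concurrent reader already dereferenced keeps a stable address and the counter updates need no extra locking. Reader side, mirroring the mballoc.c hunks (delta is illustrative):

	struct flex_groups *fg;

	fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
	atomic64_add(delta, &fg->free_clusters);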
@@ -2347,6 +2391,7 @@ static int ext4_fill_flex_info(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_group_desc *gdp = NULL;
+ struct flex_groups *fg;
ext4_group_t flex_group;
int i, err;
@@ -2364,12 +2409,11 @@ static int ext4_fill_flex_info(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, NULL);
flex_group = ext4_flex_group(sbi, i);
- atomic_add(ext4_free_inodes_count(sb, gdp),
- &sbi->s_flex_groups[flex_group].free_inodes);
+ fg = sbi_array_rcu_deref(sbi, s_flex_groups, flex_group);
+ atomic_add(ext4_free_inodes_count(sb, gdp), &fg->free_inodes);
atomic64_add(ext4_free_group_clusters(sb, gdp),
- &sbi->s_flex_groups[flex_group].free_clusters);
- atomic_add(ext4_used_dirs_count(sb, gdp),
- &sbi->s_flex_groups[flex_group].used_dirs);
+ &fg->free_clusters);
+ atomic_add(ext4_used_dirs_count(sb, gdp), &fg->used_dirs);
}
return 1;
@@ -2943,17 +2987,11 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
return 0;
}
-#ifndef CONFIG_QUOTA
- if (ext4_has_feature_quota(sb) && !readonly) {
- ext4_msg(sb, KERN_ERR,
- "Filesystem with quota feature cannot be mounted RDWR "
- "without CONFIG_QUOTA");
- return 0;
- }
- if (ext4_has_feature_project(sb) && !readonly) {
+#if !IS_ENABLED(CONFIG_QUOTA) || !IS_ENABLED(CONFIG_QFMT_V2)
+ if (!readonly && (ext4_has_feature_quota(sb) ||
+ ext4_has_feature_project(sb))) {
ext4_msg(sb, KERN_ERR,
- "Filesystem with project quota feature cannot be mounted RDWR "
- "without CONFIG_QUOTA");
+ "The kernel was not built with CONFIG_QUOTA and CONFIG_QFMT_V2");
return 0;
}
#endif /* CONFIG_QUOTA */
@@ -3524,7 +3562,8 @@ int ext4_calculate_overhead(struct super_block *sb)
*/
if (sbi->s_journal && !sbi->journal_bdev)
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
- else if (ext4_has_feature_journal(sb) && !sbi->s_journal) {
+ else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
+ /* j_inum for internal journal is non-zero */
j_inode = ext4_get_journal_inode(sb, j_inum);
if (j_inode) {
j_blocks = j_inode->i_size >> sb->s_blocksize_bits;
@@ -3540,37 +3579,6 @@ int ext4_calculate_overhead(struct super_block *sb)
return 0;
}
-static void ext4_clamp_want_extra_isize(struct super_block *sb)
-{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
-
- /* determine the minimum size of new large inodes, if present */
- if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE &&
- sbi->s_want_extra_isize == 0) {
- sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
- EXT4_GOOD_OLD_INODE_SIZE;
- if (ext4_has_feature_extra_isize(sb)) {
- if (sbi->s_want_extra_isize <
- le16_to_cpu(es->s_want_extra_isize))
- sbi->s_want_extra_isize =
- le16_to_cpu(es->s_want_extra_isize);
- if (sbi->s_want_extra_isize <
- le16_to_cpu(es->s_min_extra_isize))
- sbi->s_want_extra_isize =
- le16_to_cpu(es->s_min_extra_isize);
- }
- }
- /* Check if enough inode space is available */
- if (EXT4_GOOD_OLD_INODE_SIZE + sbi->s_want_extra_isize >
- sbi->s_inode_size) {
- sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
- EXT4_GOOD_OLD_INODE_SIZE;
- ext4_msg(sb, KERN_INFO,
- "required extra inode space not available");
- }
-}
-
static void ext4_set_resv_clusters(struct super_block *sb)
{
ext4_fsblk_t resv_clusters;
@@ -3605,9 +3613,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
{
struct dax_device *dax_dev = fs_dax_get_by_bdev(sb->s_bdev);
char *orig_data = kstrdup(data, GFP_KERNEL);
- struct buffer_head *bh;
+ struct buffer_head *bh, **group_desc;
struct ext4_super_block *es = NULL;
struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+ struct flex_groups **flex_groups;
ext4_fsblk_t block;
ext4_fsblk_t sb_block = get_sb_block(&data);
ext4_fsblk_t logical_sb_block;
@@ -3778,6 +3787,75 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
*/
sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
+ blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
+ if (blocksize < EXT4_MIN_BLOCK_SIZE ||
+ blocksize > EXT4_MAX_BLOCK_SIZE) {
+ ext4_msg(sb, KERN_ERR,
+ "Unsupported filesystem blocksize %d (%d log_block_size)",
+ blocksize, le32_to_cpu(es->s_log_block_size));
+ goto failed_mount;
+ }
+
+ if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
+ sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
+ sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
+ } else {
+ sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
+ sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
+ if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
+ ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
+ sbi->s_first_ino);
+ goto failed_mount;
+ }
+ if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
+ (!is_power_of_2(sbi->s_inode_size)) ||
+ (sbi->s_inode_size > blocksize)) {
+ ext4_msg(sb, KERN_ERR,
+ "unsupported inode size: %d",
+ sbi->s_inode_size);
+ ext4_msg(sb, KERN_ERR, "blocksize: %d", blocksize);
+ goto failed_mount;
+ }
+ /*
+ * i_atime_extra is the last extra field available for
+ * [acm]times in struct ext4_inode. Checking for that
+ * field should suffice to ensure we have extra space
+ * for all three.
+ */
+ if (sbi->s_inode_size >= offsetof(struct ext4_inode, i_atime_extra) +
+ sizeof(((struct ext4_inode *)0)->i_atime_extra)) {
+ sb->s_time_gran = 1;
+ } else {
+ sb->s_time_gran = NSEC_PER_SEC;
+ }
+ }
+ if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE) {
+ sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
+ EXT4_GOOD_OLD_INODE_SIZE;
+ if (ext4_has_feature_extra_isize(sb)) {
+ unsigned v, max = (sbi->s_inode_size -
+ EXT4_GOOD_OLD_INODE_SIZE);
+
+ v = le16_to_cpu(es->s_want_extra_isize);
+ if (v > max) {
+ ext4_msg(sb, KERN_ERR,
+ "bad s_want_extra_isize: %d", v);
+ goto failed_mount;
+ }
+ if (sbi->s_want_extra_isize < v)
+ sbi->s_want_extra_isize = v;
+
+ v = le16_to_cpu(es->s_min_extra_isize);
+ if (v > max) {
+ ext4_msg(sb, KERN_ERR,
+ "bad s_min_extra_isize: %d", v);
+ goto failed_mount;
+ }
+ if (sbi->s_want_extra_isize < v)
+ sbi->s_want_extra_isize = v;
+ }
+ }
+
if (sbi->s_es->s_mount_opts[0]) {
char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts,
sizeof(sbi->s_es->s_mount_opts),
@@ -3936,14 +4014,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (!ext4_feature_set_ok(sb, (sb_rdonly(sb))))
goto failed_mount;
- blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size);
- if (blocksize < EXT4_MIN_BLOCK_SIZE ||
- blocksize > EXT4_MAX_BLOCK_SIZE) {
- ext4_msg(sb, KERN_ERR,
- "Unsupported filesystem blocksize %d (%d log_block_size)",
- blocksize, le32_to_cpu(es->s_log_block_size));
- goto failed_mount;
- }
if (le32_to_cpu(es->s_log_block_size) >
(EXT4_MAX_BLOCK_LOG_SIZE - EXT4_MIN_BLOCK_LOG_SIZE)) {
ext4_msg(sb, KERN_ERR,
@@ -4016,29 +4086,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
has_huge_files);
sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files);
- if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) {
- sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE;
- sbi->s_first_ino = EXT4_GOOD_OLD_FIRST_INO;
- } else {
- sbi->s_inode_size = le16_to_cpu(es->s_inode_size);
- sbi->s_first_ino = le32_to_cpu(es->s_first_ino);
- if (sbi->s_first_ino < EXT4_GOOD_OLD_FIRST_INO) {
- ext4_msg(sb, KERN_ERR, "invalid first ino: %u",
- sbi->s_first_ino);
- goto failed_mount;
- }
- if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
- (!is_power_of_2(sbi->s_inode_size)) ||
- (sbi->s_inode_size > blocksize)) {
- ext4_msg(sb, KERN_ERR,
- "unsupported inode size: %d",
- sbi->s_inode_size);
- goto failed_mount;
- }
- if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
- sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
- }
-
sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
if (ext4_has_feature_64bit(sb)) {
if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
@@ -4061,7 +4108,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
sbi->s_inodes_per_group > blocksize * 8) {
ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n",
- sbi->s_blocks_per_group);
+ sbi->s_inodes_per_group);
goto failed_mount;
}
sbi->s_itb_per_group = sbi->s_inodes_per_group /
@@ -4190,9 +4237,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
EXT4_BLOCKS_PER_GROUP(sb) - 1);
do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
- ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
+ ext4_msg(sb, KERN_WARNING, "groups count too large: %llu "
"(block count %llu, first data block %u, "
- "blocks per group %lu)", sbi->s_groups_count,
+ "blocks per group %lu)", blocks_count,
ext4_blocks_count(es),
le32_to_cpu(es->s_first_data_block),
EXT4_BLOCKS_PER_GROUP(sb));
@@ -4220,9 +4267,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
}
- sbi->s_group_desc = kvmalloc_array(db_count,
- sizeof(struct buffer_head *),
- GFP_KERNEL);
+ rcu_assign_pointer(sbi->s_group_desc,
+ kvmalloc_array(db_count,
+ sizeof(struct buffer_head *),
+ GFP_KERNEL));
if (sbi->s_group_desc == NULL) {
ext4_msg(sb, KERN_ERR, "not enough memory");
ret = -ENOMEM;
@@ -4234,18 +4282,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
/* Pre-read the descriptors into the buffer cache */
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
- sb_breadahead(sb, block);
+ sb_breadahead_unmovable(sb, block);
}
for (i = 0; i < db_count; i++) {
+ struct buffer_head *bh;
+
block = descriptor_loc(sb, logical_sb_block, i);
- sbi->s_group_desc[i] = sb_bread_unmovable(sb, block);
- if (!sbi->s_group_desc[i]) {
+ bh = sb_bread_unmovable(sb, block);
+ if (!bh) {
ext4_msg(sb, KERN_ERR,
"can't read group descriptor %d", i);
db_count = i;
goto failed_mount2;
}
+ rcu_read_lock();
+ rcu_dereference(sbi->s_group_desc)[i] = bh;
+ rcu_read_unlock();
}
sbi->s_gdb_count = db_count;
if (!ext4_check_descriptors(sb, logical_sb_block, &first_not_zeroed)) {
@@ -4486,8 +4539,6 @@ no_journal:
} else if (ret)
goto failed_mount4a;
- ext4_clamp_want_extra_isize(sb);
-
ext4_set_resv_clusters(sb);
err = ext4_setup_system_zone(sb);
@@ -4525,7 +4576,7 @@ no_journal:
err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
GFP_KERNEL);
if (!err)
- err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem);
+ err = percpu_init_rwsem(&sbi->s_writepages_rwsem);
if (err) {
ext4_msg(sb, KERN_ERR, "insufficient memory");
@@ -4613,13 +4664,19 @@ failed_mount7:
ext4_unregister_li_request(sb);
failed_mount6:
ext4_mb_release(sb);
- if (sbi->s_flex_groups)
- kvfree(sbi->s_flex_groups);
+ rcu_read_lock();
+ flex_groups = rcu_dereference(sbi->s_flex_groups);
+ if (flex_groups) {
+ for (i = 0; i < sbi->s_flex_groups_allocated; i++)
+ kvfree(flex_groups[i]);
+ kvfree(flex_groups);
+ }
+ rcu_read_unlock();
percpu_counter_destroy(&sbi->s_freeclusters_counter);
percpu_counter_destroy(&sbi->s_freeinodes_counter);
percpu_counter_destroy(&sbi->s_dirs_counter);
percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
- percpu_free_rwsem(&sbi->s_journal_flag_rwsem);
+ percpu_free_rwsem(&sbi->s_writepages_rwsem);
failed_mount5:
ext4_ext_release(sb);
ext4_release_system_zone(sb);
@@ -4648,9 +4705,12 @@ failed_mount3:
if (sbi->s_mmp_tsk)
kthread_stop(sbi->s_mmp_tsk);
failed_mount2:
+ rcu_read_lock();
+ group_desc = rcu_dereference(sbi->s_group_desc);
for (i = 0; i < db_count; i++)
- brelse(sbi->s_group_desc[i]);
- kvfree(sbi->s_group_desc);
+ brelse(group_desc[i]);
+ kvfree(group_desc);
+ rcu_read_unlock();
failed_mount:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
@@ -5275,8 +5335,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- ext4_clamp_want_extra_isize(sb);
-
if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^
test_opt(sb, JOURNAL_CHECKSUM)) {
ext4_msg(sb, KERN_ERR, "changing journal_checksum "
@@ -5297,12 +5355,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
goto restore_opts;
}
- if (test_opt(sb, DAX)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "both data=journal and dax");
- err = -EINVAL;
- goto restore_opts;
- }
} else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) {
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
@@ -5318,12 +5370,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
- if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
- ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
- "dax flag with busy inodes while remounting");
- sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
- }
-
if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
ext4_abort(sb, "Abort forced by user");
@@ -5514,9 +5560,15 @@ static int ext4_statfs_project(struct super_block *sb,
return PTR_ERR(dquot);
spin_lock(&dquot->dq_dqb_lock);
- limit = (dquot->dq_dqb.dqb_bsoftlimit ?
- dquot->dq_dqb.dqb_bsoftlimit :
- dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
+ limit = 0;
+ if (dquot->dq_dqb.dqb_bsoftlimit &&
+ (!limit || dquot->dq_dqb.dqb_bsoftlimit < limit))
+ limit = dquot->dq_dqb.dqb_bsoftlimit;
+ if (dquot->dq_dqb.dqb_bhardlimit &&
+ (!limit || dquot->dq_dqb.dqb_bhardlimit < limit))
+ limit = dquot->dq_dqb.dqb_bhardlimit;
+ limit >>= sb->s_blocksize_bits;
+
if (limit && buf->f_blocks > limit) {
curblock = (dquot->dq_dqb.dqb_curspace +
dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
@@ -5526,9 +5578,14 @@ static int ext4_statfs_project(struct super_block *sb,
(buf->f_blocks - curblock) : 0;
}
- limit = dquot->dq_dqb.dqb_isoftlimit ?
- dquot->dq_dqb.dqb_isoftlimit :
- dquot->dq_dqb.dqb_ihardlimit;
+ limit = 0;
+ if (dquot->dq_dqb.dqb_isoftlimit &&
+ (!limit || dquot->dq_dqb.dqb_isoftlimit < limit))
+ limit = dquot->dq_dqb.dqb_isoftlimit;
+ if (dquot->dq_dqb.dqb_ihardlimit &&
+ (!limit || dquot->dq_dqb.dqb_ihardlimit < limit))
+ limit = dquot->dq_dqb.dqb_ihardlimit;
+
if (limit && buf->f_files > limit) {
buf->f_files = limit;
buf->f_ffree =
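The open-coded selection above picks the smallest non-zero of the soft and hard limits. It is the same computation the f2fs hunk later in this diff writes with min_not_zero() from <linux/kernel.h>; using that macro here, the block-limit half would collapse to:

	limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
			     dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;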
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 491f9ee4040e..894a61010ae9 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -1820,8 +1820,11 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
if (EXT4_I(inode)->i_file_acl) {
/* The inode already has an extended attribute block. */
bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO);
- if (IS_ERR(bs->bh))
- return PTR_ERR(bs->bh);
+ if (IS_ERR(bs->bh)) {
+ error = PTR_ERR(bs->bh);
+ bs->bh = NULL;
+ return error;
+ }
ea_bdebug(bs->bh, "b_count=%d, refcount=%d",
atomic_read(&(bs->bh->b_count)),
le32_to_cpu(BHDR(bs->bh)->h_refcount));
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 7d9fce215f46..9889d9526ef9 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -896,8 +896,8 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
int i;
int err;
- sbi->ckpt = f2fs_kzalloc(sbi, array_size(blk_size, cp_blks),
- GFP_KERNEL);
+ sbi->ckpt = f2fs_kvzalloc(sbi, array_size(blk_size, cp_blks),
+ GFP_KERNEL);
if (!sbi->ckpt)
return -ENOMEM;
/*
@@ -1257,20 +1257,20 @@ static void unblock_operations(struct f2fs_sb_info *sbi)
f2fs_unlock_all(sbi);
}
-void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
+void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
{
DEFINE_WAIT(wait);
for (;;) {
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (!get_pages(sbi, F2FS_WB_CP_DATA))
+ if (!get_pages(sbi, type))
break;
if (unlikely(f2fs_cp_error(sbi)))
break;
- io_schedule_timeout(5*HZ);
+ io_schedule_timeout(HZ/50);
}
finish_wait(&sbi->cp_wait, &wait);
}
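Parameterizing the wait by page-counter type lets do_checkpoint() wait in two distinct phases instead of asserting via the removed f2fs_bug_on() pair, which raced with writeback still submitting pages. The call sequence used below:

	/* phase 1: wait until no dirty meta pages remain unsubmitted */
	f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META);
	/* phase 2: wait for the submitted checkpoint writeback to drain */
	f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);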
@@ -1392,8 +1392,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Flush all the NAT/SIT pages */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
- f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
- !f2fs_cp_error(sbi));
/*
* modify checkpoint
@@ -1501,11 +1499,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Here, we have one bio having CP pack except cp pack 2 page */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
- f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
- !f2fs_cp_error(sbi));
+ /* Wait for all dirty meta pages to be submitted for IO */
+ f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META);
/* wait for previous submitted meta pages writeback */
- f2fs_wait_on_all_pages_writeback(sbi);
+ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
/* flush all device cache */
err = f2fs_flush_device_cache(sbi);
@@ -1514,7 +1512,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* barrier and flush checkpoint cp pack 2 page if it can */
commit_checkpoint(sbi, ckpt, start_blk);
- f2fs_wait_on_all_pages_writeback(sbi);
+ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
/*
* invalidate intermediate page cache borrowed from meta inode
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 85f3879a31cb..facfcdbb8355 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -949,19 +949,6 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
int err = 0;
bool direct_io = iocb->ki_flags & IOCB_DIRECT;
- /* convert inline data for Direct I/O*/
- if (direct_io) {
- err = f2fs_convert_inline_inode(inode);
- if (err)
- return err;
- }
-
- if (direct_io && allow_outplace_dio(inode, iocb, from))
- return 0;
-
- if (is_inode_flag_set(inode, FI_NO_PREALLOC))
- return 0;
-
map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from));
if (map.m_len > map.m_lblk)
@@ -1955,7 +1942,7 @@ static int __write_data_page(struct page *page, bool *submitted,
loff_t i_size = i_size_read(inode);
const pgoff_t end_index = ((unsigned long long) i_size)
>> PAGE_SHIFT;
- loff_t psize = (page->index + 1) << PAGE_SHIFT;
+ loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT;
unsigned offset = 0;
bool need_balance_fs = false;
int err = 0;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index b545beb8b04e..5a07de66b334 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -99,6 +99,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000
#define F2FS_MOUNT_RESERVE_ROOT 0x01000000
#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000
+#define F2FS_MOUNT_NORECOVERY 0x04000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -2690,6 +2691,20 @@ static inline void clear_file(struct inode *inode, int type)
f2fs_mark_inode_dirty_sync(inode, true);
}
+static inline bool f2fs_is_time_consistent(struct inode *inode)
+{
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
+ return false;
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime))
+ return false;
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
+ return false;
+ if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3,
+ &F2FS_I(inode)->i_crtime))
+ return false;
+ return true;
+}
+
static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
{
bool ret;
@@ -2707,14 +2722,7 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
i_size_read(inode) & ~PAGE_MASK)
return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time, &inode->i_atime))
- return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 1, &inode->i_ctime))
- return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 2, &inode->i_mtime))
- return false;
- if (!timespec64_equal(F2FS_I(inode)->i_disk_time + 3,
- &F2FS_I(inode)->i_crtime))
+ if (!f2fs_is_time_consistent(inode))
return false;
down_read(&F2FS_I(inode)->i_sem);
@@ -2766,18 +2774,12 @@ static inline bool f2fs_may_extent_tree(struct inode *inode)
static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi,
size_t size, gfp_t flags)
{
- void *ret;
-
if (time_to_inject(sbi, FAULT_KMALLOC)) {
f2fs_show_injection_info(FAULT_KMALLOC);
return NULL;
}
- ret = kmalloc(size, flags);
- if (ret)
- return ret;
-
- return kvmalloc(size, flags);
+ return kmalloc(size, flags);
}
static inline void *f2fs_kzalloc(struct f2fs_sb_info *sbi,
@@ -3158,7 +3160,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi);
void f2fs_update_dirty_page(struct inode *inode, struct page *page);
void f2fs_remove_dirty_inode(struct inode *inode);
int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
-void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi);
+void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type);
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi);
int __init f2fs_create_checkpoint_caches(void);
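With its kvmalloc() fallback removed, f2fs_kmalloc() can no longer hand vmalloc'ed memory to callers that kfree() it; allocations that may legitimately be large (the checkpoint block array, nat_bits, the free-nid bitmap) were switched to f2fs_kvzalloc() in the surrounding hunks. That helper is not shown in this diff and is assumed to look roughly like:

static inline void *f2fs_kvmalloc(struct f2fs_sb_info *sbi,
					size_t size, gfp_t flags)
{
	if (time_to_inject(sbi, FAULT_KVMALLOC)) {
		f2fs_show_injection_info(FAULT_KVMALLOC);
		return NULL;
	}
	return kvmalloc(size, flags);
}

static inline void *f2fs_kvzalloc(struct f2fs_sb_info *sbi,
					size_t size, gfp_t flags)
{
	return f2fs_kvmalloc(sbi, size, flags | __GFP_ZERO);
}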
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 33c6e14d0c87..d676a18cb3fd 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -49,7 +49,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
struct page *page = vmf->page;
struct inode *inode = file_inode(vmf->vma->vm_file);
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- struct dnode_of_data dn = { .node_changed = false };
+ struct dnode_of_data dn;
int err;
if (unlikely(f2fs_cp_error(sbi))) {
@@ -57,6 +57,9 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
goto err;
}
+ /* should be done outside of any locked page */
+ f2fs_balance_fs(sbi, true);
+
sb_start_pagefault(inode->i_sb);
f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
@@ -114,8 +117,6 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
out_sem:
up_read(&F2FS_I(inode)->i_mmap_sem);
- f2fs_balance_fs(sbi, dn.node_changed);
-
sb_end_pagefault(inode->i_sb);
err:
return block_page_mkwrite_return(err);
@@ -1125,7 +1126,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
}
dn.ofs_in_node++;
i++;
- new_size = (dst + i) << PAGE_SHIFT;
+ new_size = (loff_t)(dst + i) << PAGE_SHIFT;
if (dst_inode->i_size < new_size)
f2fs_i_size_write(dst_inode, new_size);
} while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR));
@@ -1943,8 +1944,15 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
if (in != F2FS_GOING_DOWN_FULLSYNC) {
ret = mnt_want_write_file(filp);
- if (ret)
+ if (ret) {
+ if (ret == -EROFS) {
+ ret = 0;
+ f2fs_stop_checkpoint(sbi, false);
+ set_sbi_flag(sbi, SBI_IS_SHUTDOWN);
+ trace_f2fs_shutdown(sbi, in, ret);
+ }
return ret;
+ }
}
switch (in) {
@@ -3084,18 +3092,41 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
ret = -EAGAIN;
goto out;
}
- } else {
- preallocated = true;
- target_size = iocb->ki_pos + iov_iter_count(from);
+ goto write;
+ }
- err = f2fs_preallocate_blocks(iocb, from);
- if (err) {
- clear_inode_flag(inode, FI_NO_PREALLOC);
- inode_unlock(inode);
- ret = err;
- goto out;
- }
+ if (is_inode_flag_set(inode, FI_NO_PREALLOC))
+ goto write;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ /*
+ * Convert inline data for Direct I/O before entering
+ * f2fs_direct_IO().
+ */
+ err = f2fs_convert_inline_inode(inode);
+ if (err)
+ goto out_err;
+ /*
+ * If f2fs_force_buffered_io() is true, we have to allocate
+ * blocks all the time, since f2fs_direct_IO will fall
+ * back to buffered IO.
+ */
+ if (!f2fs_force_buffered_io(inode, iocb, from) &&
+ allow_outplace_dio(inode, iocb, from))
+ goto write;
+ }
+ preallocated = true;
+ target_size = iocb->ki_pos + iov_iter_count(from);
+
+ err = f2fs_preallocate_blocks(iocb, from);
+ if (err) {
+out_err:
+ clear_inode_flag(inode, FI_NO_PREALLOC);
+ inode_unlock(inode);
+ ret = err;
+ goto out;
}
+write:
ret = __generic_file_write_iter(iocb, from);
clear_inode_flag(inode, FI_NO_PREALLOC);
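After the restructuring, all preallocation decisions for the buffered/direct write path sit in one place; the resulting control flow, summarized as pseudocode in the function's own terms:

	/*
	 * if (IOCB_NOWAIT)       -> map without waiting, then goto write
	 * if (FI_NO_PREALLOC)    -> goto write
	 * if (IOCB_DIRECT) {
	 *         convert inline data (required before f2fs_direct_IO());
	 *         if (!f2fs_force_buffered_io() && allow_outplace_dio())
	 *                 -> goto write   (DIO allocates out of place itself)
	 * }
	 * f2fs_preallocate_blocks() -> write
	 */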
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index e26995975570..15d09235f1c8 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -627,7 +627,11 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
inode->i_ino == F2FS_META_INO(sbi))
return 0;
- if (!is_inode_flag_set(inode, FI_DIRTY_INODE))
+ /*
+ * atime can be updated without dirtying the f2fs inode in lazytime mode
+ */
+ if (f2fs_is_time_consistent(inode) &&
+ !is_inode_flag_set(inode, FI_DIRTY_INODE))
return 0;
if (f2fs_is_checkpoint_ready(sbi))
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 0f77f9242751..6d44b7a9b0cc 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -782,6 +782,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
if (whiteout) {
f2fs_i_links_write(inode, false);
+ inode->i_state |= I_LINKABLE;
*whiteout = inode;
} else {
d_tmpfile(dentry, inode);
@@ -851,6 +852,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
F2FS_I(old_dentry->d_inode)->i_projid)))
return -EXDEV;
+ if (flags & RENAME_WHITEOUT) {
+ err = f2fs_create_whiteout(old_dir, &whiteout);
+ if (err)
+ return err;
+ }
+
err = dquot_initialize(old_dir);
if (err)
goto out;
@@ -882,17 +889,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
}
- if (flags & RENAME_WHITEOUT) {
- err = f2fs_create_whiteout(old_dir, &whiteout);
- if (err)
- goto out_dir;
- }
-
if (new_inode) {
err = -ENOTEMPTY;
if (old_dir_entry && !f2fs_empty_dir(new_inode))
- goto out_whiteout;
+ goto out_dir;
err = -ENOENT;
new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name,
@@ -900,7 +901,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!new_entry) {
if (IS_ERR(new_page))
err = PTR_ERR(new_page);
- goto out_whiteout;
+ goto out_dir;
}
f2fs_balance_fs(sbi, true);
@@ -932,7 +933,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
err = f2fs_add_link(new_dentry, old_inode);
if (err) {
f2fs_unlock_op(sbi);
- goto out_whiteout;
+ goto out_dir;
}
if (old_dir_entry)
@@ -956,7 +957,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (IS_ERR(old_page))
err = PTR_ERR(old_page);
f2fs_unlock_op(sbi);
- goto out_whiteout;
+ goto out_dir;
}
}
}
@@ -965,7 +966,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (!old_dir_entry || whiteout)
file_lost_pino(old_inode);
else
- F2FS_I(old_inode)->i_pino = new_dir->i_ino;
+ /* adjust dir's i_pino to pass fsck check */
+ f2fs_i_pino_write(old_inode, new_dir->i_ino);
up_write(&F2FS_I(old_inode)->i_sem);
old_inode->i_ctime = current_time(old_inode);
@@ -974,7 +976,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_delete_entry(old_entry, old_page, old_dir, NULL);
if (whiteout) {
- whiteout->i_state |= I_LINKABLE;
set_inode_flag(whiteout, FI_INC_LINK);
err = f2fs_add_link(old_dentry, whiteout);
if (err)
@@ -1010,15 +1011,14 @@ put_out_dir:
f2fs_unlock_op(sbi);
if (new_page)
f2fs_put_page(new_page, 0);
-out_whiteout:
- if (whiteout)
- iput(whiteout);
out_dir:
if (old_dir_entry)
f2fs_put_page(old_dir_page, 0);
out_old:
f2fs_put_page(old_page, 0);
out:
+ if (whiteout)
+ iput(whiteout);
return err;
}
@@ -1126,7 +1126,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_set_link(old_dir, old_entry, old_page, new_inode);
down_write(&F2FS_I(old_inode)->i_sem);
- file_lost_pino(old_inode);
+ if (!old_dir_entry)
+ file_lost_pino(old_inode);
+ else
+ /* adjust dir's i_pino to pass fsck check */
+ f2fs_i_pino_write(old_inode, new_dir->i_ino);
up_write(&F2FS_I(old_inode)->i_sem);
old_dir->i_ctime = current_time(old_dir);
@@ -1141,7 +1145,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
f2fs_set_link(new_dir, new_entry, new_page, old_inode);
down_write(&F2FS_I(new_inode)->i_sem);
- file_lost_pino(new_inode);
+ if (!new_dir_entry)
+ file_lost_pino(new_inode);
+ else
+ /* adjust dir's i_pino to pass fsck check */
+ f2fs_i_pino_write(new_inode, old_dir->i_ino);
up_write(&F2FS_I(new_inode)->i_sem);
new_dir->i_ctime = current_time(new_dir);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index ad024b7b1d43..75fced457c24 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1565,15 +1565,16 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
if (atomic && !test_opt(sbi, NOBARRIER))
fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
- set_page_writeback(page);
- ClearPageError(page);
-
+ /* must be added to the global fsync list before the page's writeback state is updated */
if (f2fs_in_warm_node_list(sbi, page)) {
seq = f2fs_add_fsync_node_entry(sbi, page);
if (seq_id)
*seq_id = seq;
}
+ set_page_writeback(page);
+ ClearPageError(page);
+
fio.old_blkaddr = ni.blk_addr;
f2fs_do_write_node_page(nid, &fio);
set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
@@ -2887,7 +2888,7 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi)
return 0;
nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8);
- nm_i->nat_bits = f2fs_kzalloc(sbi,
+ nm_i->nat_bits = f2fs_kvzalloc(sbi,
nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, GFP_KERNEL);
if (!nm_i->nat_bits)
return -ENOMEM;
@@ -2970,7 +2971,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
/* not used nids: 0, node, meta, (and root counted as valid node) */
nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
- sbi->nquota_files - F2FS_RESERVED_NODE_NUM;
+ F2FS_RESERVED_NODE_NUM;
nm_i->nid_cnt[FREE_NID] = 0;
nm_i->nid_cnt[PREALLOC_NID] = 0;
nm_i->nat_cnt = 0;
@@ -3020,9 +3021,9 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi)
int i;
nm_i->free_nid_bitmap =
- f2fs_kzalloc(sbi, array_size(sizeof(unsigned char *),
- nm_i->nat_blocks),
- GFP_KERNEL);
+ f2fs_kvzalloc(sbi, array_size(sizeof(unsigned char *),
+ nm_i->nat_blocks),
+ GFP_KERNEL);
if (!nm_i->free_nid_bitmap)
return -ENOMEM;
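The two allocation hunks above swap f2fs_kzalloc() for f2fs_kvzalloc(): bitmaps sized by on-disk geometry can exceed what kmalloc() will reliably deliver under fragmentation. A kernel-style sketch of the underlying idiom, not the f2fs wrappers; kvzalloc() tries kmalloc() first and falls back to vmalloc(), and kvfree() releases either backing store:

#include <linux/bitops.h>
#include <linux/mm.h>

static unsigned long *alloc_block_bitmap(unsigned int nr_blocks)
{
	/* May span many pages; falls back to vmalloc() if not contiguous. */
	return kvzalloc(BITS_TO_LONGS(nr_blocks) * sizeof(unsigned long),
			GFP_KERNEL);
}

static void free_block_bitmap(unsigned long *bitmap)
{
	kvfree(bitmap);	/* correct for both kmalloc and vmalloc memory */
}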
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 01038aff5d8e..b7539664c937 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -409,7 +409,7 @@ static int parse_options(struct super_block *sb, char *options)
break;
case Opt_norecovery:
/* this option mounts f2fs with ro */
- set_opt(sbi, DISABLE_ROLL_FORWARD);
+ set_opt(sbi, NORECOVERY);
if (!f2fs_readonly(sb))
return -EINVAL;
break;
@@ -1086,7 +1086,7 @@ static void f2fs_put_super(struct super_block *sb)
/* our cp_error case, we can wait for any writeback page */
f2fs_flush_merged_writes(sbi);
- f2fs_wait_on_all_pages_writeback(sbi);
+ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
f2fs_bug_on(sbi, sbi->fsync_node_num);
@@ -1191,20 +1191,23 @@ static int f2fs_statfs_project(struct super_block *sb,
return PTR_ERR(dquot);
spin_lock(&dquot->dq_dqb_lock);
- limit = (dquot->dq_dqb.dqb_bsoftlimit ?
- dquot->dq_dqb.dqb_bsoftlimit :
- dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
+ limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit,
+ dquot->dq_dqb.dqb_bhardlimit);
+ if (limit)
+ limit >>= sb->s_blocksize_bits;
+
if (limit && buf->f_blocks > limit) {
- curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
+ curblock = (dquot->dq_dqb.dqb_curspace +
+ dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits;
buf->f_blocks = limit;
buf->f_bfree = buf->f_bavail =
(buf->f_blocks > curblock) ?
(buf->f_blocks - curblock) : 0;
}
- limit = dquot->dq_dqb.dqb_isoftlimit ?
- dquot->dq_dqb.dqb_isoftlimit :
- dquot->dq_dqb.dqb_ihardlimit;
+ limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit,
+ dquot->dq_dqb.dqb_ihardlimit);
+
if (limit && buf->f_files > limit) {
buf->f_files = limit;
buf->f_ffree =
@@ -1249,8 +1252,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
else
buf->f_bavail = 0;
- avail_node_count = sbi->total_node_count - sbi->nquota_files -
- F2FS_RESERVED_NODE_NUM;
+ avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
if (avail_node_count > user_block_count) {
buf->f_files = user_block_count;
@@ -1325,6 +1327,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
}
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
+ if (test_opt(sbi, NORECOVERY))
+ seq_puts(seq, ",norecovery");
if (test_opt(sbi, DISCARD))
seq_puts(seq, ",discard");
if (test_opt(sbi, NOHEAP))
@@ -1793,6 +1797,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
int offset = off & (sb->s_blocksize - 1);
size_t towrite = len;
struct page *page;
+ void *fsdata = NULL;
char *kaddr;
int err = 0;
int tocopy;
@@ -1802,7 +1807,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
towrite);
retry:
err = a_ops->write_begin(NULL, mapping, off, tocopy, 0,
- &page, NULL);
+ &page, &fsdata);
if (unlikely(err)) {
if (err == -ENOMEM) {
congestion_wait(BLK_RW_ASYNC, HZ/50);
@@ -1818,7 +1823,7 @@ retry:
flush_dcache_page(page);
a_ops->write_end(NULL, mapping, off, tocopy, tocopy,
- page, NULL);
+ page, fsdata);
offset = 0;
towrite -= tocopy;
off += tocopy;
@@ -2444,6 +2449,12 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
size_t crc_offset = 0;
__u32 crc = 0;
+ if (le32_to_cpu(raw_super->magic) != F2FS_SUPER_MAGIC) {
+ f2fs_msg(sb, KERN_INFO, "Magic Mismatch, valid(0x%x) - read(0x%x)",
+ F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic));
+ return -EINVAL;
+ }
+
/* Check checksum_offset and crc in superblock */
if (__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_SB_CHKSUM)) {
crc_offset = le32_to_cpu(raw_super->checksum_offset);
@@ -2452,29 +2463,22 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
f2fs_msg(sb, KERN_INFO,
"Invalid SB checksum offset: %zu",
crc_offset);
- return 1;
+ return -EFSCORRUPTED;
}
crc = le32_to_cpu(raw_super->crc);
if (!f2fs_crc_valid(sbi, crc, raw_super, crc_offset)) {
f2fs_msg(sb, KERN_INFO,
"Invalid SB checksum value: %u", crc);
- return 1;
+ return -EFSCORRUPTED;
}
}
- if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) {
- f2fs_msg(sb, KERN_INFO,
- "Magic Mismatch, valid(0x%x) - read(0x%x)",
- F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic));
- return 1;
- }
-
/* Currently, support only 4KB page cache size */
if (F2FS_BLKSIZE != PAGE_SIZE) {
f2fs_msg(sb, KERN_INFO,
"Invalid page_cache_size (%lu), supports only 4KB",
PAGE_SIZE);
- return 1;
+ return -EFSCORRUPTED;
}
/* Currently, support only 4KB block size */
@@ -2483,7 +2487,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
f2fs_msg(sb, KERN_INFO,
"Invalid blocksize (%u), supports only 4KB",
blocksize);
- return 1;
+ return -EFSCORRUPTED;
}
/* check log blocks per segment */
@@ -2491,7 +2495,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
f2fs_msg(sb, KERN_INFO,
"Invalid log blocks per segment (%u)",
le32_to_cpu(raw_super->log_blocks_per_seg));
- return 1;
+ return -EFSCORRUPTED;
}
/* Currently, support 512/1024/2048/4096 bytes sector size */
@@ -2501,7 +2505,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
F2FS_MIN_LOG_SECTOR_SIZE) {
f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)",
le32_to_cpu(raw_super->log_sectorsize));
- return 1;
+ return -EFSCORRUPTED;
}
if (le32_to_cpu(raw_super->log_sectors_per_block) +
le32_to_cpu(raw_super->log_sectorsize) !=
@@ -2510,7 +2514,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
"Invalid log sectors per block(%u) log sectorsize(%u)",
le32_to_cpu(raw_super->log_sectors_per_block),
le32_to_cpu(raw_super->log_sectorsize));
- return 1;
+ return -EFSCORRUPTED;
}
segment_count = le32_to_cpu(raw_super->segment_count);
@@ -2526,7 +2530,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
f2fs_msg(sb, KERN_INFO,
"Invalid segment count (%u)",
segment_count);
- return 1;
+ return -EFSCORRUPTED;
}
if (total_sections > segment_count ||
@@ -2535,28 +2539,28 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
f2fs_msg(sb, KERN_INFO,
"Invalid segment/section count (%u, %u x %u)",
segment_count, total_sections, segs_per_sec);
- return 1;
+ return -EFSCORRUPTED;
}
if ((segment_count / segs_per_sec) < total_sections) {
f2fs_msg(sb, KERN_INFO,
"Small segment_count (%u < %u * %u)",
segment_count, segs_per_sec, total_sections);
- return 1;
+ return -EFSCORRUPTED;
}
if (segment_count > (le64_to_cpu(raw_super->block_count) >> 9)) {
f2fs_msg(sb, KERN_INFO,
"Wrong segment_count / block_count (%u > %llu)",
segment_count, le64_to_cpu(raw_super->block_count));
- return 1;
+ return -EFSCORRUPTED;
}
if (secs_per_zone > total_sections || !secs_per_zone) {
f2fs_msg(sb, KERN_INFO,
"Wrong secs_per_zone / total_sections (%u, %u)",
secs_per_zone, total_sections);
- return 1;
+ return -EFSCORRUPTED;
}
if (le32_to_cpu(raw_super->extension_count) > F2FS_MAX_EXTENSION ||
raw_super->hot_ext_count > F2FS_MAX_EXTENSION ||
@@ -2567,7 +2571,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
le32_to_cpu(raw_super->extension_count),
raw_super->hot_ext_count,
F2FS_MAX_EXTENSION);
- return 1;
+ return -EFSCORRUPTED;
}
if (le32_to_cpu(raw_super->cp_payload) >
@@ -2576,7 +2580,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
"Insane cp_payload (%u > %u)",
le32_to_cpu(raw_super->cp_payload),
blocks_per_seg - F2FS_CP_PACKS);
- return 1;
+ return -EFSCORRUPTED;
}
/* check reserved ino info */
@@ -2588,12 +2592,12 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi,
le32_to_cpu(raw_super->node_ino),
le32_to_cpu(raw_super->meta_ino),
le32_to_cpu(raw_super->root_ino));
- return 1;
+ return -EFSCORRUPTED;
}
/* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
if (sanity_check_area_boundary(sbi, bh))
- return 1;
+ return -EFSCORRUPTED;
return 0;
}
@@ -2655,8 +2659,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi)
}
valid_node_count = le32_to_cpu(ckpt->valid_node_count);
- avail_node_count = sbi->total_node_count - sbi->nquota_files -
- F2FS_RESERVED_NODE_NUM;
+ avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
if (valid_node_count > avail_node_count) {
f2fs_msg(sbi->sb, KERN_ERR,
"Wrong valid_node_count: %u, avail_node_count: %u",
@@ -2844,7 +2847,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi)
if (nr_sectors & (bdev_zone_sectors(bdev) - 1))
FDEV(devi).nr_blkz++;
- FDEV(devi).blkz_seq = f2fs_kzalloc(sbi,
+ FDEV(devi).blkz_seq = f2fs_kvzalloc(sbi,
BITS_TO_LONGS(FDEV(devi).nr_blkz)
* sizeof(unsigned long),
GFP_KERNEL);
@@ -2918,11 +2921,11 @@ static int read_raw_super_block(struct f2fs_sb_info *sbi,
}
/* sanity checking of raw super */
- if (sanity_check_raw_super(sbi, bh)) {
+ err = sanity_check_raw_super(sbi, bh);
+ if (err) {
f2fs_msg(sb, KERN_ERR,
"Can't find valid F2FS filesystem in %dth superblock",
block + 1);
- err = -EFSCORRUPTED;
brelse(bh);
continue;
}
@@ -3400,7 +3403,8 @@ try_onemore:
goto reset_checkpoint;
/* recover fsynced data */
- if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
+ if (!test_opt(sbi, DISABLE_ROLL_FORWARD) &&
+ !test_opt(sbi, NORECOVERY)) {
/*
* mount should be failed, when device has readonly mode, and
* previous checkpoint was not done by clean system shutdown.
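sanity_check_raw_super() used to return a bare 1 and let the caller substitute -EFSCORRUPTED; the hunks above make it return a negative errno directly, which read_raw_super_block() now propagates unchanged. A self-contained sketch of that convention; EFSCORRUPTED aliases EUCLEAN as in the kernel, and the helper names are illustrative:

#include <errno.h>
#include <stdio.h>

#define EFSCORRUPTED EUCLEAN	/* mirrors the kernel's definition */

/* Return 0 or a negative errno, never a bare truth value. */
static int sanity_check(unsigned int blocksize)
{
	if (blocksize != 4096)
		return -EFSCORRUPTED;
	return 0;
}

static int read_super(unsigned int blocksize)
{
	int err = sanity_check(blocksize);

	if (err) {
		fprintf(stderr, "invalid superblock: %d\n", err);
		return err;	/* propagate the check's own code */
	}
	return 0;
}

int main(void)
{
	return read_super(512) ? 1 : 0;
}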
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 729f46a3c9ee..85e3ed67e382 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -679,10 +679,12 @@ int __init f2fs_init_sysfs(void)
ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype,
NULL, "features");
- if (ret)
+ if (ret) {
+ kobject_put(&f2fs_feat);
kset_unregister(&f2fs_kset);
- else
+ } else {
f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
+ }
return ret;
}
@@ -703,8 +705,11 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi)
init_completion(&sbi->s_kobj_unregister);
err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL,
"%s", sb->s_id);
- if (err)
+ if (err) {
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
return err;
+ }
if (f2fs_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root);
@@ -732,4 +737,5 @@ void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi)
remove_proc_entry(sbi->sb->s_id, f2fs_proc_root);
}
kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
}
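All three sysfs.c hunks apply the same kobject rule: once kobject_init_and_add() has been called, the kobject holds a reference even when the call fails, so the error path must drop it with kobject_put() (which ends up invoking ->release) rather than freeing the containing object by hand. A kernel-style sketch; the name and parent are illustrative:

#include <linux/kobject.h>

static int register_example(struct kobject *kobj, struct kobj_type *ktype,
			    struct kobject *parent)
{
	int err = kobject_init_and_add(kobj, ktype, parent, "example");

	if (err) {
		/* The kobject is live even on failure: put, don't free. */
		kobject_put(kobj);
		return err;
	}
	return 0;
}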
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 963242018663..b5e49f2565f5 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -535,8 +535,9 @@ out:
ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
struct inode *inode = d_inode(dentry);
+ nid_t xnid = F2FS_I(inode)->i_xattr_nid;
struct f2fs_xattr_entry *entry;
- void *base_addr;
+ void *base_addr, *last_base_addr;
int error = 0;
size_t rest = buffer_size;
@@ -546,6 +547,8 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (error)
return error;
+ last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode);
+
list_for_each_xattr(entry, base_addr) {
const struct xattr_handler *handler =
f2fs_xattr_handler(entry->e_name_index);
@@ -553,6 +556,15 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
size_t prefix_len;
size_t size;
+ if ((void *)(entry) + sizeof(__u32) > last_base_addr ||
+ (void *)XATTR_NEXT_ENTRY(entry) > last_base_addr) {
+ f2fs_msg(F2FS_I_SB(inode)->sb, KERN_ERR, "inode (%lu) has corrupted xattr",
+ inode->i_ino);
+ set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
+ error = -EFSCORRUPTED;
+ goto cleanup;
+ }
+
if (!handler || (handler->list && !handler->list(dentry)))
continue;
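The listxattr fix computes last_base_addr up front and refuses to dereference or advance past it, so a corrupted entry length can no longer walk off the end of the mapped xattr area. The same guard over a simplified length-prefixed record list; the layout is illustrative, not the f2fs xattr format:

#include <stdio.h>

/* Each record: one length byte, then that many name bytes; zero ends it. */
static int list_entries(const char *base, const char *end)
{
	const char *p = base;

	while (p < end && *p) {
		unsigned char name_len = (unsigned char)*p;
		const char *next = p + 1 + name_len;

		if (next > end)
			return -1;	/* corrupted: record overruns the buffer */
		printf("%.*s\n", name_len, p + 1);
		p = next;
	}
	return 0;
}

int main(void)
{
	char buf[] = { 3, 'f', 'o', 'o', 2, 'h', 'i', 0 };
	return list_entries(buf, buf + sizeof(buf)) ? 1 : 0;
}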
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 05689198f5af..45b4aa8878c3 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -744,6 +744,13 @@ static struct inode *fat_alloc_inode(struct super_block *sb)
return NULL;
init_rwsem(&ei->truncate_lock);
+ /* Zeroing to allow iput() even on a partially initialized inode. */
+ ei->mmu_private = 0;
+ ei->i_start = 0;
+ ei->i_logstart = 0;
+ ei->i_attrs = 0;
+ ei->i_pos = 0;
+
return &ei->vfs_inode;
}
@@ -1368,16 +1375,6 @@ out:
return 0;
}
-static void fat_dummy_inode_init(struct inode *inode)
-{
- /* Initialize this dummy inode to work as no-op. */
- MSDOS_I(inode)->mmu_private = 0;
- MSDOS_I(inode)->i_start = 0;
- MSDOS_I(inode)->i_logstart = 0;
- MSDOS_I(inode)->i_attrs = 0;
- MSDOS_I(inode)->i_pos = 0;
-}
-
static int fat_read_root(struct inode *inode)
{
struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
@@ -1517,6 +1514,12 @@ static int fat_read_bpb(struct super_block *sb, struct fat_boot_sector *b,
goto out;
}
+ if (bpb->fat_fat_length == 0 && bpb->fat32_length == 0) {
+ if (!silent)
+ fat_msg(sb, KERN_ERR, "bogus number of FAT sectors");
+ goto out;
+ }
+
error = 0;
out:
@@ -1831,13 +1834,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, int isvfat,
fat_inode = new_inode(sb);
if (!fat_inode)
goto out_fail;
- fat_dummy_inode_init(fat_inode);
sbi->fat_inode = fat_inode;
fsinfo_inode = new_inode(sb);
if (!fsinfo_inode)
goto out_fail;
- fat_dummy_inode_init(fsinfo_inode);
fsinfo_inode->i_ino = MSDOS_FSINFO_INO;
sbi->fsinfo_inode = fsinfo_inode;
insert_inode_hash(fsinfo_inode);
diff --git a/fs/file.c b/fs/file.c
index 3da91a112bab..e5d328335f88 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -70,7 +70,7 @@ static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
*/
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
- unsigned int cpy, set;
+ size_t cpy, set;
BUG_ON(nfdt->max_fds < ofdt->max_fds);
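copy_fdtable() widens cpy and set from unsigned int to size_t: a byte count derived from max_fds can exceed 32 bits on 64-bit kernels, and the truncated value would under-copy the table. A short demonstration of the wrap being avoided, using a merely plausible fd count:

#include <stddef.h>
#include <stdio.h>

int main(void)
{
	unsigned int max_fds = 1U << 29;	/* a very large fd table */
	unsigned int narrow = max_fds * (unsigned int)sizeof(void *);
	size_t wide = (size_t)max_fds * sizeof(void *);

	/* On LP64, narrow wraps to 0 while wide is 4 GiB. */
	printf("unsigned int: %u bytes, size_t: %zu bytes\n", narrow, wide);
	return 0;
}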
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 9135646e41ac..5e1a19013373 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -271,7 +271,9 @@ struct file_system_type *get_fs_type(const char *name)
fs = __get_fs_type(name, len);
if (!fs && (request_module("fs-%.*s", len, name) == 0)) {
fs = __get_fs_type(name, len);
- WARN_ONCE(!fs, "request_module fs-%.*s succeeded, but still no fs?\n", len, name);
+ if (!fs)
+ pr_warn_once("request_module fs-%.*s succeeded, but still no fs?\n",
+ len, name);
}
if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9ebfb1b28430..76ff35d3a430 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -270,6 +270,7 @@ void __inode_attach_wb(struct inode *inode, struct page *page)
if (unlikely(cmpxchg(&inode->i_wb, NULL, wb)))
wb_put(wb);
}
+EXPORT_SYMBOL_GPL(__inode_attach_wb);
/**
* locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it
@@ -576,10 +577,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc,
spin_unlock(&inode->i_lock);
/*
- * A dying wb indicates that the memcg-blkcg mapping has changed
- * and a new wb is already serving the memcg. Switch immediately.
+ * A dying wb indicates that either the blkcg associated with the
+ * memcg changed or the associated memcg is dying. In the first
+ * case, a replacement wb should already be available and we should
+ * refresh the wb immediately. In the second case, trying to
+ * refresh will keep failing.
*/
- if (unlikely(wb_dying(wbc->wb)))
+ if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css)))
inode_switch_wbs(inode, wbc->wb_id);
}
@@ -1972,7 +1976,7 @@ void wb_workfn(struct work_struct *work)
struct bdi_writeback, dwork);
long pages_written;
- set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
+ set_worker_desc("flush-%s", bdi_dev_name(wb->bdi));
current->flags |= PF_SWAPWRITE;
if (likely(!current_is_workqueue_rescuer() ||
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 186468fba82e..9b7ddd7f9691 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -905,7 +905,8 @@ static int fuse_check_page(struct page *page)
1 << PG_uptodate |
1 << PG_lru |
1 << PG_active |
- 1 << PG_reclaim))) {
+ 1 << PG_reclaim |
+ 1 << PG_waiters))) {
pr_warn("trying to steal weird page\n");
pr_warn(" page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
return 1;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index dd0f64f7bc06..2d417bf778ef 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -214,7 +214,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
kfree(forget);
if (ret == -ENOMEM)
goto out;
- if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+ if (ret || fuse_invalid_attr(&outarg.attr) ||
+ (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
goto invalid;
forget_all_cached_acls(inode);
@@ -272,6 +273,12 @@ int fuse_valid_type(int m)
S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
}
+bool fuse_invalid_attr(struct fuse_attr *attr)
+{
+ return !fuse_valid_type(attr->mode) ||
+ attr->size > LLONG_MAX;
+}
+
int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name,
struct fuse_entry_out *outarg, struct inode **inode)
{
@@ -303,7 +310,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name
err = -EIO;
if (!outarg->nodeid)
goto out_put_forget;
- if (!fuse_valid_type(outarg->attr.mode))
+ if (fuse_invalid_attr(&outarg->attr))
goto out_put_forget;
*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
@@ -427,7 +434,8 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
goto out_free_ff;
err = -EIO;
- if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid))
+ if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid) ||
+ fuse_invalid_attr(&outentry.attr))
goto out_free_ff;
ff->fh = outopen.fh;
@@ -535,7 +543,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
goto out_put_forget_req;
err = -EIO;
- if (invalid_nodeid(outarg.nodeid))
+ if (invalid_nodeid(outarg.nodeid) || fuse_invalid_attr(&outarg.attr))
goto out_put_forget_req;
if ((outarg.attr.mode ^ mode) & S_IFMT)
@@ -814,7 +822,8 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
spin_lock(&fi->lock);
fi->attr_version = atomic64_inc_return(&fc->attr_version);
- inc_nlink(inode);
+ if (likely(inode->i_nlink < UINT_MAX))
+ inc_nlink(inode);
spin_unlock(&fi->lock);
fuse_invalidate_attr(inode);
fuse_update_ctime(inode);
@@ -894,7 +903,8 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
args.out.args[0].value = &outarg;
err = fuse_simple_request(fc, &args);
if (!err) {
- if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+ if (fuse_invalid_attr(&outarg.attr) ||
+ (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
make_bad_inode(inode);
err = -EIO;
} else {
@@ -1476,6 +1486,19 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
is_truncate = true;
}
+ /* Flush dirty data/metadata before non-truncate SETATTR */
+ if (is_wb && S_ISREG(inode->i_mode) &&
+ attr->ia_valid &
+ (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_MTIME_SET |
+ ATTR_TIMES_SET)) {
+ err = write_inode_now(inode, true);
+ if (err)
+ return err;
+
+ fuse_set_nowrite(inode);
+ fuse_release_nowrite(inode);
+ }
+
if (is_truncate) {
fuse_set_nowrite(inode);
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
@@ -1504,7 +1527,8 @@ int fuse_do_setattr(struct dentry *dentry, struct iattr *attr,
goto error;
}
- if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+ if (fuse_invalid_attr(&outarg.attr) ||
+ (inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
make_bad_inode(inode);
err = -EIO;
goto error;
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index fdff7bf4fa4f..c053266235e2 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -18,6 +18,7 @@
#include <linux/swap.h>
#include <linux/falloc.h>
#include <linux/uio.h>
+#include <linux/fs.h>
static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
int opcode, struct fuse_open_out *outargp)
@@ -201,7 +202,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
{
struct fuse_conn *fc = get_fuse_conn(inode);
int err;
- bool lock_inode = (file->f_flags & O_TRUNC) &&
+ bool is_wb_truncate = (file->f_flags & O_TRUNC) &&
fc->atomic_o_trunc &&
fc->writeback_cache;
@@ -209,16 +210,20 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
if (err)
return err;
- if (lock_inode)
+ if (is_wb_truncate) {
inode_lock(inode);
+ fuse_set_nowrite(inode);
+ }
err = fuse_do_open(fc, get_node_id(inode), file, isdir);
if (!err)
fuse_finish_open(inode, file);
- if (lock_inode)
+ if (is_wb_truncate) {
+ fuse_release_nowrite(inode);
inode_unlock(inode);
+ }
return err;
}
@@ -2002,10 +2007,8 @@ static int fuse_writepages(struct address_space *mapping,
err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
if (data.req) {
- /* Ignore errors if we can write at least one page */
BUG_ON(!data.req->num_pages);
fuse_writepages_send(&data);
- err = 0;
}
if (data.ff)
fuse_file_put(data.ff, false, false);
@@ -2612,7 +2615,16 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
struct iovec *iov = iov_page;
iov->iov_base = (void __user *)arg;
- iov->iov_len = _IOC_SIZE(cmd);
+
+ switch (cmd) {
+ case FS_IOC_GETFLAGS:
+ case FS_IOC_SETFLAGS:
+ iov->iov_len = sizeof(int);
+ break;
+ default:
+ iov->iov_len = _IOC_SIZE(cmd);
+ break;
+ }
if (_IOC_DIR(cmd) & _IOC_WRITE) {
in_iov = iov;
@@ -3143,21 +3155,35 @@ static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (fc->no_copy_file_range)
return -EOPNOTSUPP;
- if (fc->writeback_cache) {
- inode_lock(inode_in);
- err = fuse_writeback_range(inode_in, pos_in, pos_in + len);
- inode_unlock(inode_in);
- if (err)
- return err;
- }
+ inode_lock(inode_in);
+ err = fuse_writeback_range(inode_in, pos_in, pos_in + len - 1);
+ inode_unlock(inode_in);
+ if (err)
+ return err;
inode_lock(inode_out);
- if (fc->writeback_cache) {
- err = fuse_writeback_range(inode_out, pos_out, pos_out + len);
- if (err)
- goto out;
- }
+ /*
+ * Write out dirty pages in the destination file before sending the COPY
+ * request to userspace. After the request is completed, truncate off
+ * pages (including partial ones) from the cache that have been copied,
+ * since these contain stale data at that point.
+ *
+ * This should be mostly correct, but if the COPY writes to partial
+ * pages (at the start or end) and the parts not covered by the COPY are
+ * written through a memory map after calling fuse_writeback_range(),
+ * then these partial page modifications will be lost on truncation.
+ *
+ * It is unlikely that someone would rely on such mixed-style
+ * modifications. Yet this does give fewer guarantees than if the
+ * copying were performed with write(2).
+ *
+ * To fix this, an i_mmap_sem-style lock could be used to prevent new
+ * faults while the copy is ongoing.
+ */
+ err = fuse_writeback_range(inode_out, pos_out, pos_out + len - 1);
+ if (err)
+ goto out;
if (is_unstable)
set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state);
@@ -3178,6 +3204,10 @@ static ssize_t fuse_copy_file_range(struct file *file_in, loff_t pos_in,
if (err)
goto out;
+ truncate_inode_pages_range(inode_out->i_mapping,
+ ALIGN_DOWN(pos_out, PAGE_SIZE),
+ ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1);
+
if (fc->writeback_cache) {
fuse_write_update_size(inode_out, pos_out + outarg.size);
file_update_time(file_out);
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 89bdc41e0d86..28ba17462325 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1008,6 +1008,8 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc);
*/
int fuse_valid_type(int m);
+bool fuse_invalid_attr(struct fuse_attr *attr);
+
/**
* Is current process allowed to perform filesystem operation?
*/
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index b2da3de6a78e..656afd035e21 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -184,7 +184,7 @@ static int fuse_direntplus_link(struct file *file,
if (invalid_nodeid(o->nodeid))
return -EIO;
- if (!fuse_valid_type(o->attr.mode))
+ if (fuse_invalid_attr(&o->attr))
return -EIO;
fc = get_fuse_conn(dir);
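The fuse changes funnel every attribute reply through one predicate, fuse_invalid_attr(), so paths that previously checked only the file type now also reject impossible sizes. A userspace sketch of that shared-validator shape; struct attr stands in for struct fuse_attr:

#include <limits.h>
#include <stdbool.h>
#include <sys/stat.h>

struct attr {
	unsigned int mode;
	unsigned long long size;
};

static bool valid_type(unsigned int m)
{
	return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
	       S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
}

/* One predicate, called from every reply path. */
static bool invalid_attr(const struct attr *a)
{
	return !valid_type(a->mode) || a->size > LLONG_MAX;
}

int main(void)
{
	struct attr a = { .mode = S_IFREG | 0644, .size = 1 };
	return invalid_attr(&a);	/* exits 0: the attribute is sane */
}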
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index b47939900932..d33f1f6edd0a 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -528,10 +528,12 @@ lower_metapath:
/* Advance in metadata tree. */
(mp->mp_list[hgt])++;
- if (mp->mp_list[hgt] >= sdp->sd_inptrs) {
- if (!hgt)
+ if (hgt) {
+ if (mp->mp_list[hgt] >= sdp->sd_inptrs)
+ goto lower_metapath;
+ } else {
+ if (mp->mp_list[hgt] >= sdp->sd_diptrs)
break;
- goto lower_metapath;
}
fill_up_metapath:
@@ -877,10 +879,9 @@ static int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
ret = -ENOENT;
goto unlock;
} else {
- /* report a hole */
iomap->offset = pos;
iomap->length = length;
- goto do_alloc;
+ goto hole_found;
}
}
iomap->length = size;
@@ -934,8 +935,6 @@ unlock:
return ret;
do_alloc:
- iomap->addr = IOMAP_NULL_ADDR;
- iomap->type = IOMAP_HOLE;
if (flags & IOMAP_REPORT) {
if (pos >= size)
ret = -ENOENT;
@@ -957,6 +956,9 @@ do_alloc:
if (pos < size && height == ip->i_height)
ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
}
+hole_found:
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
goto out;
}
@@ -1041,11 +1043,16 @@ static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
unsigned copied, struct page *page,
struct iomap *iomap)
{
+ struct gfs2_trans *tr = current->journal_info;
struct gfs2_inode *ip = GFS2_I(inode);
struct gfs2_sbd *sdp = GFS2_SB(inode);
if (page && !gfs2_is_stuffed(ip))
gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
+
+ if (tr->tr_num_buf_new)
+ __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+
gfs2_trans_end(sdp);
}
@@ -1138,8 +1145,6 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
tr = current->journal_info;
if (tr->tr_num_buf_new)
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
- else
- gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[0]);
gfs2_trans_end(sdp);
}
@@ -1220,8 +1225,16 @@ static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
if (ip->i_qadata && ip->i_qadata->qa_qd_num)
gfs2_quota_unlock(ip);
- gfs2_write_unlock(inode);
+ if (unlikely(!written))
+ goto out_unlock;
+
+ if (iomap->flags & IOMAP_F_SIZE_CHANGED)
+ mark_inode_dirty(inode);
+ set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
+
+out_unlock:
+ gfs2_write_unlock(inode);
out:
return 0;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index d174b1f8fd08..0cdc54258cab 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -6,6 +6,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
+#include <linux/compat.h>
#include <linux/completion.h>
#include <linux/buffer_head.h>
#include <linux/pagemap.h>
@@ -338,6 +339,31 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return -ENOTTY;
}
+#ifdef CONFIG_COMPAT
+static long gfs2_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ /* These are just misnamed; they actually get/put an int from/to user space */
+ case FS_IOC32_GETFLAGS:
+ cmd = FS_IOC_GETFLAGS;
+ break;
+ case FS_IOC32_SETFLAGS:
+ cmd = FS_IOC_SETFLAGS;
+ break;
+ /* Keep this list in sync with gfs2_ioctl */
+ case FITRIM:
+ case FS_IOC_GETFSLABEL:
+ break;
+ default:
+ return -ENOIOCTLCMD;
+ }
+
+ return gfs2_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#else
+#define gfs2_compat_ioctl NULL
+#endif
+
/**
* gfs2_size_hint - Give a hint to the size of a write request
* @filep: The struct file
@@ -791,7 +817,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct gfs2_inode *ip = GFS2_I(inode);
- ssize_t written = 0, ret;
+ ssize_t ret;
ret = gfs2_rsqa_alloc(ip);
if (ret)
@@ -811,68 +837,58 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
inode_lock(inode);
ret = generic_write_checks(iocb, from);
if (ret <= 0)
- goto out;
-
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = inode_to_bdi(inode);
+ goto out_unlock;
ret = file_remove_privs(file);
if (ret)
- goto out2;
+ goto out_unlock;
ret = file_update_time(file);
if (ret)
- goto out2;
+ goto out_unlock;
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = file->f_mapping;
- loff_t pos, endbyte;
- ssize_t buffered;
+ ssize_t buffered, ret2;
- written = gfs2_file_direct_write(iocb, from);
- if (written < 0 || !iov_iter_count(from))
- goto out2;
+ ret = gfs2_file_direct_write(iocb, from);
+ if (ret < 0 || !iov_iter_count(from))
+ goto out_unlock;
- ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
- if (unlikely(ret < 0))
- goto out2;
- buffered = ret;
+ iocb->ki_flags |= IOCB_DSYNC;
+ current->backing_dev_info = inode_to_bdi(inode);
+ buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
+ current->backing_dev_info = NULL;
+ if (unlikely(buffered <= 0))
+ goto out_unlock;
/*
* We need to ensure that the page cache pages are written to
* disk and invalidated to preserve the expected O_DIRECT
- * semantics.
+ * semantics. If the writeback or invalidate fails, only report
+ * the direct I/O range as we don't know if the buffered pages
+ * made it to disk.
*/
- pos = iocb->ki_pos;
- endbyte = pos + buffered - 1;
- ret = filemap_write_and_wait_range(mapping, pos, endbyte);
- if (!ret) {
- iocb->ki_pos += buffered;
- written += buffered;
- invalidate_mapping_pages(mapping,
- pos >> PAGE_SHIFT,
- endbyte >> PAGE_SHIFT);
- } else {
- /*
- * We don't know how much we wrote, so just return
- * the number of bytes which were direct-written
- */
- }
+ iocb->ki_pos += buffered;
+ ret2 = generic_write_sync(iocb, buffered);
+ invalidate_mapping_pages(mapping,
+ (iocb->ki_pos - buffered) >> PAGE_SHIFT,
+ (iocb->ki_pos - 1) >> PAGE_SHIFT);
+ if (!ret || ret2 > 0)
+ ret += ret2;
} else {
+ current->backing_dev_info = inode_to_bdi(inode);
ret = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops);
- if (likely(ret > 0))
+ current->backing_dev_info = NULL;
+ if (likely(ret > 0)) {
iocb->ki_pos += ret;
+ ret = generic_write_sync(iocb, ret);
+ }
}
-out2:
- current->backing_dev_info = NULL;
-out:
+out_unlock:
inode_unlock(inode);
- if (likely(ret > 0)) {
- /* Handle various SYNC-type writes */
- ret = generic_write_sync(iocb, ret);
- }
- return written ? written : ret;
+ return ret;
}
static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len,
@@ -1279,6 +1295,7 @@ const struct file_operations gfs2_file_fops = {
.write_iter = gfs2_file_write_iter,
.iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
+ .compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
.release = gfs2_release,
@@ -1294,6 +1311,7 @@ const struct file_operations gfs2_file_fops = {
const struct file_operations gfs2_dir_fops = {
.iterate_shared = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
+ .compat_ioctl = gfs2_compat_ioctl,
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
@@ -1310,6 +1328,7 @@ const struct file_operations gfs2_file_fops_nolock = {
.write_iter = gfs2_file_write_iter,
.iopoll = iomap_dio_iopoll,
.unlocked_ioctl = gfs2_ioctl,
+ .compat_ioctl = gfs2_compat_ioctl,
.mmap = gfs2_mmap,
.open = gfs2_open,
.release = gfs2_release,
@@ -1323,6 +1342,7 @@ const struct file_operations gfs2_file_fops_nolock = {
const struct file_operations gfs2_dir_fops_nolock = {
.iterate_shared = gfs2_readdir,
.unlocked_ioctl = gfs2_ioctl,
+ .compat_ioctl = gfs2_compat_ioctl,
.open = gfs2_open,
.release = gfs2_release,
.fsync = gfs2_fsync,
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index cf4c767005b1..02cb8dc2d76a 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -89,8 +89,32 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
INIT_LIST_HEAD(&tr.tr_databuf);
tr.tr_revokes = atomic_read(&gl->gl_ail_count);
- if (!tr.tr_revokes)
+ if (!tr.tr_revokes) {
+ bool have_revokes;
+ bool log_in_flight;
+
+ /*
+ * We have nothing on the AIL, but there could be revokes on
+ * the sdp revoke queue, in which case we still want to flush
+ * the log and wait for it to finish.
+ *
+ * If the sdp revoke list is empty too, we might still have an
+ * I/O outstanding for writing revokes, so we should wait for
+ * it before returning.
+ *
+ * If none of these conditions are true, our revokes are all
+ * flushed and we can return.
+ */
+ gfs2_log_lock(sdp);
+ have_revokes = !list_empty(&sdp->sd_log_revokes);
+ log_in_flight = atomic_read(&sdp->sd_log_in_flight);
+ gfs2_log_unlock(sdp);
+ if (have_revokes)
+ goto flush;
+ if (log_in_flight)
+ log_flush_wait(sdp);
return;
+ }
/* A shortened, inline version of gfs2_trans_begin()
* tr->alloced is not set since the transaction structure is
@@ -105,6 +129,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
__gfs2_ail_flush(gl, 0, tr.tr_revokes);
gfs2_trans_end(sdp);
+flush:
gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_NORMAL |
GFS2_LFC_AIL_EMPTY_GL);
}
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index b296c59832a7..abb8230324a0 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1248,7 +1248,7 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry,
if (!(file->f_mode & FMODE_OPENED))
return finish_no_open(file, d);
dput(d);
- return 0;
+ return excl && (flags & O_CREAT) ? -EEXIST : 0;
}
BUG_ON(d != NULL);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index c4c9700c366e..cc0b71d44a1a 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -513,7 +513,7 @@ static void log_pull_tail(struct gfs2_sbd *sdp, unsigned int new_tail)
}
-static void log_flush_wait(struct gfs2_sbd *sdp)
+void log_flush_wait(struct gfs2_sbd *sdp)
{
DEFINE_WAIT(wait);
@@ -598,17 +598,25 @@ void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
struct buffer_head *bh = bd->bd_bh;
struct gfs2_glock *gl = bd->bd_gl;
+ sdp->sd_log_num_revoke++;
+ if (atomic_inc_return(&gl->gl_revokes) == 1)
+ gfs2_glock_hold(gl);
bh->b_private = NULL;
bd->bd_blkno = bh->b_blocknr;
gfs2_remove_from_ail(bd); /* drops ref on bh */
bd->bd_bh = NULL;
- sdp->sd_log_num_revoke++;
- if (atomic_inc_return(&gl->gl_revokes) == 1)
- gfs2_glock_hold(gl);
set_bit(GLF_LFLUSH, &gl->gl_flags);
list_add(&bd->bd_list, &sdp->sd_log_revokes);
}
+void gfs2_glock_remove_revoke(struct gfs2_glock *gl)
+{
+ if (atomic_dec_return(&gl->gl_revokes) == 0) {
+ clear_bit(GLF_LFLUSH, &gl->gl_flags);
+ gfs2_glock_queue_put(gl);
+ }
+}
+
void gfs2_write_revokes(struct gfs2_sbd *sdp)
{
struct gfs2_trans *tr;
@@ -873,8 +881,10 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, u32 flags)
* @new: New transaction to be merged
*/
-static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new)
+static void gfs2_merge_trans(struct gfs2_sbd *sdp, struct gfs2_trans *new)
{
+ struct gfs2_trans *old = sdp->sd_log_tr;
+
WARN_ON_ONCE(!test_bit(TR_ATTACHED, &old->tr_flags));
old->tr_num_buf_new += new->tr_num_buf_new;
@@ -886,6 +896,11 @@ static void gfs2_merge_trans(struct gfs2_trans *old, struct gfs2_trans *new)
list_splice_tail_init(&new->tr_databuf, &old->tr_databuf);
list_splice_tail_init(&new->tr_buf, &old->tr_buf);
+
+ spin_lock(&sdp->sd_ail_lock);
+ list_splice_tail_init(&new->tr_ail1_list, &old->tr_ail1_list);
+ list_splice_tail_init(&new->tr_ail2_list, &old->tr_ail2_list);
+ spin_unlock(&sdp->sd_ail_lock);
}
static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
@@ -897,7 +912,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
gfs2_log_lock(sdp);
if (sdp->sd_log_tr) {
- gfs2_merge_trans(sdp->sd_log_tr, tr);
+ gfs2_merge_trans(sdp, tr);
} else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) {
gfs2_assert_withdraw(sdp, test_bit(TR_ALLOCED, &tr->tr_flags));
sdp->sd_log_tr = tr;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 2315fca47a2b..52b9bf27e918 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -73,10 +73,12 @@ extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
u32 type);
extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
+extern void log_flush_wait(struct gfs2_sbd *sdp);
extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
extern int gfs2_logd(void *data);
extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
+extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl);
extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
#endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 1921cda034fd..ab5d9fca73f6 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -264,7 +264,7 @@ static struct bio *gfs2_log_alloc_bio(struct gfs2_sbd *sdp, u64 blkno,
struct super_block *sb = sdp->sd_vfs;
struct bio *bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
- bio->bi_iter.bi_sector = blkno * (sb->s_blocksize >> 9);
+ bio->bi_iter.bi_sector = blkno << sdp->sd_fsb2bb_shift;
bio_set_dev(bio, sb->s_bdev);
bio->bi_end_io = end_io;
bio->bi_private = sdp;
@@ -421,7 +421,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) {
if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) {
- if (lh.lh_sequence > head->lh_sequence)
+ if (lh.lh_sequence >= head->lh_sequence)
*head = lh;
else {
ret = true;
@@ -471,6 +471,20 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
put_page(page); /* Once more for find_or_create_page */
}
+static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs)
+{
+ struct bio *new;
+
+ new = bio_alloc(GFP_NOIO, nr_iovecs);
+ bio_copy_dev(new, prev);
+ new->bi_iter.bi_sector = bio_end_sector(prev);
+ new->bi_opf = prev->bi_opf;
+ new->bi_write_hint = prev->bi_write_hint;
+ bio_chain(new, prev);
+ submit_bio(prev);
+ return new;
+}
+
/**
* gfs2_find_jhead - find the head of a log
* @jd: The journal descriptor
@@ -487,10 +501,10 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
struct address_space *mapping = jd->jd_inode->i_mapping;
unsigned int block = 0, blocks_submitted = 0, blocks_read = 0;
- unsigned int bsize = sdp->sd_sb.sb_bsize;
+ unsigned int bsize = sdp->sd_sb.sb_bsize, off;
unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
unsigned int shift = PAGE_SHIFT - bsize_shift;
- unsigned int readhead_blocks = BIO_MAX_PAGES << shift;
+ unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift;
struct gfs2_journal_extent *je;
int sz, ret = 0;
struct bio *bio = NULL;
@@ -504,9 +518,9 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
since = filemap_sample_wb_err(mapping);
list_for_each_entry(je, &jd->extent_list, list) {
- for (; block < je->lblock + je->blocks; block++) {
- u64 dblock;
+ u64 dblock = je->dblock;
+ for (; block < je->lblock + je->blocks; block++, dblock++) {
if (!page) {
page = find_or_create_page(mapping,
block >> shift, GFP_NOFS);
@@ -515,35 +529,41 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head,
done = true;
goto out;
}
+ off = 0;
}
- if (bio) {
- unsigned int off;
-
- off = (block << bsize_shift) & ~PAGE_MASK;
- sz = bio_add_page(bio, page, bsize, off);
- if (sz == bsize) { /* block added */
- if (off + bsize == PAGE_SIZE) {
- page = NULL;
- goto page_added;
- }
- continue;
+ if (bio && (off || block < blocks_submitted + max_blocks)) {
+ sector_t sector = dblock << sdp->sd_fsb2bb_shift;
+
+ if (bio_end_sector(bio) == sector) {
+ sz = bio_add_page(bio, page, bsize, off);
+ if (sz == bsize)
+ goto block_added;
+ }
+ if (off) {
+ unsigned int blocks =
+ (PAGE_SIZE - off) >> bsize_shift;
+
+ bio = gfs2_chain_bio(bio, blocks);
+ goto add_block_to_new_bio;
}
- blocks_submitted = block + 1;
+ }
+
+ if (bio) {
+ blocks_submitted = block;
submit_bio(bio);
- bio = NULL;
}
- dblock = je->dblock + (block - je->lblock);
bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read);
bio->bi_opf = REQ_OP_READ;
- sz = bio_add_page(bio, page, bsize, 0);
- gfs2_assert_warn(sdp, sz == bsize);
- if (bsize == PAGE_SIZE)
+add_block_to_new_bio:
+ sz = bio_add_page(bio, page, bsize, off);
+ BUG_ON(sz != bsize);
+block_added:
+ off += bsize;
+ if (off == PAGE_SIZE)
page = NULL;
-
-page_added:
- if (blocks_submitted < blocks_read + readhead_blocks) {
+ if (blocks_submitted <= blocks_read + max_blocks) {
/* Keep at least one bio in flight */
continue;
}
@@ -864,10 +884,7 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
bd = list_entry(head->next, struct gfs2_bufdata, bd_list);
list_del_init(&bd->bd_list);
gl = bd->bd_gl;
- if (atomic_dec_return(&gl->gl_revokes) == 0) {
- clear_bit(GLF_LFLUSH, &gl->gl_flags);
- gfs2_glock_queue_put(gl);
- }
+ gfs2_glock_remove_revoke(gl);
kmem_cache_free(gfs2_bufdata_cachep, bd);
}
}
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 08823bb3b2d0..aa98abd177c3 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -901,7 +901,7 @@ fail:
}
static const match_table_t nolock_tokens = {
- { Opt_jid, "jid=%d\n", },
+ { Opt_jid, "jid=%d", },
{ Opt_err, NULL },
};
@@ -1158,7 +1158,17 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
goto fail_per_node;
}
- if (!sb_rdonly(sb)) {
+ if (sb_rdonly(sb)) {
+ struct gfs2_holder freeze_gh;
+
+ error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
+ GL_EXACT, &freeze_gh);
+ if (error) {
+ fs_err(sdp, "can't make FS RO: %d\n", error);
+ goto fail_per_node;
+ }
+ gfs2_glock_dq_uninit(&freeze_gh);
+ } else {
error = gfs2_make_fs_rw(sdp);
if (error) {
fs_err(sdp, "can't make FS RW: %d\n", error);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 8189b581236d..43e117308279 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1040,8 +1040,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
u32 x;
int error = 0;
- if (capable(CAP_SYS_RESOURCE) ||
- sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
+ if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
return 0;
error = gfs2_quota_hold(ip, uid, gid);
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 765627d9a91e..fe68a91dc16f 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -44,7 +44,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip,
int ret;
ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */
- if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+ if (capable(CAP_SYS_RESOURCE) ||
+ sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
ret = gfs2_quota_lock(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
if (ret)
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 6f67ef7aa412..16bebcbfc716 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -262,6 +262,8 @@ void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
list_del_init(&bd->bd_list);
gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
sdp->sd_log_num_revoke--;
+ if (bd->bd_gl)
+ gfs2_glock_remove_revoke(bd->bd_gl);
kmem_cache_free(gfs2_bufdata_cachep, bd);
tr->tr_num_revoke_rm++;
if (--n == 0)
diff --git a/fs/hfsplus/attributes.c b/fs/hfsplus/attributes.c
index e6d554476db4..eeebe80c6be4 100644
--- a/fs/hfsplus/attributes.c
+++ b/fs/hfsplus/attributes.c
@@ -292,6 +292,10 @@ static int __hfsplus_delete_attr(struct inode *inode, u32 cnid,
return -ENOENT;
}
+ /* Avoid btree corruption */
+ hfs_bnode_read(fd->bnode, fd->search_key,
+ fd->keyoffset, fd->keylength);
+
err = hfs_brec_remove(fd);
if (err)
return err;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 1dcc57189382..0305b618e7ed 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1461,28 +1461,43 @@ static int __init init_hugetlbfs_fs(void)
sizeof(struct hugetlbfs_inode_info),
0, SLAB_ACCOUNT, init_once);
if (hugetlbfs_inode_cachep == NULL)
- goto out2;
+ goto out;
error = register_filesystem(&hugetlbfs_fs_type);
if (error)
- goto out;
+ goto out_free;
+ /* default hstate mount is required */
+ mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]);
+ if (IS_ERR(mnt)) {
+ error = PTR_ERR(mnt);
+ goto out_unreg;
+ }
+ hugetlbfs_vfsmount[default_hstate_idx] = mnt;
+
+ /* other hstates are optional */
i = 0;
for_each_hstate(h) {
- mnt = mount_one_hugetlbfs(h);
- if (IS_ERR(mnt) && i == 0) {
- error = PTR_ERR(mnt);
- goto out;
+ if (i == default_hstate_idx) {
+ i++;
+ continue;
}
- hugetlbfs_vfsmount[i] = mnt;
+
+ mnt = mount_one_hugetlbfs(h);
+ if (IS_ERR(mnt))
+ hugetlbfs_vfsmount[i] = NULL;
+ else
+ hugetlbfs_vfsmount[i] = mnt;
i++;
}
return 0;
- out:
+ out_unreg:
+ (void)unregister_filesystem(&hugetlbfs_fs_type);
+ out_free:
kmem_cache_destroy(hugetlbfs_inode_cachep);
- out2:
+ out:
return error;
}
fs_initcall(init_hugetlbfs_fs)
diff --git a/fs/inode.c b/fs/inode.c
index ed54d3759219..b6748888f140 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -137,6 +137,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
+ atomic64_set(&inode->i_sequence, 0);
atomic_set(&inode->i_count, 1);
inode->i_op = &empty_iops;
inode->i_fop = &no_open_fops;
@@ -673,6 +674,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
struct inode *inode, *next;
LIST_HEAD(dispose);
+again:
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
spin_lock(&inode->i_lock);
@@ -695,6 +697,12 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
inode_lru_list_del(inode);
spin_unlock(&inode->i_lock);
list_add(&inode->i_lru, &dispose);
+ if (need_resched()) {
+ spin_unlock(&sb->s_inode_list_lock);
+ cond_resched();
+ dispose_list(&dispose);
+ goto again;
+ }
}
spin_unlock(&sb->s_inode_list_lock);
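The invalidate_inodes() hunk bounds spinlock hold time on huge inode lists: when need_resched() fires mid-walk, drop the lock, flush the batch collected so far, yield, and rescan from the top. Stripped to its shape, a kernel-style sketch that omits the per-inode i_lock and busy checks of the real walk; dispose_list() as in fs/inode.c:

#include <linux/fs.h>
#include <linux/sched.h>

static void dispose_list(struct list_head *head);	/* as in fs/inode.c */

static void evict_all(struct super_block *sb)
{
	struct inode *inode, *next;
	LIST_HEAD(dispose);

again:
	spin_lock(&sb->s_inode_list_lock);
	list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
		list_add(&inode->i_lru, &dispose);	/* batch for later */
		if (need_resched()) {
			spin_unlock(&sb->s_inode_list_lock);
			cond_resched();
			dispose_list(&dispose);	/* empty the batch */
			goto again;		/* list may have changed; rescan */
		}
	}
	spin_unlock(&sb->s_inode_list_lock);
	dispose_list(&dispose);
}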
diff --git a/fs/internal.h b/fs/internal.h
index a48ef81be37d..e891a8a8748e 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -37,7 +37,7 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
/*
* buffer.c
*/
-extern void guard_bio_eod(int rw, struct bio *bio);
+extern void guard_bio_eod(struct bio *bio);
extern int __block_write_begin_int(struct page *page, loff_t pos, unsigned len,
get_block_t *get_block, struct iomap *iomap);
void __generic_write_end(struct inode *inode, loff_t pos, unsigned copied,
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5f5d809d529a..98f43d0427bc 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -258,6 +258,8 @@ struct io_ring_ctx {
struct user_struct *user;
+ const struct cred *creds;
+
struct completion ctx_done;
struct {
@@ -406,6 +408,7 @@ static struct io_ring_ctx *io_ring_ctx_alloc(struct io_uring_params *p)
}
ctx->flags = p->flags;
+ init_waitqueue_head(&ctx->sqo_wait);
init_waitqueue_head(&ctx->cq_wait);
init_completion(&ctx->ctx_done);
init_completion(&ctx->sqo_thread_started);
@@ -754,11 +757,17 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
mutex_unlock(&ctx->uring_lock);
}
-static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
- long min)
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+ long min)
{
int iters = 0, ret = 0;
+ /*
+ * We disallow the app entering submit/complete with polling, but we
+ * still need to lock the ring to prevent racing with polled issue
+ * that got punted to a workqueue.
+ */
+ mutex_lock(&ctx->uring_lock);
do {
int tmin = 0;
@@ -794,21 +803,6 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
ret = 0;
} while (min && !*nr_events && !need_resched());
- return ret;
-}
-
-static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
- long min)
-{
- int ret;
-
- /*
- * We disallow the app entering submit/complete with polling, but we
- * still need to lock the ring to prevent racing with polled issue
- * that got punted to a workqueue.
- */
- mutex_lock(&ctx->uring_lock);
- ret = __io_iopoll_check(ctx, nr_events, min);
mutex_unlock(&ctx->uring_lock);
return ret;
}
@@ -1468,8 +1462,11 @@ static void io_poll_complete_work(struct work_struct *work)
struct io_poll_iocb *poll = &req->poll;
struct poll_table_struct pt = { ._key = poll->events };
struct io_ring_ctx *ctx = req->ctx;
+ const struct cred *old_cred;
__poll_t mask = 0;
+ old_cred = override_creds(ctx->creds);
+
if (!READ_ONCE(poll->canceled))
mask = vfs_poll(poll->file, &pt) & poll->events;
@@ -1484,7 +1481,7 @@ static void io_poll_complete_work(struct work_struct *work)
if (!mask && !READ_ONCE(poll->canceled)) {
add_wait_queue(poll->head, &poll->wait);
spin_unlock_irq(&ctx->completion_lock);
- return;
+ goto out;
}
list_del_init(&req->list);
io_poll_complete(ctx, req, mask);
@@ -1492,6 +1489,8 @@ static void io_poll_complete_work(struct work_struct *work)
io_cqring_ev_posted(ctx);
io_put_req(req);
+out:
+ revert_creds(old_cred);
}
static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
@@ -1613,7 +1612,7 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
}
static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+ struct sqe_submit *s)
{
struct io_uring_sqe *sqe_copy;
@@ -1631,7 +1630,8 @@ static int io_req_defer(struct io_ring_ctx *ctx, struct io_kiocb *req,
return 0;
}
- memcpy(sqe_copy, sqe, sizeof(*sqe_copy));
+ memcpy(&req->submit, s, sizeof(*s));
+ memcpy(sqe_copy, s->sqe, sizeof(*sqe_copy));
req->submit.sqe = sqe_copy;
INIT_WORK(&req->work, io_sq_wq_submit_work);
@@ -1734,10 +1734,12 @@ static void io_sq_wq_submit_work(struct work_struct *work)
struct io_ring_ctx *ctx = req->ctx;
struct mm_struct *cur_mm = NULL;
struct async_list *async_list;
+ const struct cred *old_cred;
LIST_HEAD(req_list);
mm_segment_t old_fs;
int ret;
+ old_cred = override_creds(ctx->creds);
async_list = io_async_list_from_sqe(ctx, req->submit.sqe);
restart:
do {
@@ -1845,6 +1847,7 @@ out:
unuse_mm(cur_mm);
mmput(cur_mm);
}
+ revert_creds(old_cred);
}
/*
@@ -1943,7 +1946,7 @@ static int io_submit_sqe(struct io_ring_ctx *ctx, struct sqe_submit *s,
if (unlikely(ret))
goto out;
- ret = io_req_defer(ctx, req, s->sqe);
+ ret = io_req_defer(ctx, req, s);
if (ret) {
if (ret == -EIOCBQUEUED)
ret = 0;
@@ -2108,6 +2111,7 @@ static int io_sq_thread(void *data)
struct sqe_submit sqes[IO_IOPOLL_BATCH];
struct io_ring_ctx *ctx = data;
struct mm_struct *cur_mm = NULL;
+ const struct cred *old_cred;
mm_segment_t old_fs;
DEFINE_WAIT(wait);
unsigned inflight;
@@ -2117,6 +2121,7 @@ static int io_sq_thread(void *data)
old_fs = get_fs();
set_fs(USER_DS);
+ old_cred = override_creds(ctx->creds);
timeout = inflight = 0;
while (!kthread_should_park()) {
@@ -2137,7 +2142,7 @@ static int io_sq_thread(void *data)
*/
mutex_lock(&ctx->uring_lock);
if (!list_empty(&ctx->poll_list))
- __io_iopoll_check(ctx, &nr_events, 0);
+ io_iopoll_getevents(ctx, &nr_events, 0);
else
inflight = 0;
mutex_unlock(&ctx->uring_lock);
@@ -2156,16 +2161,6 @@ static int io_sq_thread(void *data)
if (!io_get_sqring(ctx, &sqes[0])) {
/*
- * We're polling. If we're within the defined idle
- * period, then let us spin without work before going
- * to sleep.
- */
- if (inflight || !time_after(jiffies, timeout)) {
- cpu_relax();
- continue;
- }
-
- /*
* Drop cur_mm before scheduling, we can't hold it for
* long periods (or over schedule()). Do this before
* adding ourselves to the waitqueue, as the unuse/drop
@@ -2177,6 +2172,16 @@ static int io_sq_thread(void *data)
cur_mm = NULL;
}
+ /*
+ * We're polling. If we're within the defined idle
+ * period, then let us spin without work before going
+ * to sleep.
+ */
+ if (inflight || !time_after(jiffies, timeout)) {
+ cpu_relax();
+ continue;
+ }
+
prepare_to_wait(&ctx->sqo_wait, &wait,
TASK_INTERRUPTIBLE);
@@ -2235,6 +2240,7 @@ static int io_sq_thread(void *data)
unuse_mm(cur_mm);
mmput(cur_mm);
}
+ revert_creds(old_cred);
kthread_parkme();
@@ -2283,7 +2289,6 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
const sigset_t __user *sig, size_t sigsz)
{
struct io_cq_ring *ring = ctx->cq_ring;
- sigset_t ksigmask, sigsaved;
int ret;
if (io_cqring_events(ring) >= min_events)
@@ -2293,21 +2298,17 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
#ifdef CONFIG_COMPAT
if (in_compat_syscall())
ret = set_compat_user_sigmask((const compat_sigset_t __user *)sig,
- &ksigmask, &sigsaved, sigsz);
+ sigsz);
else
#endif
- ret = set_user_sigmask(sig, &ksigmask,
- &sigsaved, sigsz);
+ ret = set_user_sigmask(sig, sigsz);
if (ret)
return ret;
}
ret = wait_event_interruptible(ctx->wait, io_cqring_events(ring) >= min_events);
-
- if (sig)
- restore_user_sigmask(sig, &sigsaved, ret == -ERESTARTSYS);
-
+ restore_saved_sigmask_unless(ret == -ERESTARTSYS);
if (ret == -ERESTARTSYS)
ret = -EINTR;
@@ -2391,13 +2392,6 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
struct sk_buff *skb;
int i;
- if (!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
- unsigned long inflight = ctx->user->unix_inflight + nr;
-
- if (inflight > task_rlimit(current, RLIMIT_NOFILE))
- return -EMFILE;
- }
-
fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
if (!fpl)
return -ENOMEM;
@@ -2532,7 +2526,6 @@ static int io_sq_offload_start(struct io_ring_ctx *ctx,
{
int ret;
- init_waitqueue_head(&ctx->sqo_wait);
mmgrab(current->mm);
ctx->sqo_mm = current->mm;
@@ -2750,8 +2743,8 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, void __user *arg,
ret = 0;
if (!pages || nr_pages > got_pages) {
- kfree(vmas);
- kfree(pages);
+ kvfree(vmas);
+ kvfree(pages);
pages = kvmalloc_array(nr_pages, sizeof(struct page *),
GFP_KERNEL);
vmas = kvmalloc_array(nr_pages,
@@ -2898,6 +2891,8 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_unaccount_mem(ctx->user,
ring_pages(ctx->sq_entries, ctx->cq_entries));
free_uid(ctx->user);
+ if (ctx->creds)
+ put_cred(ctx->creds);
kfree(ctx);
}
@@ -3175,6 +3170,12 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p)
ctx->account_mem = account_mem;
ctx->user = user;
+ ctx->creds = get_current_cred();
+ if (!ctx->creds) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
ret = io_allocate_scq_urings(ctx, p);
if (ret)
goto err;
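The io_uring changes capture the creator's credentials once (get_current_cred() at setup) and bracket each offloaded work path with override_creds()/revert_creds(), so deferred requests run with the submitter's permissions rather than the kernel worker's. The pattern in isolation; a kernel-style sketch with an illustrative context struct:

#include <linux/cred.h>

struct ctx {
	const struct cred *creds;
};

static void ctx_setup(struct ctx *ctx)
{
	ctx->creds = get_current_cred();	/* reference held until teardown */
}

static void deferred_work(struct ctx *ctx)
{
	const struct cred *old_cred = override_creds(ctx->creds);

	/* ... act with the submitter's permissions ... */

	revert_creds(old_cred);			/* restore the worker's creds */
}

static void ctx_teardown(struct ctx *ctx)
{
	put_cred(ctx->creds);
}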
diff --git a/fs/ioctl.c b/fs/ioctl.c
index fef3a6bf7c78..3118da0de158 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -8,6 +8,7 @@
#include <linux/syscalls.h>
#include <linux/mm.h>
#include <linux/capability.h>
+#include <linux/compat.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/security.h>
@@ -719,3 +720,37 @@ SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg)
{
return ksys_ioctl(fd, cmd, arg);
}
+
+#ifdef CONFIG_COMPAT
+/**
+ * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation
+ *
+ * This is not normally called as a function, but instead set in struct
+ * file_operations as
+ *
+ * .compat_ioctl = compat_ptr_ioctl,
+ *
+ * On most architectures, compat_ptr_ioctl() just passes all arguments
+ * to the corresponding ->ioctl handler. The exception is arch/s390, where
+ * compat_ptr() clears the top bit of a 32-bit pointer value, so user space
+ * pointers to the second 2GB alias the first 2GB, as is the case for
+ * native 32-bit s390 user space.
+ *
+ * The compat_ptr_ioctl() function must therefore be used only with ioctl
+ * functions that either ignore the argument or pass a pointer to a
+ * compatible data type.
+ *
+ * If any ioctl command handled by fops->unlocked_ioctl passes a plain
+ * integer instead of a pointer, or any of the passed data types
+ * is incompatible between 32-bit and 64-bit architectures, a proper
+ * handler is required instead of compat_ptr_ioctl.
+ */
+long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ if (!file->f_op->unlocked_ioctl)
+ return -ENOIOCTLCMD;
+
+ return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+EXPORT_SYMBOL(compat_ptr_ioctl);
+#endif
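As the new kernel-doc above states, the helper is meant to be assigned directly as .compat_ioctl whenever every command takes either a pointer argument or none. A hypothetical driver fragment showing that wiring; example_ioctl is an assumed handler, not a real kernel symbol:

#include <linux/fs.h>
#include <linux/module.h>

static long example_ioctl(struct file *file, unsigned int cmd,
			  unsigned long arg);	/* assumed to exist */

static const struct file_operations example_fops = {
	.owner		= THIS_MODULE,
	.unlocked_ioctl	= example_ioctl,
	/* 32-bit callers get compat_ptr() applied, then the same handler. */
	.compat_ioctl	= compat_ptr_ioctl,
};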
diff --git a/fs/iomap.c b/fs/iomap.c
index da961fca3180..a486202b198f 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -777,6 +777,7 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
unsigned copied, struct page *page, struct iomap *iomap)
{
const struct iomap_page_ops *page_ops = iomap->page_ops;
+ loff_t old_size = inode->i_size;
int ret;
if (iomap->type == IOMAP_INLINE) {
@@ -788,7 +789,19 @@ iomap_write_end(struct inode *inode, loff_t pos, unsigned len,
ret = __iomap_write_end(inode, pos, len, copied, page, iomap);
}
- __generic_write_end(inode, pos, ret, page);
+ /*
+ * Update the in-memory inode size after copying the data into the page
+ * cache. It's up to the file system to write the updated size to disk,
+ * preferably after I/O completion so that no stale data is exposed.
+ */
+ if (pos + ret > old_size) {
+ i_size_write(inode, pos + ret);
+ iomap->flags |= IOMAP_F_SIZE_CHANGED;
+ }
+ unlock_page(page);
+
+ if (old_size < pos)
+ pagecache_isize_extended(inode, old_size, pos);
if (page_ops && page_ops->page_done)
page_ops->page_done(inode, pos, copied, page, iomap);
put_page(page);
@@ -1754,7 +1767,9 @@ zero_tail:
if (pad)
iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
}
- return copied ? copied : ret;
+ if (copied)
+ return copied;
+ return ret;
}
static loff_t
@@ -1933,8 +1948,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
}
pos += ret;
- if (iov_iter_rw(iter) == READ && pos >= dio->i_size)
+ if (iov_iter_rw(iter) == READ && pos >= dio->i_size) {
+ /*
+ * We only report that we've read data up to i_size.
+ * Revert iter to a state corresponding to that as
+ * some callers (such as splice code) rely on it.
+ */
+ iov_iter_revert(iter, pos - dio->i_size);
break;
+ }
} while ((count = iov_iter_count(iter)) > 0);
blk_finish_plug(&plug);
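
The iov_iter_revert() above restores a general invariant; a hedged sketch of the pattern with illustrative names ('before' is the iov_iter_count() value sampled before the read, and ret is assumed non-negative):

    /* After a read returning 'ret', the iterator must have consumed
     * exactly 'ret' bytes; if the I/O path advanced it further, roll
     * the excess back so callers such as splice see a consistent state.
     */
    size_t consumed = before - iov_iter_count(iter);
    if (consumed > ret)
            iov_iter_revert(iter, consumed - ret);
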
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index a1909066bde6..62cf497f18eb 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -164,7 +164,7 @@ void __jbd2_log_wait_for_space(journal_t *journal)
"journal space in %s\n", __func__,
journal->j_devname);
WARN_ON(1);
- jbd2_journal_abort(journal, 0);
+ jbd2_journal_abort(journal, -EIO);
}
write_lock(&journal->j_state_lock);
} else {
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 668f9021cf11..5b99036b91e0 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -727,7 +727,6 @@ start_journal_io:
submit_bh(REQ_OP_WRITE, REQ_SYNC, bh);
}
cond_resched();
- stats.run.rs_blocks_logged += bufs;
/* Force a new descriptor to be generated next
time round the loop. */
@@ -785,7 +784,7 @@ start_journal_io:
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
- __jbd2_journal_abort_hard(journal);
+ jbd2_journal_abort(journal, err);
}
blk_finish_plug(&plug);
@@ -814,6 +813,7 @@ start_journal_io:
if (unlikely(!buffer_uptodate(bh)))
err = -EIO;
jbd2_unfile_log_bh(bh);
+ stats.run.rs_blocks_logged++;
/*
* The list contains temporary buffer heads created by
@@ -859,6 +859,7 @@ start_journal_io:
BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
clear_buffer_jwrite(bh);
jbd2_unfile_log_bh(bh);
+ stats.run.rs_blocks_logged++;
__brelse(bh); /* One for getblk */
/* AKPM: bforget here */
}
@@ -876,10 +877,11 @@ start_journal_io:
err = journal_submit_commit_record(journal, commit_transaction,
&cbh, crc32_sum);
if (err)
- __jbd2_journal_abort_hard(journal);
+ jbd2_journal_abort(journal, err);
}
if (cbh)
err = journal_wait_on_commit_record(journal, cbh);
+ stats.run.rs_blocks_logged++;
if (jbd2_has_feature_async_commit(journal) &&
journal->j_flags & JBD2_BARRIER) {
blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
@@ -972,29 +974,34 @@ restart_loop:
* it. */
/*
- * A buffer which has been freed while still being journaled by
- * a previous transaction.
- */
- if (buffer_freed(bh)) {
+ * A buffer which has been freed while still being journaled by
+ * a previous transaction is refiled to BJ_Forget of the running
+ * transaction. If the just committed transaction contains an
+ * "add to orphan" operation, we can completely invalidate the
+ * buffer now. We are rather thorough in that, since the buffer
+ * may still be accessible when blocksize < pagesize and it is
+ * attached to the last partial page.
+ */
+ if (buffer_freed(bh) && !jh->b_next_transaction) {
+ struct address_space *mapping;
+
+ clear_buffer_freed(bh);
+ clear_buffer_jbddirty(bh);
+
/*
- * If the running transaction is the one containing
- * "add to orphan" operation (b_next_transaction !=
- * NULL), we have to wait for that transaction to
- * commit before we can really get rid of the buffer.
- * So just clear b_modified to not confuse transaction
- * credit accounting and refile the buffer to
- * BJ_Forget of the running transaction. If the just
- * committed transaction contains "add to orphan"
- * operation, we can completely invalidate the buffer
- * now. We are rather through in that since the
- * buffer may be still accessible when blocksize <
- * pagesize and it is attached to the last partial
- * page.
+ * Block device buffers need to stay mapped all the
+ * time, so it is enough to clear buffer_jbddirty and
+ * buffer_freed bits. For the file mapping buffers (i.e.
+ * journalled data) we need to unmap the buffer and clear
+ * more bits. We also need to be careful about the check
+ * because the data page mapping can get cleared out from
+ * under us. Note that if mapping == NULL, we don't need
+ * to make the buffer unmapped because the page is already
+ * detached from the mapping and buffers cannot get
+ * reused.
*/
- jh->b_modified = 0;
- if (!jh->b_next_transaction) {
- clear_buffer_freed(bh);
- clear_buffer_jbddirty(bh);
+ mapping = READ_ONCE(bh->b_page->mapping);
+ if (mapping && !sb_is_blkdev_sb(mapping->host->i_sb)) {
clear_buffer_mapped(bh);
clear_buffer_new(bh);
clear_buffer_req(bh);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e0382067c824..92a1df0a9e56 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -101,7 +101,6 @@ EXPORT_SYMBOL(jbd2_journal_release_jbd_inode);
EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
EXPORT_SYMBOL(jbd2_inode_cache);
-static void __journal_abort_soft (journal_t *journal, int errno);
static int jbd2_journal_create_slab(size_t slab_size);
#ifdef CONFIG_JBD2_DEBUG
@@ -810,7 +809,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr,
"at offset %lu on %s\n",
__func__, blocknr, journal->j_devname);
err = -EIO;
- __journal_abort_soft(journal, err);
+ jbd2_journal_abort(journal, err);
}
} else {
*retp = blocknr; /* +journal->j_blk_offset */
@@ -986,6 +985,7 @@ static void *jbd2_seq_info_start(struct seq_file *seq, loff_t *pos)
static void *jbd2_seq_info_next(struct seq_file *seq, void *v, loff_t *pos)
{
+ (*pos)++;
return NULL;
}
@@ -1353,8 +1353,10 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
int ret;
/* Buffer got discarded which means block device got invalidated */
- if (!buffer_mapped(bh))
+ if (!buffer_mapped(bh)) {
+ unlock_buffer(bh);
return -EIO;
+ }
trace_jbd2_write_superblock(journal, write_flags);
if (!(journal->j_flags & JBD2_BARRIER))
@@ -1686,6 +1688,11 @@ int jbd2_journal_load(journal_t *journal)
journal->j_devname);
return -EFSCORRUPTED;
}
+ /*
+ * Clear the JBD2_ABORT flag set in journal_init_common so that
+ * log tail information can be updated with the newest sequence.
+ */
+ journal->j_flags &= ~JBD2_ABORT;
/* OK, we've finished with the dynamic journal bits:
* reinitialise the dynamic contents of the superblock in memory
@@ -1693,7 +1700,6 @@ int jbd2_journal_load(journal_t *journal)
if (journal_reset(journal))
goto recovery_error;
- journal->j_flags &= ~JBD2_ABORT;
journal->j_flags |= JBD2_LOADED;
return 0;
@@ -2070,67 +2076,6 @@ int jbd2_journal_wipe(journal_t *journal, int write)
return err;
}
-/*
- * Journal abort has very specific semantics, which we describe
- * for journal abort.
- *
- * Two internal functions, which provide abort to the jbd layer
- * itself are here.
- */
-
-/*
- * Quick version for internal journal use (doesn't lock the journal).
- * Aborts hard --- we mark the abort as occurred, but do _nothing_ else,
- * and don't attempt to make any other journal updates.
- */
-void __jbd2_journal_abort_hard(journal_t *journal)
-{
- transaction_t *transaction;
-
- if (journal->j_flags & JBD2_ABORT)
- return;
-
- printk(KERN_ERR "Aborting journal on device %s.\n",
- journal->j_devname);
-
- write_lock(&journal->j_state_lock);
- journal->j_flags |= JBD2_ABORT;
- transaction = journal->j_running_transaction;
- if (transaction)
- __jbd2_log_start_commit(journal, transaction->t_tid);
- write_unlock(&journal->j_state_lock);
-}
-
-/* Soft abort: record the abort error status in the journal superblock,
- * but don't do any other IO. */
-static void __journal_abort_soft (journal_t *journal, int errno)
-{
- int old_errno;
-
- write_lock(&journal->j_state_lock);
- old_errno = journal->j_errno;
- if (!journal->j_errno || errno == -ESHUTDOWN)
- journal->j_errno = errno;
-
- if (journal->j_flags & JBD2_ABORT) {
- write_unlock(&journal->j_state_lock);
- if (!old_errno && old_errno != -ESHUTDOWN &&
- errno == -ESHUTDOWN)
- jbd2_journal_update_sb_errno(journal);
- return;
- }
- write_unlock(&journal->j_state_lock);
-
- __jbd2_journal_abort_hard(journal);
-
- if (errno) {
- jbd2_journal_update_sb_errno(journal);
- write_lock(&journal->j_state_lock);
- journal->j_flags |= JBD2_REC_ERR;
- write_unlock(&journal->j_state_lock);
- }
-}
-
/**
* void jbd2_journal_abort () - Shutdown the journal immediately.
* @journal: the journal to shutdown.
@@ -2170,16 +2115,51 @@ static void __journal_abort_soft (journal_t *journal, int errno)
* failure to disk. ext3_error, for example, now uses this
* functionality.
*
- * Errors which originate from within the journaling layer will NOT
- * supply an errno; a null errno implies that absolutely no further
- * writes are done to the journal (unless there are any already in
- * progress).
- *
*/
void jbd2_journal_abort(journal_t *journal, int errno)
{
- __journal_abort_soft(journal, errno);
+ transaction_t *transaction;
+
+ /*
+ * ESHUTDOWN always takes precedence because a file system check
+ * caused by any other journal abort error is not required once
+ * a shutdown has been triggered.
+ */
+ write_lock(&journal->j_state_lock);
+ if (journal->j_flags & JBD2_ABORT) {
+ int old_errno = journal->j_errno;
+
+ write_unlock(&journal->j_state_lock);
+ if (old_errno != -ESHUTDOWN && errno == -ESHUTDOWN) {
+ journal->j_errno = errno;
+ jbd2_journal_update_sb_errno(journal);
+ }
+ return;
+ }
+
+ /*
+ * Mark the abort as occurred and start the current running
+ * transaction to release all journaled buffers.
+ */
+ pr_err("Aborting journal on device %s.\n", journal->j_devname);
+
+ journal->j_flags |= JBD2_ABORT;
+ journal->j_errno = errno;
+ transaction = journal->j_running_transaction;
+ if (transaction)
+ __jbd2_log_start_commit(journal, transaction->t_tid);
+ write_unlock(&journal->j_state_lock);
+
+ /*
+ * Record errno in the journal superblock so that fsck and the
+ * jbd2 layer can see that a filesystem check is needed.
+ */
+ jbd2_journal_update_sb_errno(journal);
+
+ write_lock(&journal->j_state_lock);
+ journal->j_flags |= JBD2_REC_ERR;
+ write_unlock(&journal->j_state_lock);
}
/**
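
For context, a sketch of a typical jbd2_journal_abort() caller (the triggering condition here is hypothetical); once aborted, the handle paths in the transaction.c hunks below fail with -EROFS:

    /* A filesystem that detects a fatal metadata error shuts the
     * journal down; errno lands in the journal superblock so a later
     * fsck knows a check is required.
     */
    if (fatal_metadata_error)               /* hypothetical condition */
            jbd2_journal_abort(journal, -EIO);
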
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 990e7b5062e7..f6163da64bb5 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -862,8 +862,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh,
char *frozen_buffer = NULL;
unsigned long start_lock, time_lock;
- if (is_handle_aborted(handle))
- return -EROFS;
journal = transaction->t_journal;
jbd_debug(5, "journal_head %p, force_copy %d\n", jh, force_copy);
@@ -1078,8 +1076,8 @@ static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh,
/* For undo access buffer must have data copied */
if (undo && !jh->b_committed_data)
goto out;
- if (jh->b_transaction != handle->h_transaction &&
- jh->b_next_transaction != handle->h_transaction)
+ if (READ_ONCE(jh->b_transaction) != handle->h_transaction &&
+ READ_ONCE(jh->b_next_transaction) != handle->h_transaction)
goto out;
/*
* There are two reasons for the barrier here:
@@ -1115,6 +1113,9 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
int rc;
+ if (is_handle_aborted(handle))
+ return -EROFS;
+
if (jbd2_write_access_granted(handle, bh, false))
return 0;
@@ -1252,6 +1253,9 @@ int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
char *committed_data = NULL;
+ if (is_handle_aborted(handle))
+ return -EROFS;
+
if (jbd2_write_access_granted(handle, bh, true))
return 0;
@@ -2293,14 +2297,16 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh,
return -EBUSY;
}
/*
- * OK, buffer won't be reachable after truncate. We just set
- * j_next_transaction to the running transaction (if there is
- * one) and mark buffer as freed so that commit code knows it
- * should clear dirty bits when it is done with the buffer.
+ * OK, buffer won't be reachable after truncate. We just clear
+ * b_modified to not confuse transaction credit accounting, and
+ * set b_next_transaction to the running transaction (if there
+ * is one) and mark buffer as freed so that commit code knows
+ * it should clear dirty bits when it is done with the buffer.
*/
set_buffer_freed(bh);
if (journal->j_running_transaction && buffer_jbddirty(bh))
jh->b_next_transaction = journal->j_running_transaction;
+ jh->b_modified = 0;
jbd2_journal_put_journal_head(jh);
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
@@ -2526,8 +2532,8 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh)
* our jh reference and thus __jbd2_journal_file_buffer() must not
* take a new one.
*/
- jh->b_transaction = jh->b_next_transaction;
- jh->b_next_transaction = NULL;
+ WRITE_ONCE(jh->b_transaction, jh->b_next_transaction);
+ WRITE_ONCE(jh->b_next_transaction, NULL);
if (buffer_freed(bh))
jlist = BJ_Forget;
else if (jh->b_modified)
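
The READ_ONCE()/WRITE_ONCE() pairing above is the standard lockless-reader idiom; a generic sketch with hypothetical names ('p->owner' stands in for jh->b_transaction):

    /* Writer, serialized by a lock: a single annotated store, so a
     * concurrent reader observes either the old or the new pointer,
     * never a torn or compiler-refetched intermediate value.
     */
    WRITE_ONCE(p->owner, next_owner);

    /* Lockless reader: one annotated load, then compare. */
    if (READ_ONCE(p->owner) == me)
            take_fast_path();               /* hypothetical */
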
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index a387534c9577..a5a65de50318 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -621,7 +621,6 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
{
struct kernfs_node *kn;
u32 gen;
- int cursor;
int ret;
name = kstrdup_const(name, GFP_KERNEL);
@@ -634,11 +633,11 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
idr_preload(GFP_KERNEL);
spin_lock(&kernfs_idr_lock);
- cursor = idr_get_cursor(&root->ino_idr);
ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
- if (ret >= 0 && ret < cursor)
+ if (ret >= 0 && ret < root->last_ino)
root->next_generation++;
gen = root->next_generation;
+ root->last_ino = ret;
spin_unlock(&kernfs_idr_lock);
idr_preload_end();
if (ret < 0)
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e8c792b49616..c35bbaa19486 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -912,7 +912,7 @@ repeat:
}
fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
- &name, 0);
+ NULL, 0);
iput(inode);
}
diff --git a/fs/libfs.c b/fs/libfs.c
index 7ac2d4bb4735..0d95cc9d3f2f 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -805,7 +805,7 @@ int simple_attr_open(struct inode *inode, struct file *file,
{
struct simple_attr *attr;
- attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
if (!attr)
return -ENOMEM;
@@ -845,9 +845,11 @@ ssize_t simple_attr_read(struct file *file, char __user *buf,
if (ret)
return ret;
- if (*ppos) { /* continued read */
+ if (*ppos && attr->get_buf[0]) {
+ /* continued read */
size = strlen(attr->get_buf);
- } else { /* first read */
+ } else {
+ /* first read */
u64 val;
ret = attr->get(attr->data, &val);
if (ret)
diff --git a/fs/locks.c b/fs/locks.c
index ec1e4a5df629..ceb202de1307 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -729,7 +729,6 @@ static void __locks_delete_block(struct file_lock *waiter)
{
locks_delete_global_blocked(waiter);
list_del_init(&waiter->fl_blocked_member);
- waiter->fl_blocker = NULL;
}
static void __locks_wake_up_blocks(struct file_lock *blocker)
@@ -744,6 +743,13 @@ static void __locks_wake_up_blocks(struct file_lock *blocker)
waiter->fl_lmops->lm_notify(waiter);
else
wake_up(&waiter->fl_wait);
+
+ /*
+ * The setting of fl_blocker to NULL marks the "done"
+ * point in deleting a block. Paired with acquire at the top
+ * of locks_delete_block().
+ */
+ smp_store_release(&waiter->fl_blocker, NULL);
}
}
@@ -758,24 +764,41 @@ int locks_delete_block(struct file_lock *waiter)
int status = -ENOENT;
/*
- * If fl_blocker is NULL, it won't be set again as this thread
- * "owns" the lock and is the only one that might try to claim
- * the lock. So it is safe to test fl_blocker locklessly.
- * Also if fl_blocker is NULL, this waiter is not listed on
- * fl_blocked_requests for some lock, so no other request can
- * be added to the list of fl_blocked_requests for this
- * request. So if fl_blocker is NULL, it is safe to
- * locklessly check if fl_blocked_requests is empty. If both
- * of these checks succeed, there is no need to take the lock.
+ * If fl_blocker is NULL, it won't be set again as this thread "owns"
+ * the lock and is the only one that might try to claim the lock.
+ *
+ * We use acquire/release to manage fl_blocker so that we can
+ * optimize away taking the blocked_lock_lock in many cases.
+ *
+ * The smp_load_acquire guarantees two things:
+ *
+ * 1/ that fl_blocked_requests can be tested locklessly. If something
+ * was recently added to that list it must have been in a locked region
+ * *before* the locked region when fl_blocker was set to NULL.
+ *
+ * 2/ that no other thread is accessing 'waiter', so it is safe to free
+ * it. __locks_wake_up_blocks is careful not to touch waiter after
+ * fl_blocker is released.
+ *
+ * If a lockless check of fl_blocker shows it to be NULL, we know that
+ * no new locks can be inserted into its fl_blocked_requests list, and
+ * can avoid doing anything further if the list is empty.
*/
- if (waiter->fl_blocker == NULL &&
+ if (!smp_load_acquire(&waiter->fl_blocker) &&
list_empty(&waiter->fl_blocked_requests))
return status;
+
spin_lock(&blocked_lock_lock);
if (waiter->fl_blocker)
status = 0;
__locks_wake_up_blocks(waiter);
__locks_delete_block(waiter);
+
+ /*
+ * The setting of fl_blocker to NULL marks the "done" point in deleting
+ * a block. Paired with acquire at the top of this function.
+ */
+ smp_store_release(&waiter->fl_blocker, NULL);
spin_unlock(&blocked_lock_lock);
return status;
}
@@ -1368,7 +1391,8 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
error = posix_lock_inode(inode, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+ error = wait_event_interruptible(fl->fl_wait,
+ list_empty(&fl->fl_blocked_member));
if (error)
break;
}
@@ -1453,7 +1477,8 @@ int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start,
error = posix_lock_inode(inode, &fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl.fl_wait, !fl.fl_blocker);
+ error = wait_event_interruptible(fl.fl_wait,
+ list_empty(&fl.fl_blocked_member));
if (!error) {
/*
* If we've been sleeping someone might have
@@ -1646,7 +1671,8 @@ restart:
locks_dispose_list(&dispose);
error = wait_event_interruptible_timeout(new_fl->fl_wait,
- !new_fl->fl_blocker, break_time);
+ list_empty(&new_fl->fl_blocked_member),
+ break_time);
percpu_down_read(&file_rwsem);
spin_lock(&ctx->flc_lock);
@@ -2057,7 +2083,8 @@ static int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl)
error = flock_lock_inode(inode, fl);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+ error = wait_event_interruptible(fl->fl_wait,
+ list_empty(&fl->fl_blocked_member));
if (error)
break;
}
@@ -2334,7 +2361,8 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
error = vfs_lock_file(filp, cmd, fl, NULL);
if (error != FILE_LOCK_DEFERRED)
break;
- error = wait_event_interruptible(fl->fl_wait, !fl->fl_blocker);
+ error = wait_event_interruptible(fl->fl_wait,
+ list_empty(&fl->fl_blocked_member));
if (error)
break;
}
@@ -2774,7 +2802,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
}
if (inode) {
/* userspace relies on this representation of dev_t */
- seq_printf(f, "%d %02x:%02x:%ld ", fl_pid,
+ seq_printf(f, "%d %02x:%02x:%lu ", fl_pid,
MAJOR(inode->i_sb->s_dev),
MINOR(inode->i_sb->s_dev), inode->i_ino);
} else {
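
The fl_blocker protocol introduced above, condensed into a sketch assembled from the hunks in this patch:

    /* Waker, holding blocked_lock_lock: unlink the waiter, then
     * release-store NULL so the unlink is published before the waiter
     * can observe fl_blocker == NULL.
     */
    list_del_init(&waiter->fl_blocked_member);
    smp_store_release(&waiter->fl_blocker, NULL);

    /* Waiter, lockless fast path: the acquire-load orders the
     * list_empty() check after the waker's unlink, so seeing NULL plus
     * an empty list means there is nothing left to clean up.
     */
    if (!smp_load_acquire(&waiter->fl_blocker) &&
        list_empty(&waiter->fl_blocked_requests))
            return status;
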
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f96073f25432..53337020d9d0 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -150,6 +150,25 @@ static int minix_remount (struct super_block * sb, int * flags, char * data)
return 0;
}
+static bool minix_check_superblock(struct super_block *sb)
+{
+ struct minix_sb_info *sbi = minix_sb(sb);
+
+ if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0)
+ return false;
+
+ /*
+ * s_max_size must not exceed the block mapping limitation. This check
+ * is only needed for V1 filesystems, since V2/V3 support an extra level
+ * of indirect blocks which places the limit well above U32_MAX.
+ */
+ if (sbi->s_version == MINIX_V1 &&
+ sb->s_maxbytes > (7 + 512 + 512*512) * BLOCK_SIZE)
+ return false;
+
+ return true;
+}
+
static int minix_fill_super(struct super_block *s, void *data, int silent)
{
struct buffer_head *bh;
@@ -185,7 +204,7 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
sbi->s_zmap_blocks = ms->s_zmap_blocks;
sbi->s_firstdatazone = ms->s_firstdatazone;
sbi->s_log_zone_size = ms->s_log_zone_size;
- sbi->s_max_size = ms->s_max_size;
+ s->s_maxbytes = ms->s_max_size;
s->s_magic = ms->s_magic;
if (s->s_magic == MINIX_SUPER_MAGIC) {
sbi->s_version = MINIX_V1;
@@ -216,7 +235,7 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
sbi->s_zmap_blocks = m3s->s_zmap_blocks;
sbi->s_firstdatazone = m3s->s_firstdatazone;
sbi->s_log_zone_size = m3s->s_log_zone_size;
- sbi->s_max_size = m3s->s_max_size;
+ s->s_maxbytes = m3s->s_max_size;
sbi->s_ninodes = m3s->s_ninodes;
sbi->s_nzones = m3s->s_zones;
sbi->s_dirsize = 64;
@@ -228,11 +247,12 @@ static int minix_fill_super(struct super_block *s, void *data, int silent)
} else
goto out_no_fs;
+ if (!minix_check_superblock(s))
+ goto out_illegal_sb;
+
/*
* Allocate the buffer map to keep the superblock small.
*/
- if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0)
- goto out_illegal_sb;
i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh);
map = kzalloc(i, GFP_KERNEL);
if (!map)
@@ -466,6 +486,13 @@ static struct inode *V1_minix_iget(struct inode *inode)
iget_failed(inode);
return ERR_PTR(-EIO);
}
+ if (raw_inode->i_nlinks == 0) {
+ printk("MINIX-fs: deleted inode referenced: %lu\n",
+ inode->i_ino);
+ brelse(bh);
+ iget_failed(inode);
+ return ERR_PTR(-ESTALE);
+ }
inode->i_mode = raw_inode->i_mode;
i_uid_write(inode, raw_inode->i_uid);
i_gid_write(inode, raw_inode->i_gid);
@@ -499,6 +526,13 @@ static struct inode *V2_minix_iget(struct inode *inode)
iget_failed(inode);
return ERR_PTR(-EIO);
}
+ if (raw_inode->i_nlinks == 0) {
+ printk("MINIX-fs: deleted inode referenced: %lu\n",
+ inode->i_ino);
+ brelse(bh);
+ iget_failed(inode);
+ return ERR_PTR(-ESTALE);
+ }
inode->i_mode = raw_inode->i_mode;
i_uid_write(inode, raw_inode->i_uid);
i_gid_write(inode, raw_inode->i_gid);
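
For reference, the V1 bound checked in minix_check_superblock() above is the full capacity of the V1 block map: 7 direct slots, 512 entries in the single-indirect block (V1 zone pointers are 16-bit, so 1024/2 per block), and 512*512 via the double-indirect level, at BLOCK_SIZE = 1024:

    (7 + 512 + 512*512) * 1024 = 262663 * 1024 = 268966912 bytes (~256 MiB)

A V1 s_max_size above this could never be mapped by block_to_path(), which is what the new check rejects.
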
diff --git a/fs/minix/itree_common.c b/fs/minix/itree_common.c
index 043c3fdbc8e7..446148792f41 100644
--- a/fs/minix/itree_common.c
+++ b/fs/minix/itree_common.c
@@ -75,6 +75,7 @@ static int alloc_branch(struct inode *inode,
int n = 0;
int i;
int parent = minix_new_block(inode);
+ int err = -ENOSPC;
branch[0].key = cpu_to_block(parent);
if (parent) for (n = 1; n < num; n++) {
@@ -85,6 +86,11 @@ static int alloc_branch(struct inode *inode,
break;
branch[n].key = cpu_to_block(nr);
bh = sb_getblk(inode->i_sb, parent);
+ if (!bh) {
+ minix_free_block(inode, nr);
+ err = -ENOMEM;
+ break;
+ }
lock_buffer(bh);
memset(bh->b_data, 0, bh->b_size);
branch[n].bh = bh;
@@ -103,7 +109,7 @@ static int alloc_branch(struct inode *inode,
bforget(branch[i].bh);
for (i = 0; i < n; i++)
minix_free_block(inode, block_to_cpu(branch[i].key));
- return -ENOSPC;
+ return err;
}
static inline int splice_branch(struct inode *inode,
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 046cc96ee7ad..1fed906042aa 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -29,12 +29,12 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
if (block < 0) {
printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
block, inode->i_sb->s_bdev);
- } else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) {
- if (printk_ratelimit())
- printk("MINIX-fs: block_to_path: "
- "block %ld too big on dev %pg\n",
- block, inode->i_sb->s_bdev);
- } else if (block < 7) {
+ return 0;
+ }
+ if ((u64)block * BLOCK_SIZE >= inode->i_sb->s_maxbytes)
+ return 0;
+
+ if (block < 7) {
offsets[n++] = block;
} else if ((block -= 7) < 512) {
offsets[n++] = 7;
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f7fc7ecccccc..9d00f31a2d9d 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -32,13 +32,12 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
if (block < 0) {
printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
block, sb->s_bdev);
- } else if ((u64)block * (u64)sb->s_blocksize >=
- minix_sb(sb)->s_max_size) {
- if (printk_ratelimit())
- printk("MINIX-fs: block_to_path: "
- "block %ld too big on dev %pg\n",
- block, sb->s_bdev);
- } else if (block < DIRCOUNT) {
+ return 0;
+ }
+ if ((u64)block * (u64)sb->s_blocksize >= sb->s_maxbytes)
+ return 0;
+
+ if (block < DIRCOUNT) {
offsets[n++] = block;
} else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
offsets[n++] = DIRCOUNT;
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index df081e8afcc3..168d45d3de73 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -32,7 +32,6 @@ struct minix_sb_info {
unsigned long s_zmap_blocks;
unsigned long s_firstdatazone;
unsigned long s_log_zone_size;
- unsigned long s_max_size;
int s_dirsize;
int s_namelen;
struct buffer_head ** s_imap;
diff --git a/fs/mpage.c b/fs/mpage.c
index 436a85260394..af6c93f3ef6b 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -62,7 +62,7 @@ static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio)
{
bio->bi_end_io = mpage_end_io;
bio_set_op_attrs(bio, op, op_flags);
- guard_bio_eod(op, bio);
+ guard_bio_eod(bio);
submit_bio(bio);
return NULL;
}
diff --git a/fs/namei.c b/fs/namei.c
index 20831c2fbb34..3508a571d042 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1007,7 +1007,8 @@ static int may_linkat(struct path *link)
* may_create_in_sticky - Check whether an O_CREAT open in a sticky directory
* should be allowed, or not, on files that already
* exist.
- * @dir: the sticky parent directory
+ * @dir_mode: mode bits of directory
+ * @dir_uid: owner of directory
* @inode: the inode of the file to open
*
* Block an O_CREAT open of a FIFO (or a regular file) when:
@@ -1023,18 +1024,18 @@ static int may_linkat(struct path *link)
*
* Returns 0 if the open is allowed, -ve on error.
*/
-static int may_create_in_sticky(struct dentry * const dir,
+static int may_create_in_sticky(umode_t dir_mode, kuid_t dir_uid,
struct inode * const inode)
{
if ((!sysctl_protected_fifos && S_ISFIFO(inode->i_mode)) ||
(!sysctl_protected_regular && S_ISREG(inode->i_mode)) ||
- likely(!(dir->d_inode->i_mode & S_ISVTX)) ||
- uid_eq(inode->i_uid, dir->d_inode->i_uid) ||
+ likely(!(dir_mode & S_ISVTX)) ||
+ uid_eq(inode->i_uid, dir_uid) ||
uid_eq(current_fsuid(), inode->i_uid))
return 0;
- if (likely(dir->d_inode->i_mode & 0002) ||
- (dir->d_inode->i_mode & 0020 &&
+ if (likely(dir_mode & 0002) ||
+ (dir_mode & 0020 &&
((sysctl_protected_fifos >= 2 && S_ISFIFO(inode->i_mode)) ||
(sysctl_protected_regular >= 2 && S_ISREG(inode->i_mode))))) {
return -EACCES;
@@ -1365,7 +1366,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
nd->path.dentry = parent;
nd->seq = seq;
if (unlikely(!path_connected(&nd->path)))
- return -ENOENT;
+ return -ECHILD;
break;
} else {
struct mount *mnt = real_mount(nd->path.mnt);
@@ -3256,6 +3257,8 @@ static int do_last(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
struct dentry *dir = nd->path.dentry;
+ kuid_t dir_uid = nd->inode->i_uid;
+ umode_t dir_mode = nd->inode->i_mode;
int open_flag = op->open_flag;
bool will_truncate = (open_flag & O_TRUNC) != 0;
bool got_write = false;
@@ -3391,7 +3394,7 @@ finish_open:
error = -EISDIR;
if (d_is_dir(nd->path.dentry))
goto out;
- error = may_create_in_sticky(dir,
+ error = may_create_in_sticky(dir_mode, dir_uid,
d_backing_inode(nd->path.dentry));
if (unlikely(error))
goto out;
diff --git a/fs/namespace.c b/fs/namespace.c
index 6cb11bd6ff9a..7083e1388150 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3655,8 +3655,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
/* make certain new is below the root */
if (!is_path_reachable(new_mnt, new.dentry, &root))
goto out4;
- root_mp->m_count++; /* pin it so it won't go away */
lock_mount_hash();
+ root_mp->m_count++; /* pin it so it won't go away */
detach_mnt(new_mnt, &parent_path);
detach_mnt(root_mnt, &root_parent);
if (root_mnt->mnt.mnt_flags & MNT_LOCKED) {
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 714ce5cbd08c..5b1b365939b5 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -103,7 +103,7 @@ config NFS_V4
config NFS_SWAP
bool "Provide swap over NFS support"
default n
- depends on NFS_FS
+ depends on NFS_FS && SWAP
select SUNRPC_SWAP
help
This option enables swapon to work on files located on NFS mounts.
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 315967354954..bcc51f131a49 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -130,6 +130,8 @@ static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp,
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
list_for_each_entry(lo, &server->layouts, plh_layouts) {
+ if (!pnfs_layout_is_valid(lo))
+ continue;
if (stateid != NULL &&
!nfs4_stateid_match_other(stateid, &lo->plh_stateid))
continue;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2ec71d2450c7..913a37fc3e11 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -151,6 +151,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init)
if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL)
goto error_0;
+ clp->cl_minorversion = cl_init->minorversion;
clp->cl_nfs_mod = cl_init->nfs_mod;
if (!try_module_get(clp->cl_nfs_mod->owner))
goto error_dealloc;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 071b90a45933..af549d70ec50 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -53,6 +53,16 @@ nfs4_is_valid_delegation(const struct nfs_delegation *delegation,
return false;
}
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode)
+{
+ struct nfs_delegation *delegation;
+
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (nfs4_is_valid_delegation(delegation, 0))
+ return delegation;
+ return NULL;
+}
+
static int
nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
{
@@ -1181,7 +1191,7 @@ bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode)
if (delegation != NULL &&
nfs4_stateid_match_other(dst, &delegation->stateid)) {
dst->seqid = delegation->stateid.seqid;
- return ret;
+ ret = true;
}
rcu_read_unlock();
out:
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 9eb87ae4c982..8b14d441e699 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -68,6 +68,7 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state,
bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags, nfs4_stateid *dst, const struct cred **cred);
bool nfs4_refresh_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
+struct nfs_delegation *nfs4_get_valid_delegation(const struct inode *inode);
void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
int nfs4_have_delegation(struct inode *inode, fmode_t flags);
int nfs4_check_delegation(struct inode *inode, fmode_t flags);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3321cc7a7ead..90d02dc8e15d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -158,6 +158,17 @@ typedef struct {
bool eof;
} nfs_readdir_descriptor_t;
+static
+void nfs_readdir_init_array(struct page *page)
+{
+ struct nfs_cache_array *array;
+
+ array = kmap_atomic(page);
+ memset(array, 0, sizeof(struct nfs_cache_array));
+ array->eof_index = -1;
+ kunmap_atomic(array);
+}
+
/*
* we are freeing strings created by nfs_add_to_readdir_array()
*/
@@ -170,6 +181,7 @@ void nfs_readdir_clear_array(struct page *page)
array = kmap_atomic(page);
for (i = 0; i < array->size; i++)
kfree(array->array[i].string.name);
+ array->size = 0;
kunmap_atomic(array);
}
@@ -606,6 +618,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
int status = -ENOMEM;
unsigned int array_size = ARRAY_SIZE(pages);
+ nfs_readdir_init_array(page);
+
entry.prev_cookie = 0;
entry.cookie = desc->last_cookie;
entry.eof = 0;
@@ -622,8 +636,6 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page,
}
array = kmap(page);
- memset(array, 0, sizeof(struct nfs_cache_array));
- array->eof_index = -1;
status = nfs_readdir_alloc_pages(pages, array_size);
if (status < 0)
@@ -678,6 +690,7 @@ int nfs_readdir_filler(void *data, struct page* page)
unlock_page(page);
return 0;
error:
+ nfs_readdir_clear_array(page);
unlock_page(page);
return ret;
}
@@ -685,8 +698,6 @@ int nfs_readdir_filler(void *data, struct page* page)
static
void cache_page_release(nfs_readdir_descriptor_t *desc)
{
- if (!desc->page->mapping)
- nfs_readdir_clear_array(desc->page);
put_page(desc->page);
desc->page = NULL;
}
@@ -700,19 +711,28 @@ struct page *get_cache_page(nfs_readdir_descriptor_t *desc)
/*
* Returns 0 if desc->dir_cookie was found on page desc->page_index
+ * and locks the page to prevent removal from the page cache.
*/
static
-int find_cache_page(nfs_readdir_descriptor_t *desc)
+int find_and_lock_cache_page(nfs_readdir_descriptor_t *desc)
{
int res;
desc->page = get_cache_page(desc);
if (IS_ERR(desc->page))
return PTR_ERR(desc->page);
-
- res = nfs_readdir_search_array(desc);
+ res = lock_page_killable(desc->page);
if (res != 0)
- cache_page_release(desc);
+ goto error;
+ res = -EAGAIN;
+ if (desc->page->mapping != NULL) {
+ res = nfs_readdir_search_array(desc);
+ if (res == 0)
+ return 0;
+ }
+ unlock_page(desc->page);
+error:
+ cache_page_release(desc);
return res;
}
@@ -727,7 +747,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
desc->last_cookie = 0;
}
do {
- res = find_cache_page(desc);
+ res = find_and_lock_cache_page(desc);
} while (res == -EAGAIN);
return res;
}
@@ -766,7 +786,6 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
desc->eof = true;
kunmap(desc->page);
- cache_page_release(desc);
dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n",
(unsigned long long)*desc->dir_cookie, res);
return res;
@@ -812,13 +831,13 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc)
status = nfs_do_filldir(desc);
+ out_release:
+ nfs_readdir_clear_array(desc->page);
+ cache_page_release(desc);
out:
dfprintk(DIRCACHE, "NFS: %s: returns %d\n",
__func__, status);
return status;
- out_release:
- cache_page_release(desc);
- goto out;
}
/* The file offset position represents the dirent entry number. A
@@ -883,6 +902,8 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
break;
res = nfs_do_filldir(desc);
+ unlock_page(desc->page);
+ cache_page_release(desc);
if (res < 0)
break;
} while (!desc->eof);
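
The find_and_lock_cache_page() change is an instance of a common page-cache pattern; a sketch using the same calls as the hunk above:

    /* Hold the page lock while reading the cached entry array so that
     * invalidation cannot free the strings underneath us.
     */
    res = lock_page_killable(page);         /* -EINTR on a fatal signal */
    if (res)
            return res;
    if (page->mapping == NULL) {            /* truncated while we slept */
            unlock_page(page);
            return -EAGAIN;                 /* caller retries the lookup */
    }
    /* ... safe to use the page until unlock_page() ... */
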
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 98a9a0bcdf38..1aae0606dd87 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -262,10 +262,10 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
data->ds_commit_index);
/* verifier not set so always fail */
- if (verfp->committed < 0)
+ if (verfp->committed < 0 || data->res.verf->committed <= NFS_UNSTABLE)
return 1;
- return nfs_direct_cmp_verf(verfp, &data->verf);
+ return nfs_direct_cmp_verf(verfp, data->res.verf);
}
/**
@@ -601,6 +601,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
result = PTR_ERR(l_ctx);
+ nfs_direct_req_release(dreq);
goto out_release;
}
dreq->l_ctx = l_ctx;
@@ -1031,6 +1032,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
result = PTR_ERR(l_ctx);
+ nfs_direct_req_release(dreq);
goto out_release;
}
dreq->l_ctx = l_ctx;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 95dc90570786..387a2cfa7e17 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -140,6 +140,7 @@ static int
nfs_file_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
+ errseq_t since;
dprintk("NFS: flush(%pD2)\n", file);
@@ -148,7 +149,9 @@ nfs_file_flush(struct file *file, fl_owner_t id)
return 0;
/* Flush writes to the server and return any errors */
- return nfs_wb_all(inode);
+ since = filemap_sample_wb_err(file->f_mapping);
+ nfs_wb_all(inode);
+ return filemap_check_wb_err(file->f_mapping, since);
}
ssize_t
@@ -580,12 +583,14 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
.page_mkwrite = nfs_vm_page_mkwrite,
};
-static int nfs_need_check_write(struct file *filp, struct inode *inode)
+static int nfs_need_check_write(struct file *filp, struct inode *inode,
+ int error)
{
struct nfs_open_context *ctx;
ctx = nfs_file_open_context(filp);
- if (nfs_ctx_key_to_expire(ctx, inode))
+ if (nfs_error_is_fatal_on_server(error) ||
+ nfs_ctx_key_to_expire(ctx, inode))
return 1;
return 0;
}
@@ -596,6 +601,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
struct inode *inode = file_inode(file);
unsigned long written = 0;
ssize_t result;
+ errseq_t since;
+ int error;
result = nfs_key_timeout_notify(file, inode);
if (result)
@@ -620,6 +627,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_pos > i_size_read(inode))
nfs_revalidate_mapping(inode, file->f_mapping);
+ since = filemap_sample_wb_err(file->f_mapping);
nfs_start_io_write(inode);
result = generic_write_checks(iocb, from);
if (result > 0) {
@@ -638,7 +646,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
/* Return error values */
- if (nfs_need_check_write(file, inode)) {
+ error = filemap_check_wb_err(file->f_mapping, since);
+ if (nfs_need_check_write(file, inode, error)) {
int err = nfs_wb_all(inode);
if (err < 0)
result = err;
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 38d915814221..eebaa24e1cb7 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -980,9 +980,8 @@ retry:
goto out_mds;
/* Use a direct mapping of ds_idx to pgio mirror_idx */
- if (WARN_ON_ONCE(pgio->pg_mirror_count !=
- FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
- goto out_mds;
+ if (pgio->pg_mirror_count != FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))
+ goto out_eagain;
for (i = 0; i < pgio->pg_mirror_count; i++) {
mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
@@ -1004,12 +1003,16 @@ retry:
(NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR))
pgio->pg_maxretrans = io_maxretrans;
return;
-
+out_eagain:
+ pnfs_generic_pg_cleanup(pgio);
+ pgio->pg_error = -EAGAIN;
+ return;
out_mds:
pnfs_put_lseg(pgio->pg_lseg);
pgio->pg_lseg = NULL;
pgio->pg_maxretrans = 0;
nfs_pageio_reset_write_mds(pgio);
+ pgio->pg_error = -EAGAIN;
}
static unsigned int
diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c
index 15f271401dcc..573b1da9342c 100644
--- a/fs/nfs/fscache-index.c
+++ b/fs/nfs/fscache-index.c
@@ -84,8 +84,10 @@ enum fscache_checkaux nfs_fscache_inode_check_aux(void *cookie_netfs_data,
return FSCACHE_CHECKAUX_OBSOLETE;
memset(&auxdata, 0, sizeof(auxdata));
- auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime);
- auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime);
+ auxdata.mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec;
+ auxdata.mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
+ auxdata.ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec;
+ auxdata.ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index 3800ab6f08fa..7d6721ec31d4 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -31,6 +31,7 @@ static DEFINE_SPINLOCK(nfs_fscache_keys_lock);
struct nfs_server_key {
struct {
uint16_t nfsversion; /* NFS protocol version */
+ uint32_t minorversion; /* NFSv4 minor version */
uint16_t family; /* address family */
__be16 port; /* IP port */
} hdr;
@@ -55,6 +56,7 @@ void nfs_fscache_get_client_cookie(struct nfs_client *clp)
memset(&key, 0, sizeof(key));
key.hdr.nfsversion = clp->rpc_ops->version;
+ key.hdr.minorversion = clp->cl_minorversion;
key.hdr.family = clp->cl_addr.ss_family;
switch (clp->cl_addr.ss_family) {
@@ -186,7 +188,8 @@ void nfs_fscache_get_super_cookie(struct super_block *sb, const char *uniq, int
/* create a cache index for looking up filehandles */
nfss->fscache = fscache_acquire_cookie(nfss->nfs_client->fscache,
&nfs_fscache_super_index_def,
- key, sizeof(*key) + ulen,
+ &key->key,
+ sizeof(key->key) + ulen,
NULL, 0,
nfss, 0, true);
dfprintk(FSCACHE, "NFS: get superblock cookie (0x%p/0x%p)\n",
@@ -224,6 +227,19 @@ void nfs_fscache_release_super_cookie(struct super_block *sb)
}
}
+static void nfs_fscache_update_auxdata(struct nfs_fscache_inode_auxdata *auxdata,
+ struct nfs_inode *nfsi)
+{
+ memset(auxdata, 0, sizeof(*auxdata));
+ auxdata->mtime_sec = nfsi->vfs_inode.i_mtime.tv_sec;
+ auxdata->mtime_nsec = nfsi->vfs_inode.i_mtime.tv_nsec;
+ auxdata->ctime_sec = nfsi->vfs_inode.i_ctime.tv_sec;
+ auxdata->ctime_nsec = nfsi->vfs_inode.i_ctime.tv_nsec;
+
+ if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
+ auxdata->change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
+}
+
/*
* Initialise the per-inode cache cookie pointer for an NFS inode.
*/
@@ -237,12 +253,7 @@ void nfs_fscache_init_inode(struct inode *inode)
if (!(nfss->fscache && S_ISREG(inode->i_mode)))
return;
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime);
- auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime);
-
- if (NFS_SERVER(&nfsi->vfs_inode)->nfs_client->rpc_ops->version == 4)
- auxdata.change_attr = inode_peek_iversion_raw(&nfsi->vfs_inode);
+ nfs_fscache_update_auxdata(&auxdata, nfsi);
nfsi->fscache = fscache_acquire_cookie(NFS_SB(inode->i_sb)->fscache,
&nfs_fscache_inode_object_def,
@@ -262,9 +273,7 @@ void nfs_fscache_clear_inode(struct inode *inode)
dfprintk(FSCACHE, "NFS: clear cookie (0x%p/0x%p)\n", nfsi, cookie);
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime);
- auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime);
+ nfs_fscache_update_auxdata(&auxdata, nfsi);
fscache_relinquish_cookie(cookie, &auxdata, false);
nfsi->fscache = NULL;
}
@@ -304,9 +313,7 @@ void nfs_fscache_open_file(struct inode *inode, struct file *filp)
if (!fscache_cookie_valid(cookie))
return;
- memset(&auxdata, 0, sizeof(auxdata));
- auxdata.mtime = timespec64_to_timespec(nfsi->vfs_inode.i_mtime);
- auxdata.ctime = timespec64_to_timespec(nfsi->vfs_inode.i_ctime);
+ nfs_fscache_update_auxdata(&auxdata, nfsi);
if (inode_is_open_for_write(inode)) {
dfprintk(FSCACHE, "NFS: nfsi 0x%p disabling cache\n", nfsi);
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index ad041cfbf9ec..6754c8607230 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -62,9 +62,11 @@ struct nfs_fscache_key {
* cache object.
*/
struct nfs_fscache_inode_auxdata {
- struct timespec mtime;
- struct timespec ctime;
- u64 change_attr;
+ s64 mtime_sec;
+ s64 mtime_nsec;
+ s64 ctime_sec;
+ s64 ctime_nsec;
+ u64 change_attr;
};
/*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 53777813ca95..82b440630d14 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -824,6 +824,8 @@ int nfs_getattr(const struct path *path, struct kstat *stat,
do_update |= cache_validity & NFS_INO_INVALID_ATIME;
if (request_mask & (STATX_CTIME|STATX_MTIME))
do_update |= cache_validity & NFS_INO_REVAL_PAGECACHE;
+ if (request_mask & STATX_BLOCKS)
+ do_update |= cache_validity & NFS_INO_INVALID_BLOCKS;
if (do_update) {
/* Update the attribute cache */
if (!(server->flags & NFS_MOUNT_NOAC))
@@ -957,16 +959,16 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
struct file *filp)
{
struct nfs_open_context *ctx;
- const struct cred *cred = get_current_cred();
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx) {
- put_cred(cred);
+ if (!ctx)
return ERR_PTR(-ENOMEM);
- }
nfs_sb_active(dentry->d_sb);
ctx->dentry = dget(dentry);
- ctx->cred = cred;
+ if (filp)
+ ctx->cred = get_cred(filp->f_cred);
+ else
+ ctx->cred = get_current_cred();
ctx->ll_cred = NULL;
ctx->state = NULL;
ctx->mode = f_mode;
@@ -1733,7 +1735,8 @@ out_noforce:
status = nfs_post_op_update_inode_locked(inode, fattr,
NFS_INO_INVALID_CHANGE
| NFS_INO_INVALID_CTIME
- | NFS_INO_INVALID_MTIME);
+ | NFS_INO_INVALID_MTIME
+ | NFS_INO_INVALID_BLOCKS);
return status;
}
@@ -1843,7 +1846,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
| NFS_INO_INVALID_ATIME
| NFS_INO_REVAL_FORCED
- | NFS_INO_REVAL_PAGECACHE);
+ | NFS_INO_REVAL_PAGECACHE
+ | NFS_INO_INVALID_BLOCKS);
/* Do atomic weak cache consistency updates */
nfs_wcc_update_inode(inode, fattr);
@@ -2004,8 +2008,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
} else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
inode->i_blocks = fattr->du.nfs2.blocks;
- else
+ else {
+ nfsi->cache_validity |= save_cache_validity &
+ (NFS_INO_INVALID_BLOCKS
+ | NFS_INO_REVAL_FORCED);
cache_revalidated = false;
+ }
/* Update attrtimeo value if we're out of the unstable period */
if (attr_changed) {
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 266da4cac411..1f2f6f45e174 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -29,9 +29,8 @@
*/
#define encode_dirpath_sz (1 + XDR_QUADLEN(MNTPATHLEN))
#define MNT_status_sz (1)
-#define MNT_fhs_status_sz (1)
#define MNT_fhandle_sz XDR_QUADLEN(NFS2_FHSIZE)
-#define MNT_fhandle3_sz (1 + XDR_QUADLEN(NFS3_FHSIZE))
+#define MNT_fhandlev3_sz XDR_QUADLEN(NFS3_FHSIZE)
#define MNT_authflav3_sz (1 + NFS_MAX_SECFLAVORS)
/*
@@ -39,7 +38,7 @@
*/
#define MNT_enc_dirpath_sz encode_dirpath_sz
#define MNT_dec_mountres_sz (MNT_status_sz + MNT_fhandle_sz)
-#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandle_sz + \
+#define MNT_dec_mountres3_sz (MNT_status_sz + MNT_fhandlev3_sz + \
MNT_authflav3_sz)
/*
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 572794dab4b1..696a13bfaff5 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -370,7 +370,7 @@ static void encode_sattr(struct xdr_stream *xdr, const struct iattr *attr,
} else
p = xdr_time_not_set(p);
if (attr->ia_valid & ATTR_MTIME_SET) {
- ts = timespec64_to_timespec(attr->ia_atime);
+ ts = timespec64_to_timespec(attr->ia_mtime);
xdr_encode_time(p, &ts);
} else if (attr->ia_valid & ATTR_MTIME) {
ts = timespec64_to_timespec(attr->ia_mtime);
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index c5c3fc6e6c60..26c94b32d6f4 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -253,37 +253,45 @@ int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl,
int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type)
{
- struct posix_acl *alloc = NULL, *dfacl = NULL;
+ struct posix_acl *orig = acl, *dfacl = NULL, *alloc;
int status;
if (S_ISDIR(inode->i_mode)) {
switch(type) {
case ACL_TYPE_ACCESS:
- alloc = dfacl = get_acl(inode, ACL_TYPE_DEFAULT);
+ alloc = get_acl(inode, ACL_TYPE_DEFAULT);
if (IS_ERR(alloc))
goto fail;
+ dfacl = alloc;
break;
case ACL_TYPE_DEFAULT:
- dfacl = acl;
- alloc = acl = get_acl(inode, ACL_TYPE_ACCESS);
+ alloc = get_acl(inode, ACL_TYPE_ACCESS);
if (IS_ERR(alloc))
goto fail;
+ dfacl = acl;
+ acl = alloc;
break;
}
}
if (acl == NULL) {
- alloc = acl = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
+ alloc = posix_acl_from_mode(inode->i_mode, GFP_KERNEL);
if (IS_ERR(alloc))
goto fail;
+ acl = alloc;
}
status = __nfs3_proc_setacls(inode, acl, dfacl);
- posix_acl_release(alloc);
+out:
+ if (acl != orig)
+ posix_acl_release(acl);
+ if (dfacl != orig)
+ posix_acl_release(dfacl);
return status;
fail:
- return PTR_ERR(alloc);
+ status = PTR_ERR(alloc);
+ goto out;
}
const struct xattr_handler *nfs3_xattr_handlers[] = {
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index abbbdde97e31..45a6691f0893 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -2338,6 +2338,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
void *data)
{
struct nfs_commitres *result = data;
+ struct nfs_writeverf *verf = result->verf;
enum nfs_stat status;
int error;
@@ -2350,7 +2351,9 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
result->op_status = status;
if (status != NFS3_OK)
goto out_status;
- error = decode_writeverf3(xdr, &result->verf->verifier);
+ error = decode_writeverf3(xdr, &verf->verifier);
+ if (!error)
+ verf->committed = NFS_FILE_SYNC;
out:
return error;
out_status:
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 5196bfa7894d..9b61c80a93e9 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -283,14 +283,14 @@ static ssize_t _nfs42_proc_copy(struct file *src,
status = handle_async_copy(res, server, src, dst,
&args->src_stateid);
if (status)
- return status;
+ goto out;
}
if ((!res->synchronous || !args->sync) &&
res->write_res.verifier.committed != NFS_FILE_SYNC) {
status = process_copy_commit(dst, pos_dst, res);
if (status)
- return status;
+ goto out;
}
truncate_pagecache_range(dst_inode, pos_dst,
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 235919156edd..c9113808e68e 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -439,9 +439,7 @@ extern void nfs4_schedule_state_renewal(struct nfs_client *);
extern void nfs4_renewd_prepare_shutdown(struct nfs_server *);
extern void nfs4_kill_renewd(struct nfs_client *);
extern void nfs4_renew_state(struct work_struct *);
-extern void nfs4_set_lease_period(struct nfs_client *clp,
- unsigned long lease,
- unsigned long lastrenewed);
+extern void nfs4_set_lease_period(struct nfs_client *clp, unsigned long lease);
/* nfs4state.c */
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 208a236dc235..8b96a4dd6c4a 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -216,7 +216,6 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
INIT_LIST_HEAD(&clp->cl_ds_clients);
rpc_init_wait_queue(&clp->cl_rpcwaitq, "NFS client");
clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED;
- clp->cl_minorversion = cl_init->minorversion;
clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
clp->cl_mig_gen = 1;
#if IS_ENABLED(CONFIG_NFS_V4_1)
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 336643b82188..f39651196a7e 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -86,7 +86,6 @@ nfs4_file_open(struct inode *inode, struct file *filp)
if (inode != d_inode(dentry))
goto out_drop;
- nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
nfs_file_set_open_context(filp, ctx);
nfs_fscache_open_file(inode, filp);
err = 0;
@@ -110,6 +109,7 @@ static int
nfs4_file_flush(struct file *file, fl_owner_t id)
{
struct inode *inode = file_inode(file);
+ errseq_t since;
dprintk("NFS: flush(%pD2)\n", file);
@@ -125,7 +125,9 @@ nfs4_file_flush(struct file *file, fl_owner_t id)
return filemap_fdatawrite(file->f_mapping);
/* Flush writes to the server and return any errors */
- return nfs_wb_all(inode);
+ since = filemap_sample_wb_err(file->f_mapping);
+ nfs_wb_all(inode);
+ return filemap_check_wb_err(file->f_mapping, since);
}
#ifdef CONFIG_NFS_V4_2
@@ -194,6 +196,9 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
if (remap_flags & ~REMAP_FILE_ADVISORY)
return -EINVAL;
+ if (IS_SWAPFILE(dst_inode) || IS_SWAPFILE(src_inode))
+ return -ETXTBSY;
+
/* check alignment w.r.t. clone_blksize */
ret = -EINVAL;
if (bs) {
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 2023011c7a8f..f3c17c8fb022 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -504,9 +504,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
case -NFS4ERR_DEADSESSION:
case -NFS4ERR_SEQ_FALSE_RETRY:
case -NFS4ERR_SEQ_MISORDERED:
- dprintk("%s ERROR: %d Reset session\n", __func__,
- errorcode);
- nfs4_schedule_session_recovery(clp->cl_session, errorcode);
+ /* Handled in nfs41_sequence_process() */
goto wait_on_recovery;
#endif /* defined(CONFIG_NFS_V4_1) */
case -NFS4ERR_FILE_OPEN:
@@ -759,12 +757,21 @@ static void nfs4_slot_sequence_acked(struct nfs4_slot *slot,
slot->seq_nr_last_acked = seqnr;
}
+static void nfs4_probe_sequence(struct nfs_client *client, const struct cred *cred,
+ struct nfs4_slot *slot)
+{
+ struct rpc_task *task = _nfs41_proc_sequence(client, cred, slot, true);
+ if (!IS_ERR(task))
+ rpc_put_task_async(task);
+}
+
static int nfs41_sequence_process(struct rpc_task *task,
struct nfs4_sequence_res *res)
{
struct nfs4_session *session;
struct nfs4_slot *slot = res->sr_slot;
struct nfs_client *clp;
+ int status;
int ret = 1;
if (slot == NULL)
@@ -774,16 +781,21 @@ static int nfs41_sequence_process(struct rpc_task *task,
goto out;
session = slot->table->session;
+ clp = session->clp;
trace_nfs4_sequence_done(session, res);
+
+ status = res->sr_status;
+ if (task->tk_status == -NFS4ERR_DEADSESSION)
+ status = -NFS4ERR_DEADSESSION;
+
/* Check the SEQUENCE operation status */
- switch (res->sr_status) {
+ switch (status) {
case 0:
/* Mark this sequence number as having been acked */
nfs4_slot_sequence_acked(slot, slot->seq_nr);
/* Update the slot's sequence and clientid lease timer */
slot->seq_done = 1;
- clp = session->clp;
do_renew_lease(clp, res->sr_timestamp);
/* Check sequence flags */
nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags,
@@ -831,10 +843,18 @@ static int nfs41_sequence_process(struct rpc_task *task,
/*
* Were one or more calls using this slot interrupted?
* If the server never received the request, then our
- * transmitted slot sequence number may be too high.
+ * transmitted slot sequence number may be too high. However,
+ * if the server did receive the request then it might
+ * accidentally give us a reply with a mismatched operation.
+ * We can sort this out by sending a lone sequence operation
+ * to the server on the same slot.
*/
if ((s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1) {
slot->seq_nr--;
+ if (task->tk_msg.rpc_proc != &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE]) {
+ nfs4_probe_sequence(clp, task->tk_msg.rpc_cred, slot);
+ res->sr_slot = NULL;
+ }
goto retry_nowait;
}
/*
@@ -849,6 +869,10 @@ static int nfs41_sequence_process(struct rpc_task *task,
*/
slot->seq_nr = slot->seq_nr_highest_sent;
goto out_retry;
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+ goto session_recover;
default:
/* Just update the slot sequence no. */
slot->seq_done = 1;
@@ -859,8 +883,10 @@ out:
out_noaction:
return ret;
session_recover:
- nfs4_schedule_session_recovery(session, res->sr_status);
- goto retry_nowait;
+ nfs4_schedule_session_recovery(session, status);
+ dprintk("%s ERROR: %d Reset session\n", __func__, status);
+ nfs41_sequence_free_slot(res);
+ goto out;
retry_new_seq:
++slot->seq_nr;
retry_nowait:
@@ -1406,8 +1432,6 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode,
return 0;
if ((delegation->type & fmode) != fmode)
return 0;
- if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
- return 0;
switch (claim) {
case NFS4_OPEN_CLAIM_NULL:
case NFS4_OPEN_CLAIM_FH:
@@ -1776,7 +1800,6 @@ static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmo
static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
{
struct nfs4_state *state = opendata->state;
- struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs_delegation *delegation;
int open_mode = opendata->o_arg.open_flags;
fmode_t fmode = opendata->o_arg.fmode;
@@ -1793,7 +1816,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
}
spin_unlock(&state->owner->so_lock);
rcu_read_lock();
- delegation = rcu_dereference(nfsi->delegation);
+ delegation = nfs4_get_valid_delegation(state->inode);
if (!can_open_delegated(delegation, fmode, claim)) {
rcu_read_unlock();
break;
@@ -2157,7 +2180,6 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
case -NFS4ERR_BAD_HIGH_SLOT:
case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
case -NFS4ERR_DEADSESSION:
- nfs4_schedule_session_recovery(server->nfs_client->cl_session, err);
return -EAGAIN;
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_STALE_STATEID:
@@ -2337,7 +2359,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
data->o_arg.open_flags, claim))
goto out_no_action;
rcu_read_lock();
- delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
+ delegation = nfs4_get_valid_delegation(data->state->inode);
if (can_open_delegated(delegation, data->o_arg.fmode, claim))
goto unlock_no_action;
rcu_read_unlock();
@@ -2923,10 +2945,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
struct nfs_server *server = sp->so_server;
struct dentry *dentry;
struct nfs4_state *state;
+ struct inode *dir = d_inode(opendata->dir);
+ unsigned long dir_verifier;
unsigned int seq;
int ret;
seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
+ dir_verifier = nfs_save_change_attribute(dir);
ret = _nfs4_proc_open(opendata, ctx);
if (ret != 0)
@@ -2954,8 +2979,19 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
dput(ctx->dentry);
ctx->dentry = dentry = alias;
}
- nfs_set_verifier(dentry,
- nfs_save_change_attribute(d_inode(opendata->dir)));
+ }
+
+ switch(opendata->o_arg.claim) {
+ default:
+ break;
+ case NFS4_OPEN_CLAIM_NULL:
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ case NFS4_OPEN_CLAIM_DELEGATE_PREV:
+ if (!opendata->rpc_done)
+ break;
+ if (opendata->o_res.delegation_type != 0)
+ dir_verifier = nfs_save_change_attribute(dir);
+ nfs_set_verifier(dentry, dir_verifier);
}
/* Parse layoutget results before we check for access */
@@ -3147,6 +3183,11 @@ static struct nfs4_state *nfs4_do_open(struct inode *dir,
exception.retry = 1;
continue;
}
+ if (status == -NFS4ERR_EXPIRED) {
+ nfs4_schedule_lease_recovery(server->nfs_client);
+ exception.retry = 1;
+ continue;
+ }
if (status == -EAGAIN) {
/* We must have found a delegation */
exception.retry = 1;
@@ -4931,16 +4972,13 @@ static int nfs4_do_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, str
struct nfs4_exception exception = {
.interruptible = true,
};
- unsigned long now = jiffies;
int err;
do {
err = _nfs4_do_fsinfo(server, fhandle, fsinfo);
trace_nfs4_fsinfo(server, fhandle, fsinfo->fattr, err);
if (err == 0) {
- nfs4_set_lease_period(server->nfs_client,
- fsinfo->lease_time * HZ,
- now);
+ nfs4_set_lease_period(server->nfs_client, fsinfo->lease_time * HZ);
break;
}
err = nfs4_handle_exception(server, err, &exception);
@@ -5203,7 +5241,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
hdr->timestamp = jiffies;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
- nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 1, 0);
+ nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
nfs4_state_protect_write(server->nfs_client, clnt, msg, hdr);
}
@@ -5691,8 +5729,6 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
return ret;
if (!(fattr.valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL))
return -ENOENT;
- if (buflen < label.len)
- return -ERANGE;
return 0;
}
@@ -5997,6 +6033,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
.callback_data = &setclientid,
.flags = RPC_TASK_TIMEOUT,
};
+ unsigned long now = jiffies;
int status;
/* nfs_client_id4 */
@@ -6029,10 +6066,14 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program,
}
status = task->tk_status;
if (setclientid.sc_cred) {
+ kfree(clp->cl_acceptor);
clp->cl_acceptor = rpcauth_stringify_acceptor(setclientid.sc_cred);
put_rpccred(setclientid.sc_cred);
}
rpc_put_task(task);
+
+ if (status == 0)
+ do_renew_lease(clp, now);
out:
trace_nfs4_setclientid(clp, status);
dprintk("NFS reply setclientid: %d\n", status);
@@ -6191,8 +6232,10 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data)
d_data = (struct nfs4_delegreturndata *)data;
- if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task))
+ if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) {
+ nfs4_sequence_done(task, &d_data->res.seq_res);
return;
+ }
lo = d_data->args.lr_args ? d_data->args.lr_args->layout : NULL;
if (lo && !pnfs_layout_is_valid(lo)) {
@@ -7712,10 +7755,26 @@ nfs41_same_server_scope(struct nfs41_server_scope *a,
static void
nfs4_bind_one_conn_to_session_done(struct rpc_task *task, void *calldata)
{
+ struct nfs41_bind_conn_to_session_args *args = task->tk_msg.rpc_argp;
+ struct nfs41_bind_conn_to_session_res *res = task->tk_msg.rpc_resp;
+ struct nfs_client *clp = args->client;
+
+ switch (task->tk_status) {
+ case -NFS4ERR_BADSESSION:
+ case -NFS4ERR_DEADSESSION:
+ nfs4_schedule_session_recovery(clp->cl_session,
+ task->tk_status);
+ }
+ if (args->dir == NFS4_CDFC4_FORE_OR_BOTH &&
+ res->dir != NFS4_CDFS4_BOTH) {
+ rpc_task_close_connection(task);
+ if (args->retries++ < MAX_BIND_CONN_TO_SESSION_RETRIES)
+ rpc_restart_call(task);
+ }
}
static const struct rpc_call_ops nfs4_bind_one_conn_to_session_ops = {
- .rpc_call_done = &nfs4_bind_one_conn_to_session_done,
+ .rpc_call_done = nfs4_bind_one_conn_to_session_done,
};
/*
@@ -7734,6 +7793,7 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt,
struct nfs41_bind_conn_to_session_args args = {
.client = clp,
.dir = NFS4_CDFC4_FORE_OR_BOTH,
+ .retries = 0,
};
struct nfs41_bind_conn_to_session_res res;
struct rpc_message msg = {
@@ -8071,6 +8131,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cre
struct rpc_task *task;
struct nfs41_exchange_id_args *argp;
struct nfs41_exchange_id_res *resp;
+ unsigned long now = jiffies;
int status;
task = nfs4_run_exchange_id(clp, cred, sp4_how, NULL);
@@ -8091,6 +8152,8 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, const struct cred *cre
if (status != 0)
goto out;
+ do_renew_lease(clp, now);
+
clp->cl_clientid = resp->clientid;
clp->cl_exchange_flags = resp->flags;
clp->cl_seqid = resp->seqid;
@@ -8758,8 +8821,6 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
case -NFS4ERR_BADSESSION:
case -NFS4ERR_DEADSESSION:
case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
- nfs4_schedule_session_recovery(clp->cl_session,
- task->tk_status);
break;
default:
nfs4_schedule_lease_recovery(clp);
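A note on the slot check above: the test `(s32)(slot->seq_nr - slot->seq_nr_last_acked) > 1` uses serial-number arithmetic, where the difference of two unsigned counters is reinterpreted as signed so the ordering stays correct even after the 32-bit counter wraps. A minimal user-space sketch of the idiom (illustrative only, not kernel code):

#include <stdio.h>
#include <stdint.h>

/* Wraparound-safe distance between two u32 sequence counters. */
static int32_t seq_distance(uint32_t newer, uint32_t older)
{
        return (int32_t)(newer - older);
}

int main(void)
{
        uint32_t acked = 0xfffffffeu;   /* counter about to wrap */
        uint32_t sent = 1;              /* already wrapped */

        /* A naive (sent > acked) comparison would claim we are behind;
         * the signed reinterpretation reports the true distance of +3. */
        printf("distance = %d\n", seq_distance(sent, acked));
        return 0;
}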
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 6ea431b067dd..ff876dda7f06 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -138,15 +138,12 @@ nfs4_kill_renewd(struct nfs_client *clp)
*
* @clp: pointer to nfs_client
* @lease: new value for lease period
- * @lastrenewed: time at which lease was last renewed
*/
void nfs4_set_lease_period(struct nfs_client *clp,
- unsigned long lease,
- unsigned long lastrenewed)
+ unsigned long lease)
{
spin_lock(&clp->cl_lock);
clp->cl_lease_time = lease;
- clp->cl_last_renewal = lastrenewed;
spin_unlock(&clp->cl_lock);
/* Cap maximum reconnect timeout at 1/2 lease period */
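With `lastrenewed` gone from nfs4_set_lease_period(), the renewal timestamp is handled at the call sites instead: SETCLIENTID and EXCHANGE_ID (in the nfs4proc.c hunks above) sample `jiffies` before issuing the RPC and pass that value to do_renew_lease() on success, so the lease is credited from transmission time rather than reply time. A hedged sketch of the general pattern, with hypothetical helper names:

#include <stdbool.h>
#include <time.h>

struct client { time_t last_renewal; };

/* Stand-in for the real RPC; assume it may block for a while. */
static bool send_renewing_rpc(struct client *clp) { (void)clp; return true; }

static bool renew(struct client *clp)
{
        time_t started = time(NULL);    /* sample before sending */

        if (!send_renewing_rpc(clp))
                return false;
        /* Credit the renewal from transmission time, not reply time,
         * so a slow reply can never inflate the remaining lease. */
        clp->last_renewal = started;
        return true;
}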
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 0e69cd846afb..6f7d16086785 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -290,17 +290,15 @@ static int nfs41_setup_state_renewal(struct nfs_client *clp)
{
int status;
struct nfs_fsinfo fsinfo;
- unsigned long now;
if (!test_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state)) {
nfs4_schedule_state_renewal(clp);
return 0;
}
- now = jiffies;
status = nfs4_proc_get_lease_time(clp, &fsinfo);
if (status == 0) {
- nfs4_set_lease_period(clp, fsinfo.lease_time * HZ, now);
+ nfs4_set_lease_period(clp, fsinfo.lease_time * HZ);
nfs4_schedule_state_renewal(clp);
}
@@ -735,9 +733,9 @@ nfs4_get_open_state(struct inode *inode, struct nfs4_state_owner *owner)
state = new;
state->owner = owner;
atomic_inc(&owner->so_count);
- list_add_rcu(&state->inode_states, &nfsi->open_states);
ihold(inode);
state->inode = inode;
+ list_add_rcu(&state->inode_states, &nfsi->open_states);
spin_unlock(&inode->i_lock);
/* Note: The reclaim code dictates that we add stateless
* and read-only stateids to the end of the list */
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index ff06820b9efb..d554baabef3e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4163,7 +4163,11 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
return -EIO;
if (len < NFS4_MAXLABELLEN) {
if (label) {
- memcpy(label->label, p, len);
+ if (label->len) {
+ if (label->len < len)
+ return -ERANGE;
+ memcpy(label->label, p, len);
+ }
label->len = len;
label->pi = pi;
label->lfs = lfs;
@@ -4310,11 +4314,14 @@ static int decode_write_verifier(struct xdr_stream *xdr, struct nfs_write_verifi
static int decode_commit(struct xdr_stream *xdr, struct nfs_commitres *res)
{
+ struct nfs_writeverf *verf = res->verf;
int status;
status = decode_op_hdr(xdr, OP_COMMIT);
if (!status)
- status = decode_write_verifier(xdr, &res->verf->verifier);
+ status = decode_write_verifier(xdr, &verf->verifier);
+ if (!status)
+ verf->committed = NFS_FILE_SYNC;
return status;
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index eae584dbfa08..64d87576a6c1 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -888,15 +888,6 @@ static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
pgio->pg_mirror_count = mirror_count;
}
-/*
- * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
- */
-void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
-{
- pgio->pg_mirror_count = 1;
- pgio->pg_mirror_idx = 0;
-}
-
static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
{
pgio->pg_mirror_count = 1;
@@ -1179,38 +1170,38 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (desc->pg_error < 0)
goto out_failed;
- for (midx = 0; midx < desc->pg_mirror_count; midx++) {
- if (midx) {
- nfs_page_group_lock(req);
+ /* Create the mirror instances first, and fire them off */
+ for (midx = 1; midx < desc->pg_mirror_count; midx++) {
+ nfs_page_group_lock(req);
- /* find the last request */
- for (lastreq = req->wb_head;
- lastreq->wb_this_page != req->wb_head;
- lastreq = lastreq->wb_this_page)
- ;
+ /* find the last request */
+ for (lastreq = req->wb_head;
+ lastreq->wb_this_page != req->wb_head;
+ lastreq = lastreq->wb_this_page)
+ ;
- dupreq = nfs_create_subreq(req, lastreq,
- pgbase, offset, bytes);
+ dupreq = nfs_create_subreq(req, lastreq,
+ pgbase, offset, bytes);
- nfs_page_group_unlock(req);
- if (IS_ERR(dupreq)) {
- desc->pg_error = PTR_ERR(dupreq);
- goto out_failed;
- }
- } else
- dupreq = req;
+ nfs_page_group_unlock(req);
+ if (IS_ERR(dupreq)) {
+ desc->pg_error = PTR_ERR(dupreq);
+ goto out_failed;
+ }
- if (nfs_pgio_has_mirroring(desc))
- desc->pg_mirror_idx = midx;
+ desc->pg_mirror_idx = midx;
if (!nfs_pageio_add_request_mirror(desc, dupreq))
goto out_cleanup_subreq;
}
+ desc->pg_mirror_idx = 0;
+ if (!nfs_pageio_add_request_mirror(desc, req))
+ goto out_failed;
+
return 1;
out_cleanup_subreq:
- if (req != dupreq)
- nfs_pageio_cleanup_request(desc, dupreq);
+ nfs_pageio_cleanup_request(desc, dupreq);
out_failed:
nfs_pageio_error_cleanup(desc);
return 0;
@@ -1322,6 +1313,14 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
}
}
+/*
+ * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
+ */
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
+{
+ nfs_pageio_complete(pgio);
+}
+
int __init nfs_init_nfspagecache(void)
{
nfs_page_cachep = kmem_cache_create("nfs_page",
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 33c2ef416564..9b58894eb9b7 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1188,31 +1188,27 @@ out:
return status;
}
+static bool
+pnfs_layout_segments_returnable(struct pnfs_layout_hdr *lo,
+ enum pnfs_iomode iomode,
+ u32 seq)
+{
+ struct pnfs_layout_range recall_range = {
+ .length = NFS4_MAX_UINT64,
+ .iomode = iomode,
+ };
+ return pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs,
+ &recall_range, seq) != -EBUSY;
+}
+
/* Return true if layoutreturn is needed */
static bool
pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
{
- struct pnfs_layout_segment *s;
- enum pnfs_iomode iomode;
- u32 seq;
-
if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
return false;
-
- seq = lo->plh_return_seq;
- iomode = lo->plh_return_iomode;
-
- /* Defer layoutreturn until all recalled lsegs are done */
- list_for_each_entry(s, &lo->plh_segs, pls_list) {
- if (seq && pnfs_seqid_is_newer(s->pls_seq, seq))
- continue;
- if (iomode != IOMODE_ANY && s->pls_range.iomode != iomode)
- continue;
- if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
- return false;
- }
-
- return true;
+ return pnfs_layout_segments_returnable(lo, lo->plh_return_iomode,
+ lo->plh_return_seq);
}
static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
@@ -1415,7 +1411,7 @@ retry:
/* lo ref dropped in pnfs_roc_release() */
layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
/* If the creds don't match, we can't compound the layoutreturn */
- if (!layoutreturn || cred != lo->plh_lc_cred)
+ if (!layoutreturn || cred_fscmp(cred, lo->plh_lc_cred) != 0)
goto out_noroc;
roc = layoutreturn;
@@ -2301,16 +2297,6 @@ out_forget:
return ERR_PTR(-EAGAIN);
}
-static int
-mark_lseg_invalid_or_return(struct pnfs_layout_segment *lseg,
- struct list_head *tmp_list)
-{
- if (!mark_lseg_invalid(lseg, tmp_list))
- return 0;
- pnfs_cache_lseg_for_layoutreturn(lseg->pls_layout, lseg);
- return 1;
-}
-
/**
* pnfs_mark_matching_lsegs_return - Free or return matching layout segments
* @lo: pointer to layout header
@@ -2347,7 +2333,7 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
lseg, lseg->pls_range.iomode,
lseg->pls_range.offset,
lseg->pls_range.length);
- if (mark_lseg_invalid_or_return(lseg, tmp_list))
+ if (mark_lseg_invalid(lseg, tmp_list))
continue;
remaining++;
set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index c0046c348910..2cbe2bc99d1b 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -31,12 +31,11 @@ EXPORT_SYMBOL_GPL(pnfs_generic_rw_release);
/* Fake up some data that will cause nfs_commit_release to retry the writes. */
void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data)
{
- struct nfs_page *first = nfs_list_entry(data->pages.next);
+ struct nfs_writeverf *verf = data->res.verf;
data->task.tk_status = 0;
- memcpy(&data->verf.verifier, &first->wb_verf,
- sizeof(data->verf.verifier));
- data->verf.verifier.data[0]++; /* ensure verifier mismatch */
+ memset(&verf->verifier, 0, sizeof(verf->verifier));
+ verf->committed = NFS_UNSTABLE;
}
EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index ee6932c9819e..0e6afe72f6ce 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -243,13 +243,24 @@ out:
/* A writeback failed: mark the page as bad, and invalidate the page cache */
static void nfs_set_pageerror(struct address_space *mapping)
{
+ struct inode *inode = mapping->host;
+
nfs_zap_mapping(mapping->host, mapping);
+ /* Force file size revalidation */
+ spin_lock(&inode->i_lock);
+ NFS_I(inode)->cache_validity |= NFS_INO_REVAL_FORCED |
+ NFS_INO_REVAL_PAGECACHE |
+ NFS_INO_INVALID_SIZE;
+ spin_unlock(&inode->i_lock);
}
static void nfs_mapping_set_error(struct page *page, int error)
{
+ struct address_space *mapping = page_file_mapping(page);
+
SetPageError(page);
- mapping_set_error(page_file_mapping(page), error);
+ mapping_set_error(mapping, error);
+ nfs_set_pageerror(mapping);
}
/*
@@ -433,6 +444,7 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
}
subreq->wb_head = subreq;
+ nfs_release_request(old_head);
if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
nfs_release_request(subreq);
@@ -592,7 +604,6 @@ release_request:
static void nfs_write_error(struct nfs_page *req, int error)
{
- nfs_set_pageerror(page_file_mapping(req->wb_page));
nfs_mapping_set_error(req->wb_page, error);
nfs_inode_remove_request(req);
nfs_end_page_writeback(req);
@@ -789,7 +800,6 @@ static void nfs_inode_remove_request(struct nfs_page *req)
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_page *head;
- atomic_long_dec(&nfsi->nrequests);
if (nfs_page_group_sync_on_bit(req, PG_REMOVE)) {
head = req->wb_head;
@@ -802,8 +812,10 @@ static void nfs_inode_remove_request(struct nfs_page *req)
spin_unlock(&mapping->private_lock);
}
- if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags))
+ if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
nfs_release_request(req);
+ atomic_long_dec(&nfsi->nrequests);
+ }
}
static void
@@ -1000,7 +1012,6 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
nfs_list_remove_request(req);
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) &&
(hdr->good_bytes < bytes)) {
- nfs_set_pageerror(page_file_mapping(req->wb_page));
nfs_mapping_set_error(req->wb_page, hdr->error);
goto remove_req;
}
@@ -1831,6 +1842,7 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata)
static void nfs_commit_release_pages(struct nfs_commit_data *data)
{
+ const struct nfs_writeverf *verf = data->res.verf;
struct nfs_page *req;
int status = data->task.tk_status;
struct nfs_commit_info cinfo;
@@ -1858,7 +1870,8 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* Okay, COMMIT succeeded, apparently. Check the verifier
* returned by the server against all stored verfs. */
- if (!nfs_write_verifier_cmp(&req->wb_verf, &data->verf.verifier)) {
+ if (verf->committed > NFS_UNSTABLE &&
+ !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier)) {
/* We have a match */
if (req->wb_page)
nfs_inode_remove_request(req);
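Taken together with the pnfs_generic_prepare_to_resend_writes() change above (verifier zeroed, committed set to NFS_UNSTABLE), the `verf->committed > NFS_UNSTABLE` guard ensures a faked-up commit result can never pass the verifier match. The decision, condensed into a standalone sketch with stand-in types:

#include <stdbool.h>
#include <string.h>

enum stability { NFS_UNSTABLE, NFS_DATA_SYNC, NFS_FILE_SYNC };

struct writeverf {
        unsigned char data[8];
        enum stability committed;
};

/* A request is settled only if the server really committed the data
 * and its verifier matches the one stored when the page was written;
 * an NFS_UNSTABLE result therefore always forces a resend. */
static bool commit_settled(const struct writeverf *res,
                           const unsigned char stored[8])
{
        return res->committed > NFS_UNSTABLE &&
               memcmp(res->data, stored, 8) == 0;
}

Zeroing the verifier alone would not be enough, since a stored verifier could legitimately be all zeroes; the stability gate is what makes the resend unconditional.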
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index d25f6bbe7006..5f6cad45dcee 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -72,6 +72,7 @@ config NFSD_V4
select NFSD_V3
select FS_POSIX_ACL
select SUNRPC_GSS
+ select CRYPTO_MD5
select CRYPTO
select GRACE_PERIOD
help
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 397eb7820929..5703e7898369 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -1235,6 +1235,8 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb)
err = setup_callback_client(clp, &conn, ses);
if (err) {
nfsd4_mark_cb_down(clp, err);
+ if (c)
+ svc_xprt_put(c->cn_xprt);
return;
}
}
@@ -1246,6 +1248,7 @@ nfsd4_run_cb_work(struct work_struct *work)
container_of(work, struct nfsd4_callback, cb_work);
struct nfs4_client *clp = cb->cb_clp;
struct rpc_clnt *clnt;
+ int flags;
if (cb->cb_need_restart) {
cb->cb_need_restart = false;
@@ -1274,7 +1277,8 @@ nfsd4_run_cb_work(struct work_struct *work)
}
cb->cb_msg.rpc_cred = clp->cl_cb_cred;
- rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+ flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN;
+ rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags,
cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb);
}
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index a79e24b79095..8ff07b13f685 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -675,7 +675,7 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
/* Client gets 2 lease periods to return it */
cutoff = ktime_add_ns(task->tk_start,
- nn->nfsd4_lease * NSEC_PER_SEC * 2);
+ (u64)nn->nfsd4_lease * NSEC_PER_SEC * 2);
if (ktime_before(now, cutoff)) {
rpc_delay(task, HZ/100); /* 10 milliseconds */
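The `(u64)` cast above is the whole fix: `nn->nfsd4_lease * NSEC_PER_SEC * 2` is otherwise evaluated in 32-bit arithmetic on 32-bit kernels and overflows for any realistic lease time. A standalone demonstration of the promotion rule (plain C, unsigned so the wraparound is well-defined):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t lease = 90;                    /* seconds */
        uint32_t nsec_per_sec = 1000000000u;

        /* The multiply happens in 32 bits and wraps before the
         * widening assignment can save it. */
        uint64_t wrong = lease * nsec_per_sec * 2u;
        /* Widening the first operand forces 64-bit arithmetic. */
        uint64_t right = (uint64_t)lease * nsec_per_sec * 2;

        printf("wrong=%llu right=%llu\n",
               (unsigned long long)wrong, (unsigned long long)right);
        return 0;
}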
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 8beda999e134..bff38e0ff0cf 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1083,7 +1083,8 @@ nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
goto out;
status = nfsd4_clone_file_range(src, clone->cl_src_pos,
- dst, clone->cl_dst_pos, clone->cl_count);
+ dst, clone->cl_dst_pos, clone->cl_count,
+ EX_ISSYNC(cstate->current_fh.fh_export));
fput(dst);
fput(src);
@@ -1302,7 +1303,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
out:
return status;
out_err:
- cleanup_async_copy(async_copy);
+ if (async_copy)
+ cleanup_async_copy(async_copy);
goto out;
}
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1a0cdeb3b875..05a76b82df08 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -253,6 +253,8 @@ find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
if (!nbl) {
nbl= kmalloc(sizeof(*nbl), GFP_KERNEL);
if (nbl) {
+ INIT_LIST_HEAD(&nbl->nbl_list);
+ INIT_LIST_HEAD(&nbl->nbl_lru);
fh_copy_shallow(&nbl->nbl_fh, fh);
locks_init_lock(&nbl->nbl_lock);
nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client,
@@ -3128,12 +3130,17 @@ static bool replay_matches_cache(struct svc_rqst *rqstp,
(bool)seq->cachethis)
return false;
/*
- * If there's an error than the reply can have fewer ops than
- * the call. But if we cached a reply with *more* ops than the
- * call you're sending us now, then this new call is clearly not
- * really a replay of the old one:
+ * If there's an error then the reply can have fewer ops than
+ * the call.
*/
- if (slot->sl_opcnt < argp->opcnt)
+ if (slot->sl_opcnt < argp->opcnt && !slot->sl_status)
+ return false;
+ /*
+ * But if we cached a reply with *more* ops than the call you're
+ * sending us now, then this new call is clearly not really a
+ * replay of the old one:
+ */
+ if (slot->sl_opcnt > argp->opcnt)
return false;
/* This is the only check explicitly called by spec: */
if (!same_creds(&rqstp->rq_cred, &slot->sl_cred))
@@ -6133,7 +6140,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
if (fl_flags & FL_SLEEP) {
- nbl->nbl_time = jiffies;
+ nbl->nbl_time = get_seconds();
spin_lock(&nn->blocked_locks_lock);
list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked);
list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 52c4f6daa649..7a1cc8b77d42 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3535,17 +3535,17 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
u32 zzz = 0;
int pad;
+ /*
+ * svcrdma requires every READ payload to start somewhere
+ * in xdr->pages.
+ */
+ if (xdr->iov == xdr->buf->head) {
+ xdr->iov = NULL;
+ xdr->end = xdr->p;
+ }
+
len = maxcount;
v = 0;
-
- thislen = min_t(long, len, ((void *)xdr->end - (void *)xdr->p));
- p = xdr_reserve_space(xdr, (thislen+3)&~3);
- WARN_ON_ONCE(!p);
- resp->rqstp->rq_vec[v].iov_base = p;
- resp->rqstp->rq_vec[v].iov_len = thislen;
- v++;
- len -= thislen;
-
while (len) {
thislen = min_t(long, len, PAGE_SIZE);
p = xdr_reserve_space(xdr, (thislen+3)&~3);
@@ -3563,6 +3563,8 @@ static __be32 nfsd4_encode_readv(struct nfsd4_compoundres *resp,
read->rd_length = maxcount;
if (nfserr)
return nfserr;
+ if (svc_encode_read_payload(resp->rqstp, starting_len + 8, maxcount))
+ return nfserr_io;
xdr_truncate_encode(xdr, starting_len + 8 + ((maxcount+3)&~3));
eof = nfsd_eof_on_read(len, maxcount, read->rd_offset,
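The `(thislen+3)&~3` expression that recurs throughout this encoder is the usual XDR round-up: opaque data is padded to a multiple of four bytes on the wire. In isolation (plain C, not the kernel's xdr helpers):

#include <stdio.h>

/* Round len up to the next XDR quad (4-byte) boundary. */
static unsigned int xdr_pad(unsigned int len)
{
        return (len + 3) & ~3u;
}

int main(void)
{
        for (unsigned int len = 0; len <= 9; len++)
                printf("%u -> %u\n", len, xdr_pad(len)); /* 0,4,4,4,4,8,... */
        return 0;
}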
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index da52b594362a..f9dbb177b236 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -476,8 +476,7 @@ found_entry:
rtn = RC_REPLY;
break;
default:
- printk(KERN_WARNING "nfsd: bad repcache type %d\n", rp->c_type);
- nfsd_reply_cache_free_locked(b, rp);
+ WARN_ONCE(1, "nfsd: bad repcache type %d\n", rp->c_type);
}
goto out;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 18d94ea984ba..7f938bcb927d 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -94,12 +94,11 @@ static const struct svc_version *nfsd_acl_version[] = {
#define NFSD_ACL_MINVERS 2
#define NFSD_ACL_NRVERS ARRAY_SIZE(nfsd_acl_version)
-static const struct svc_version *nfsd_acl_versions[NFSD_ACL_NRVERS];
static struct svc_program nfsd_acl_program = {
.pg_prog = NFS_ACL_PROGRAM,
.pg_nvers = NFSD_ACL_NRVERS,
- .pg_vers = nfsd_acl_versions,
+ .pg_vers = nfsd_acl_version,
.pg_name = "nfsacl",
.pg_class = "nfsd",
.pg_stats = &nfsd_acl_svcstats,
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 0b74d371ed67..1bd2882d7860 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -595,7 +595,7 @@ static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b)
struct nfsd4_blocked_lock {
struct list_head nbl_list;
struct list_head nbl_lru;
- unsigned long nbl_time;
+ time_t nbl_time;
struct file_lock nbl_lock;
struct knfsd_fh nbl_fh;
struct nfsd4_callback nbl_cb;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index fc24ee47eab5..c238b0d50eff 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -307,19 +307,25 @@ out:
* Commit metadata changes to stable storage.
*/
static int
-commit_metadata(struct svc_fh *fhp)
+commit_inode_metadata(struct inode *inode)
{
- struct inode *inode = d_inode(fhp->fh_dentry);
const struct export_operations *export_ops = inode->i_sb->s_export_op;
- if (!EX_ISSYNC(fhp->fh_export))
- return 0;
-
if (export_ops->commit_metadata)
return export_ops->commit_metadata(inode);
return sync_inode_metadata(inode, 1);
}
+static int
+commit_metadata(struct svc_fh *fhp)
+{
+ struct inode *inode = d_inode(fhp->fh_dentry);
+
+ if (!EX_ISSYNC(fhp->fh_export))
+ return 0;
+ return commit_inode_metadata(inode);
+}
+
/*
* Go over the attributes and take care of the small differences between
* NFS semantics and what Linux expects.
@@ -552,7 +558,7 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
#endif
__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
- u64 dst_pos, u64 count)
+ u64 dst_pos, u64 count, bool sync)
{
loff_t cloned;
@@ -561,6 +567,15 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
return nfserrno(cloned);
if (count && cloned != count)
return nfserrno(-EINVAL);
+ if (sync) {
+ loff_t dst_end = count ? dst_pos + count - 1 : LLONG_MAX;
+ int status = vfs_fsync_range(dst, dst_pos, dst_end, 0);
+
+ if (!status)
+ status = commit_inode_metadata(file_inode(src));
+ if (status < 0)
+ return nfserrno(status);
+ }
return 0;
}
@@ -1022,6 +1037,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
host_err = vfs_iter_write(file, &iter, &pos, flags);
if (host_err < 0)
goto out_nfserr;
+ *cnt = host_err;
nfsdstats.io_write += *cnt;
fsnotify_modify(file);
@@ -1211,6 +1227,9 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp,
iap->ia_mode = 0;
iap->ia_mode = (iap->ia_mode & S_IALLUGO) | type;
+ if (!IS_POSIXACL(dirp))
+ iap->ia_mode &= ~current_umask();
+
err = 0;
host_err = 0;
switch (type) {
@@ -1443,6 +1462,9 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
goto out;
}
+ if (!IS_POSIXACL(dirp))
+ iap->ia_mode &= ~current_umask();
+
host_err = vfs_create(dirp, dchild, iap->ia_mode, true);
if (host_err < 0) {
fh_drop_write(fhp);
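The two `iap->ia_mode &= ~current_umask()` additions above make nfsd clear umask bits itself when the target filesystem does not use POSIX ACLs (ACL-aware filesystems handle the umask internally). The operation is a plain mask-and-clear; for example:

#include <stdio.h>

int main(void)
{
        unsigned int mode = 0666;       /* requested create mode */
        unsigned int umask_bits = 022;  /* typical umask */

        mode &= ~umask_bits;            /* clear forbidden bits */
        printf("%o\n", mode);           /* prints 644 */
        return 0;
}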
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index db351247892d..02b0a140af8c 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -58,7 +58,7 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
__be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
struct file *, loff_t, loff_t, int);
__be32 nfsd4_clone_file_range(struct file *, u64, struct file *,
- u64, u64);
+ u64, u64, bool);
#endif /* CONFIG_NFSD_V4 */
__be32 nfsd_create_locked(struct svc_rqst *, struct svc_fh *,
char *name, int len, struct iattr *attrs,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 445eef41bfaf..91b58c897f92 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -2780,6 +2780,8 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
if (!nilfs->ns_writer)
return -ENOMEM;
+ inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL);
+
err = nilfs_segctor_start_thread(nilfs->ns_writer);
if (err) {
kfree(nilfs->ns_writer);
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 5778d1347b35..d24548ed31b9 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -26,7 +26,7 @@ static bool should_merge(struct fsnotify_event *old_fsn,
old = FANOTIFY_E(old_fsn);
new = FANOTIFY_E(new_fsn);
- if (old_fsn->inode != new_fsn->inode || old->pid != new->pid ||
+ if (old_fsn->objectid != new_fsn->objectid || old->pid != new->pid ||
old->fh_type != new->fh_type || old->fh_len != new->fh_len)
return false;
@@ -171,6 +171,17 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
if (!fsnotify_iter_should_report_type(iter_info, type))
continue;
mark = iter_info->marks[type];
+
+ /* Apply ignore mask regardless of ISDIR and ON_CHILD flags */
+ marks_ignored_mask |= mark->ignored_mask;
+
+ /*
+ * If the event is on dir and this mark doesn't care about
+ * events on dir, don't send it!
+ */
+ if (event_mask & FS_ISDIR && !(mark->mask & FS_ISDIR))
+ continue;
+
/*
* If the event is for a child and this mark doesn't care about
* events on a child, don't send it!
@@ -181,7 +192,6 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
continue;
marks_mask |= mark->mask;
- marks_ignored_mask |= mark->ignored_mask;
}
test_mask = event_mask & marks_mask & ~marks_ignored_mask;
@@ -203,10 +213,6 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group,
user_mask &= ~FAN_ONDIR;
}
- if (event_mask & FS_ISDIR &&
- !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
- return 0;
-
return test_mask & user_mask;
}
@@ -314,7 +320,12 @@ struct fanotify_event *fanotify_alloc_event(struct fsnotify_group *group,
if (!event)
goto out;
init: __maybe_unused
- fsnotify_init_event(&event->fse, inode);
+ /*
+ * Use the victim inode instead of the watching inode as the id for
+ * the event queue, so an event reported on the parent is merged with
+ * an event reported on the child when both directory and child
+ * watches exist.
+ */
+ fsnotify_init_event(&event->fse, (unsigned long)id);
event->mask = mask;
if (FAN_GROUP_FLAG(group, FAN_REPORT_TID))
event->pid = get_pid(task_pid(current));
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 4eb2ebfac468..ae9a4d1e4c71 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -57,6 +57,9 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
* doing an __iget/iput with SB_ACTIVE clear would actually
* evict all inodes with zero i_count from icache which is
* unnecessarily violent and may in fact be illegal to do.
+ * However, we should have been called /after/ evict_inodes
+ * removed all zero refcount inodes, in any case. Test to
+ * be sure.
*/
if (!atomic_read(&inode->i_count)) {
spin_unlock(&inode->i_lock);
@@ -77,6 +80,7 @@ static void fsnotify_unmount_inodes(struct super_block *sb)
iput_inode = inode;
+ cond_resched();
spin_lock(&sb->s_inode_list_lock);
}
spin_unlock(&sb->s_inode_list_lock);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index d510223d302c..589dee962993 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -39,7 +39,7 @@ static bool event_compare(struct fsnotify_event *old_fsn,
if (old->mask & FS_IN_IGNORED)
return false;
if ((old->mask == new->mask) &&
- (old_fsn->inode == new_fsn->inode) &&
+ (old_fsn->objectid == new_fsn->objectid) &&
(old->name_len == new->name_len) &&
(!old->name_len || !strcmp(old->name, new->name)))
return true;
@@ -118,7 +118,7 @@ int inotify_handle_event(struct fsnotify_group *group,
mask &= ~IN_ISDIR;
fsn_event = &event->fse;
- fsnotify_init_event(fsn_event, inode);
+ fsnotify_init_event(fsn_event, (unsigned long)inode);
event->mask = mask;
event->wd = i_mark->wd;
event->sync_cookie = cookie;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index cce8de32779f..3f8af1631fbf 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -628,7 +628,7 @@ static struct fsnotify_group *inotify_new_group(unsigned int max_events)
return ERR_PTR(-ENOMEM);
}
group->overflow_event = &oevent->fse;
- fsnotify_init_event(group->overflow_event, NULL);
+ fsnotify_init_event(group->overflow_event, 0);
oevent->mask = FS_Q_OVERFLOW;
oevent->wd = -1;
oevent->sync_cookie = 0;
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 3e7da392aa6f..bb981ec76456 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -327,8 +327,8 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh)
down_read(&OCFS2_I(inode)->ip_xattr_sem);
acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh);
up_read(&OCFS2_I(inode)->ip_xattr_sem);
- if (IS_ERR(acl) || !acl)
- return PTR_ERR(acl);
+ if (IS_ERR_OR_NULL(acl))
+ return PTR_ERR_OR_ZERO(acl);
ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode);
if (ret)
return ret;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index d1348fc4ca6d..6984ec05d8d5 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -7385,6 +7385,10 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_inline_data *idata = &di->id2.i_data;
+ /* No need to punch hole beyond i_size. */
+ if (start >= i_size_read(inode))
+ return 0;
+
if (end > i_size_read(inode))
end = i_size_read(inode);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index a4c905d6b575..9b827143a350 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2042,7 +2042,8 @@ out_write_size:
inode->i_mtime = inode->i_ctime = current_time(inode);
di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
- ocfs2_update_inode_fsync_trans(handle, inode, 1);
+ if (handle)
+ ocfs2_update_inode_fsync_trans(handle, inode, 1);
}
if (handle)
ocfs2_journal_dirty(handle, wc->w_di_bh);
@@ -2139,13 +2140,30 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
struct ocfs2_dio_write_ctxt *dwc = NULL;
struct buffer_head *di_bh = NULL;
u64 p_blkno;
- loff_t pos = iblock << inode->i_sb->s_blocksize_bits;
+ unsigned int i_blkbits = inode->i_sb->s_blocksize_bits;
+ loff_t pos = iblock << i_blkbits;
+ sector_t endblk = (i_size_read(inode) - 1) >> i_blkbits;
unsigned len, total_len = bh_result->b_size;
int ret = 0, first_get_block = 0;
len = osb->s_clustersize - (pos & (osb->s_clustersize - 1));
len = min(total_len, len);
+ /*
+ * bh_result->b_size is counted in get_more_blocks according to the
+ * write "pos" and "end". We may need to map twice to return different
+ * buffer states:
+ * 1. area within file size: do not set NEW;
+ * 2. area beyond file size: set NEW.
+ *
+ * iblock endblk
+ * |--------|---------|---------|---------
+ * |<-------area in file------->|
+ */
+
+ if ((iblock <= endblk) &&
+ ((iblock + ((len - 1) >> i_blkbits)) > endblk))
+ len = (endblk - iblock + 1) << i_blkbits;
+
mlog(0, "get block of %lu at %llu:%u req %u\n",
inode->i_ino, pos, len, total_len);
@@ -2229,6 +2247,9 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock,
if (desc->c_needs_zero)
set_buffer_new(bh_result);
+ if (iblock > endblk)
+ set_buffer_new(bh_result);
+
/* May sleep in end_io. It should not happen in an irq context. So defer
* it to dio work queue. */
set_buffer_defer_completion(bh_result);
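The i_size clamp above can be sanity-checked with plain arithmetic: when a mapping request straddles end-of-file, the length is trimmed to the in-file blocks so the in-file and beyond-file areas get mapped in separate calls with different NEW states. A worked example with illustrative values (not kernel code):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        unsigned int blkbits = 12;                 /* 4 KiB blocks */
        uint64_t i_size = 20480;                   /* 5 blocks */
        uint64_t iblock = 3, len = 16384;          /* blocks 3..6 requested */
        uint64_t endblk = (i_size - 1) >> blkbits; /* last in-file block: 4 */

        if (iblock <= endblk &&
            iblock + ((len - 1) >> blkbits) > endblk)
                len = (endblk - iblock + 1) << blkbits; /* trim to blocks 3..4 */

        printf("len=%llu\n", (unsigned long long)len);  /* prints 8192 */
        return 0;
}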
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index 38b224372776..5e700b45d32d 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,6 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-ccflags-y := -I $(srctree)/$(src)/..
-
obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o
ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 4de89af96abf..6abaded3ff6b 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -23,15 +23,15 @@
#include <linux/spinlock.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 965f45dbe17b..6051edc33aef 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -23,9 +23,9 @@
#include <linux/spinlock.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -33,7 +33,7 @@
#include "dlmconvert.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
/* NOTE: __dlmconvert_master is the only function in here that
* needs a spinlock held on entry (res->spinlock) and it is the
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index c8af5bc9e980..c13b4856ab87 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -17,9 +17,9 @@
#include <linux/debugfs.h>
#include <linux/export.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -27,7 +27,7 @@
#include "dlmdebug.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static int stringify_lockname(const char *lockname, int locklen, char *buf,
int len);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 9021e72e1f98..8d08e6cb94f7 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -20,9 +20,9 @@
#include <linux/debugfs.h>
#include <linux/sched/signal.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -30,7 +30,7 @@
#include "dlmdebug.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
/*
* ocfs2 node maps are array of long int, which limits to send them freely
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index baff087f3863..83f0760e4fba 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -25,9 +25,9 @@
#include <linux/delay.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -35,7 +35,7 @@
#include "dlmconvert.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static struct kmem_cache *dlm_lock_cache;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 810f841494ef..f3b7c1f3ccbf 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -25,9 +25,9 @@
#include <linux/delay.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
@@ -35,7 +35,7 @@
#include "dlmdebug.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static void dlm_mle_node_down(struct dlm_ctxt *dlm,
struct dlm_master_list_entry *mle,
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index e22d6a115220..fb80d0cef78f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -26,16 +26,16 @@
#include <linux/delay.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_RECOVERY)
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 61c51c268460..fd40c17cd022 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -25,16 +25,16 @@
#include <linux/delay.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#include "dlmdomain.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_THREAD)
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static int dlm_thread(void *data);
static void dlm_flush_asts(struct dlm_ctxt *dlm);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 3883633e82eb..dcb17ca8ae74 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -23,15 +23,15 @@
#include <linux/spinlock.h>
#include <linux/delay.h>
-#include "cluster/heartbeat.h"
-#include "cluster/nodemanager.h"
-#include "cluster/tcp.h"
+#include "../cluster/heartbeat.h"
+#include "../cluster/nodemanager.h"
+#include "../cluster/tcp.h"
#include "dlmapi.h"
#include "dlmcommon.h"
#define MLOG_MASK_PREFIX ML_DLM
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
#define DLM_UNLOCK_FREE_LOCK 0x00000001
#define DLM_UNLOCK_CALL_AST 0x00000002
diff --git a/fs/ocfs2/dlmfs/Makefile b/fs/ocfs2/dlmfs/Makefile
index a9874e441bd4..c7895f65be0e 100644
--- a/fs/ocfs2/dlmfs/Makefile
+++ b/fs/ocfs2/dlmfs/Makefile
@@ -1,6 +1,4 @@
# SPDX-License-Identifier: GPL-2.0-only
-ccflags-y := -I $(srctree)/$(src)/..
-
obj-$(CONFIG_OCFS2_FS) += ocfs2_dlmfs.o
ocfs2_dlmfs-objs := userdlm.o dlmfs.o
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 4f1668c81e1f..1de77f1a600b 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -33,11 +33,11 @@
#include <linux/uaccess.h>
-#include "stackglue.h"
+#include "../stackglue.h"
#include "userdlm.h"
#define MLOG_MASK_PREFIX ML_DLMFS
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static const struct super_operations dlmfs_ops;
@@ -275,7 +275,6 @@ static ssize_t dlmfs_file_write(struct file *filp,
loff_t *ppos)
{
int bytes_left;
- ssize_t writelen;
char *lvb_buf;
struct inode *inode = file_inode(filp);
@@ -285,32 +284,30 @@ static ssize_t dlmfs_file_write(struct file *filp,
if (*ppos >= i_size_read(inode))
return -ENOSPC;
+ /* don't write past the lvb */
+ if (count > i_size_read(inode) - *ppos)
+ count = i_size_read(inode) - *ppos;
+
if (!count)
return 0;
if (!access_ok(buf, count))
return -EFAULT;
- /* don't write past the lvb */
- if ((count + *ppos) > i_size_read(inode))
- writelen = i_size_read(inode) - *ppos;
- else
- writelen = count - *ppos;
-
- lvb_buf = kmalloc(writelen, GFP_NOFS);
+ lvb_buf = kmalloc(count, GFP_NOFS);
if (!lvb_buf)
return -ENOMEM;
- bytes_left = copy_from_user(lvb_buf, buf, writelen);
- writelen -= bytes_left;
- if (writelen)
- user_dlm_write_lvb(inode, lvb_buf, writelen);
+ bytes_left = copy_from_user(lvb_buf, buf, count);
+ count -= bytes_left;
+ if (count)
+ user_dlm_write_lvb(inode, lvb_buf, count);
kfree(lvb_buf);
- *ppos = *ppos + writelen;
- mlog(0, "wrote %zd bytes\n", writelen);
- return writelen;
+ *ppos = *ppos + count;
+ mlog(0, "wrote %zu bytes\n", count);
+ return count;
}
static void dlmfs_init_once(void *foo)
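The rewrite above also quietly fixes a bug: the old else-branch computed `writelen = count - *ppos`, mixing a byte count with a file offset. Clamping `count` against the lock value block size up front removes the separate `writelen` bookkeeping entirely. The clamp-first shape in isolation (a sketch, not the dlmfs API):

#include <stddef.h>

/* Trim a write so it cannot run past a fixed-size buffer. */
static size_t clamp_write(size_t pos, size_t count, size_t size)
{
        if (pos >= size)
                return 0;               /* nothing writable at all */
        if (count > size - pos)
                count = size - pos;     /* keep only what fits */
        return count;
}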
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 525b14ddfba5..3df5be25bfb1 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -21,12 +21,12 @@
#include <linux/types.h>
#include <linux/crc32.h>
-#include "ocfs2_lockingver.h"
-#include "stackglue.h"
+#include "../ocfs2_lockingver.h"
+#include "../stackglue.h"
#include "userdlm.h"
#define MLOG_MASK_PREFIX ML_DLMFS
-#include "cluster/masklog.h"
+#include "../cluster/masklog.h"
static inline struct user_lock_res *user_lksb_to_lock_res(struct ocfs2_dlm_lksb *lksb)
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index b5fc5d3c7525..b5934e995a69 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -668,6 +668,12 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res,
&ocfs2_nfs_sync_lops, osb);
}
+static void ocfs2_nfs_sync_lock_init(struct ocfs2_super *osb)
+{
+ ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ init_rwsem(&osb->nfs_sync_rwlock);
+}
+
void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb)
{
struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres;
@@ -2834,14 +2840,25 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex)
if (ocfs2_is_hard_readonly(osb))
return -EROFS;
+ if (ex)
+ down_write(&osb->nfs_sync_rwlock);
+ else
+ down_read(&osb->nfs_sync_rwlock);
+
if (ocfs2_mount_local(osb))
return 0;
status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE,
0, 0);
- if (status < 0)
+ if (status < 0) {
mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status);
+ if (ex)
+ up_write(&osb->nfs_sync_rwlock);
+ else
+ up_read(&osb->nfs_sync_rwlock);
+ }
+
return status;
}
@@ -2852,6 +2869,10 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex)
if (!ocfs2_mount_local(osb))
ocfs2_cluster_unlock(osb, lockres,
ex ? LKM_EXMODE : LKM_PRMODE);
+ if (ex)
+ up_write(&osb->nfs_sync_rwlock);
+ else
+ up_read(&osb->nfs_sync_rwlock);
}
int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
@@ -3297,7 +3318,7 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
local:
ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
- ocfs2_nfs_sync_lock_res_init(&osb->osb_nfs_sync_lockres, osb);
+ ocfs2_nfs_sync_lock_init(osb);
ocfs2_orphan_scan_lock_res_init(&osb->osb_orphan_scan.os_lockres, osb);
osb->cconn = conn;
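The new nfs_sync_rwlock is taken before the cluster lock and released again on the cluster lock's error path, so a caller ends up holding either both locks or neither. The unwind shape, reduced to a runnable sketch (pthread rwlock standing in for the rwsem, a stub for the cluster lock):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t local_rw = PTHREAD_RWLOCK_INITIALIZER;

/* Stand-in for the cluster-wide lock; assume it can fail. */
static int cluster_lock(int ex) { (void)ex; return 0; }

static int sync_lock(int ex)
{
        int status;

        if (ex)
                pthread_rwlock_wrlock(&local_rw);
        else
                pthread_rwlock_rdlock(&local_rw);

        status = cluster_lock(ex);
        if (status < 0)
                pthread_rwlock_unlock(&local_rw); /* unwind on failure */
        return status;
}

int main(void)
{
        printf("sync_lock(ex=1) -> %d\n", sync_lock(1));
        pthread_rwlock_unlock(&local_rw);
        return 0;
}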
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4435df3e5adb..20c254807613 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2092,54 +2092,88 @@ static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
return 0;
}
-static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
- struct file *file,
- loff_t pos, size_t count,
- int *meta_level)
+static int ocfs2_inode_lock_for_extent_tree(struct inode *inode,
+ struct buffer_head **di_bh,
+ int meta_level,
+ int write_sem,
+ int wait)
{
- int ret;
- struct buffer_head *di_bh = NULL;
- u32 cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
- u32 clusters =
- ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+ int ret = 0;
- ret = ocfs2_inode_lock(inode, &di_bh, 1);
- if (ret) {
- mlog_errno(ret);
+ if (wait)
+ ret = ocfs2_inode_lock(inode, di_bh, meta_level);
+ else
+ ret = ocfs2_try_inode_lock(inode, di_bh, meta_level);
+ if (ret < 0)
goto out;
+
+ if (wait) {
+ if (write_sem)
+ down_write(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ down_read(&OCFS2_I(inode)->ip_alloc_sem);
+ } else {
+ if (write_sem)
+ ret = down_write_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ ret = down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem);
+
+ if (!ret) {
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
}
- *meta_level = 1;
+ return ret;
- ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
- if (ret)
- mlog_errno(ret);
+out_unlock:
+ brelse(*di_bh);
+ *di_bh = NULL;
+ ocfs2_inode_unlock(inode, meta_level);
out:
- brelse(di_bh);
return ret;
}
+static void ocfs2_inode_unlock_for_extent_tree(struct inode *inode,
+ struct buffer_head **di_bh,
+ int meta_level,
+ int write_sem)
+{
+ if (write_sem)
+ up_write(&OCFS2_I(inode)->ip_alloc_sem);
+ else
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+
+ brelse(*di_bh);
+ *di_bh = NULL;
+
+ if (meta_level >= 0)
+ ocfs2_inode_unlock(inode, meta_level);
+}
+
static int ocfs2_prepare_inode_for_write(struct file *file,
loff_t pos, size_t count, int wait)
{
int ret = 0, meta_level = 0, overwrite_io = 0;
+ int write_sem = 0;
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = d_inode(dentry);
struct buffer_head *di_bh = NULL;
loff_t end;
+ u32 cpos;
+ u32 clusters;
/*
* We start with a read level meta lock and only jump to an ex
* if we need to make modifications here.
*/
for(;;) {
- if (wait)
- ret = ocfs2_inode_lock(inode, NULL, meta_level);
- else
- ret = ocfs2_try_inode_lock(inode,
- overwrite_io ? NULL : &di_bh, meta_level);
+ ret = ocfs2_inode_lock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem,
+ wait);
if (ret < 0) {
- meta_level = -1;
if (ret != -EAGAIN)
mlog_errno(ret);
goto out;
@@ -2151,15 +2185,8 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
*/
if (!wait && !overwrite_io) {
overwrite_io = 1;
- if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) {
- ret = -EAGAIN;
- goto out_unlock;
- }
ret = ocfs2_overwrite_io(inode, di_bh, pos, count);
- brelse(di_bh);
- di_bh = NULL;
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
if (ret < 0) {
if (ret != -EAGAIN)
mlog_errno(ret);
@@ -2178,7 +2205,10 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
* set inode->i_size at the end of a write. */
if (should_remove_suid(dentry)) {
if (meta_level == 0) {
- ocfs2_inode_unlock(inode, meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
meta_level = 1;
continue;
}
@@ -2194,18 +2224,32 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
ret = ocfs2_check_range_for_refcount(inode, pos, count);
if (ret == 1) {
- ocfs2_inode_unlock(inode, meta_level);
- meta_level = -1;
-
- ret = ocfs2_prepare_inode_for_refcount(inode,
- file,
- pos,
- count,
- &meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
+ meta_level = 1;
+ write_sem = 1;
+ ret = ocfs2_inode_lock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem,
+ wait);
+ if (ret < 0) {
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
+ goto out;
+ }
+
+ cpos = pos >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+ clusters =
+ ocfs2_clusters_for_bytes(inode->i_sb, pos + count) - cpos;
+ ret = ocfs2_refcount_cow(inode, di_bh, cpos, clusters, UINT_MAX);
}
if (ret < 0) {
- mlog_errno(ret);
+ if (ret != -EAGAIN)
+ mlog_errno(ret);
goto out_unlock;
}
@@ -2216,10 +2260,10 @@ out_unlock:
trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno,
pos, count, wait);
- brelse(di_bh);
-
- if (meta_level >= 0)
- ocfs2_inode_unlock(inode, meta_level);
+ ocfs2_inode_unlock_for_extent_tree(inode,
+ &di_bh,
+ meta_level,
+ write_sem);
out:
return ret;
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 994726ada857..a6c328211dcc 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -290,7 +290,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
if (inode_alloc)
inode_lock(inode_alloc);
- if (o2info_coherent(&fi->ifi_req)) {
+ if (inode_alloc && o2info_coherent(&fi->ifi_req)) {
status = ocfs2_inode_lock(inode_alloc, &bh, 0);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 699a560efbb0..900e4ef686bf 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1066,6 +1066,14 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed)
ocfs2_clear_journal_error(osb->sb, journal->j_journal, osb->slot_num);
+ if (replayed) {
+ jbd2_journal_lock_updates(journal->j_journal);
+ status = jbd2_journal_flush(journal->j_journal);
+ jbd2_journal_unlock_updates(journal->j_journal);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
status = ocfs2_journal_toggle_dirty(osb, 1, replayed);
if (status < 0) {
mlog_errno(status);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index c0fe6ed08ab1..12359e2db942 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -623,9 +623,11 @@ static inline void ocfs2_update_inode_fsync_trans(handle_t *handle,
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- oi->i_sync_tid = handle->h_transaction->t_tid;
- if (datasync)
- oi->i_datasync_tid = handle->h_transaction->t_tid;
+ if (!is_handle_aborted(handle)) {
+ oi->i_sync_tid = handle->h_transaction->t_tid;
+ if (datasync)
+ oi->i_datasync_tid = handle->h_transaction->t_tid;
+ }
}
#endif /* OCFS2_JOURNAL_H */
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index a4647a646f07..b07eb2a30be5 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -324,8 +324,8 @@ struct ocfs2_super
spinlock_t osb_lock;
u32 s_next_generation;
unsigned long osb_flags;
- s16 s_inode_steal_slot;
- s16 s_meta_steal_slot;
+ u16 s_inode_steal_slot;
+ u16 s_meta_steal_slot;
atomic_t s_num_inodes_stolen;
atomic_t s_num_meta_stolen;
@@ -392,6 +392,7 @@ struct ocfs2_super
struct ocfs2_lock_res osb_super_lockres;
struct ocfs2_lock_res osb_rename_lockres;
struct ocfs2_lock_res osb_nfs_sync_lockres;
+ struct rw_semaphore nfs_sync_rwlock;
struct ocfs2_lock_res osb_trim_fs_lockres;
struct mutex obs_trim_fs_mutex;
struct ocfs2_dlm_debug *osb_dlm_debug;
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 0db4a7ec58a2..dcef83c8796d 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -290,7 +290,7 @@
#define OCFS2_MAX_SLOTS 255
/* Slot map indicator for an empty slot */
-#define OCFS2_INVALID_SLOT -1
+#define OCFS2_INVALID_SLOT ((u16)-1)
#define OCFS2_VOL_UUID_LEN 16
#define OCFS2_MAX_VOL_LABEL_LEN 64
@@ -326,8 +326,8 @@ struct ocfs2_system_inode_info {
enum {
BAD_BLOCK_SYSTEM_INODE = 0,
GLOBAL_INODE_ALLOC_SYSTEM_INODE,
+#define OCFS2_FIRST_ONLINE_SYSTEM_INODE GLOBAL_INODE_ALLOC_SYSTEM_INODE
SLOT_MAP_SYSTEM_INODE,
-#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
HEARTBEAT_SYSTEM_INODE,
GLOBAL_BITMAP_SYSTEM_INODE,
USER_QUOTA_SYSTEM_INODE,
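Redefining OCFS2_INVALID_SLOT as `((u16)-1)` matters now that the steal-slot fields are u16: under the usual integer promotions a u16 holding 0xffff compares unequal to a plain int -1, so the old definition could never match. A short demonstration:

#include <stdio.h>
#include <stdint.h>

#define INVALID_SLOT_OLD (-1)
#define INVALID_SLOT_NEW ((uint16_t)-1)         /* 0xffff */

int main(void)
{
        uint16_t slot = (uint16_t)-1;           /* "empty" slot */

        /* slot promotes to int 65535; plain -1 stays -1: no match. */
        printf("old: %d\n", slot == INVALID_SLOT_OLD);
        /* (uint16_t)-1 also promotes to 65535: match. */
        printf("new: %d\n", slot == INVALID_SLOT_NEW);
        return 0;
}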
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index 7a922190a8c7..eda83487c9ec 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -728,7 +728,7 @@ static int ocfs2_release_dquot(struct dquot *dquot)
mutex_lock(&dquot->dq_lock);
/* Check whether we are not racing with some other dqget() */
- if (atomic_read(&dquot->dq_count) > 1)
+ if (dquot_is_busy(dquot))
goto out;
/* Running from downconvert thread? Postpone quota processing to wq */
if (current == osb->dc_task) {
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 69c21a3843af..5e0eaea47405 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -879,9 +879,9 @@ static void __ocfs2_set_steal_slot(struct ocfs2_super *osb, int slot, int type)
{
spin_lock(&osb->osb_lock);
if (type == INODE_ALLOC_SYSTEM_INODE)
- osb->s_inode_steal_slot = slot;
+ osb->s_inode_steal_slot = (u16)slot;
else if (type == EXTENT_ALLOC_SYSTEM_INODE)
- osb->s_meta_steal_slot = slot;
+ osb->s_meta_steal_slot = (u16)slot;
spin_unlock(&osb->osb_lock);
}
@@ -2827,9 +2827,12 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
goto bail;
}
- inode_alloc_inode =
- ocfs2_get_system_file_inode(osb, INODE_ALLOC_SYSTEM_INODE,
- suballoc_slot);
+ if (suballoc_slot == (u16)OCFS2_INVALID_SLOT)
+ inode_alloc_inode = ocfs2_get_system_file_inode(osb,
+ GLOBAL_INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
+ else
+ inode_alloc_inode = ocfs2_get_system_file_inode(osb,
+ INODE_ALLOC_SYSTEM_INODE, suballoc_slot);
if (!inode_alloc_inode) {
/* the error code could be inaccurate, but we are not able to
* get the correct one. */
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index a201f9780b35..e9cb264c5aae 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -78,7 +78,7 @@ struct mount_options
unsigned long commit_interval;
unsigned long mount_opt;
unsigned int atime_quantum;
- signed short slot;
+ unsigned short slot;
int localalloc_opt;
unsigned int resv_level;
int dir_resv_level;
@@ -1354,7 +1354,7 @@ static int ocfs2_parse_options(struct super_block *sb,
goto bail;
}
if (option)
- mopt->slot = (s16)option;
+ mopt->slot = (u16)option;
break;
case Opt_commit:
if (match_int(&args[0], &option)) {
diff --git a/fs/open.c b/fs/open.c
index 78809163ba03..2941aa2c233e 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -853,9 +853,6 @@ cleanup_file:
* the return value of d_splice_alias(), then the caller needs to perform dput()
* on it after finish_open().
*
- * On successful return @file is a fully instantiated open file. After this, if
- * an error occurs in ->atomic_open(), it needs to clean up with fput().
- *
* Returns zero on success or -errno if the open failed.
*/
int finish_open(struct file *file, struct dentry *dentry,
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index a35c17017210..59ec1cb0e158 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -313,23 +313,8 @@ static ssize_t orangefs_file_read_iter(struct kiocb *iocb,
struct iov_iter *iter)
{
int ret;
- struct orangefs_read_options *ro;
-
orangefs_stats.reads++;
- /*
- * Remember how they set "count" in read(2) or pread(2) or whatever -
- * users can use count as a knob to control orangefs io size and later
- * we can try to help them fill as many pages as possible in readpage.
- */
- if (!iocb->ki_filp->private_data) {
- iocb->ki_filp->private_data = kmalloc(sizeof *ro, GFP_KERNEL);
- if (!iocb->ki_filp->private_data)
- return(ENOMEM);
- ro = iocb->ki_filp->private_data;
- ro->blksiz = iter->count;
- }
-
down_read(&file_inode(iocb->ki_filp)->i_rwsem);
ret = orangefs_revalidate_mapping(file_inode(iocb->ki_filp));
if (ret)
@@ -598,12 +583,6 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl)
return rc;
}
-static int orangefs_file_open(struct inode * inode, struct file *file)
-{
- file->private_data = NULL;
- return generic_file_open(inode, file);
-}
-
static int orangefs_flush(struct file *file, fl_owner_t id)
{
/*
@@ -617,9 +596,6 @@ static int orangefs_flush(struct file *file, fl_owner_t id)
struct inode *inode = file->f_mapping->host;
int r;
- kfree(file->private_data);
- file->private_data = NULL;
-
if (inode->i_state & I_DIRTY_TIME) {
spin_lock(&inode->i_lock);
inode->i_state &= ~I_DIRTY_TIME;
@@ -642,7 +618,7 @@ const struct file_operations orangefs_file_operations = {
.lock = orangefs_lock,
.unlocked_ioctl = orangefs_ioctl,
.mmap = orangefs_file_mmap,
- .open = orangefs_file_open,
+ .open = generic_file_open,
.flush = orangefs_flush,
.release = orangefs_file_release,
.fsync = orangefs_fsync,
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 0c337d8bdaab..c9b3e5ec8221 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -259,46 +259,19 @@ static int orangefs_readpage(struct file *file, struct page *page)
pgoff_t index; /* which page */
struct page *next_page;
char *kaddr;
- struct orangefs_read_options *ro = file->private_data;
loff_t read_size;
- loff_t roundedup;
int buffer_index = -1; /* orangefs shared memory slot */
int slot_index; /* index into slot */
int remaining;
/*
- * If they set some miniscule size for "count" in read(2)
- * (for example) then let's try to read a page, or the whole file
- * if it is smaller than a page. Once "count" goes over a page
- * then lets round up to the highest page size multiple that is
- * less than or equal to "count" and do that much orangefs IO and
- * try to fill as many pages as we can from it.
- *
- * "count" should be represented in ro->blksiz.
- *
- * inode->i_size = file size.
+	 * Get up to this many bytes from Orangefs at a time and try
+	 * to fill them into the page cache at once. Tests with dd made
+	 * this seem like a reasonable static number; if there were
+	 * interest, this number could perhaps be made settable through
+	 * sysfs...
*/
- if (ro) {
- if (ro->blksiz < PAGE_SIZE) {
- if (inode->i_size < PAGE_SIZE)
- read_size = inode->i_size;
- else
- read_size = PAGE_SIZE;
- } else {
- roundedup = ((PAGE_SIZE - 1) & ro->blksiz) ?
- ((ro->blksiz + PAGE_SIZE) & ~(PAGE_SIZE -1)) :
- ro->blksiz;
- if (roundedup > inode->i_size)
- read_size = inode->i_size;
- else
- read_size = roundedup;
-
- }
- } else {
- read_size = PAGE_SIZE;
- }
- if (!read_size)
- read_size = PAGE_SIZE;
+ read_size = 524288;
if (PageDirty(page))
orangefs_launder_page(page);
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 87b1a6fce628..610e1dc112ea 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -305,6 +305,7 @@ static void *help_start(struct seq_file *m, loff_t *pos)
static void *help_next(struct seq_file *m, void *v, loff_t *pos)
{
+ (*pos)++;
gossip_debug(GOSSIP_DEBUGFS_DEBUG, "help_next: start\n");
return NULL;
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 572dd29fbd54..884cdf134f3e 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -239,10 +239,6 @@ struct orangefs_write_range {
kgid_t gid;
};
-struct orangefs_read_options {
- ssize_t blksiz;
-};
-
extern struct orangefs_stats orangefs_stats;
/*
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index b801c6353100..ec5eca5a96f4 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -40,7 +40,7 @@ int ovl_copy_xattr(struct dentry *old, struct dentry *new)
{
ssize_t list_size, size, value_size = 0;
char *buf, *name, *value = NULL;
- int uninitialized_var(error);
+ int error = 0;
size_t slen;
if (!(old->d_inode->i_opflags & IOP_XATTR) ||
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 702aa63f6774..29abdb1d3b5c 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -1170,7 +1170,7 @@ static int ovl_rename(struct inode *olddir, struct dentry *old,
if (newdentry == trap)
goto out_dput;
- if (WARN_ON(olddentry->d_inode == newdentry->d_inode))
+ if (olddentry->d_inode == newdentry->d_inode)
goto out_dput;
err = 0;
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 73c9775215b3..11dd8177770d 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -482,7 +482,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
if (IS_ERR_OR_NULL(this))
return this;
- if (WARN_ON(ovl_dentry_real_at(this, layer->idx) != real)) {
+ if (ovl_dentry_real_at(this, layer->idx) != real) {
dput(this);
this = ERR_PTR(-EIO);
}
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index e235a635d9ec..7a08a576f7b2 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -21,13 +21,16 @@ static char ovl_whatisit(struct inode *inode, struct inode *realinode)
return 'm';
}
+/* No atime modification nor notify on underlying */
+#define OVL_OPEN_FLAGS (O_NOATIME | FMODE_NONOTIFY)
+
static struct file *ovl_open_realfile(const struct file *file,
struct inode *realinode)
{
struct inode *inode = file_inode(file);
struct file *realfile;
const struct cred *old_cred;
- int flags = file->f_flags | O_NOATIME | FMODE_NONOTIFY;
+ int flags = file->f_flags | OVL_OPEN_FLAGS;
old_cred = ovl_override_creds(inode->i_sb);
realfile = open_with_fake_path(&file->f_path, flags, realinode,
@@ -48,8 +51,7 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
struct inode *inode = file_inode(file);
int err;
- /* No atime modificaton on underlying */
- flags |= O_NOATIME | FMODE_NONOTIFY;
+ flags |= OVL_OPEN_FLAGS;
/* If some flag changed that cannot be changed then something's amiss */
if (WARN_ON((file->f_flags ^ flags) & ~OVL_SETFL_MASK))
@@ -102,7 +104,7 @@ static int ovl_real_fdget_meta(const struct file *file, struct fd *real,
}
/* Did the flags change since open? */
- if (unlikely((file->f_flags ^ real->file->f_flags) & ~O_NOATIME))
+ if (unlikely((file->f_flags ^ real->file->f_flags) & ~OVL_OPEN_FLAGS))
return ovl_change_flags(real->file, file->f_flags);
return 0;
@@ -146,7 +148,7 @@ static loff_t ovl_llseek(struct file *file, loff_t offset, int whence)
struct inode *inode = file_inode(file);
struct fd real;
const struct cred *old_cred;
- ssize_t ret;
+ loff_t ret;
/*
* The two special cases below do not need to involve real fs,
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index bc14781886bf..bb980721502d 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -200,8 +200,14 @@ int ovl_getattr(const struct path *path, struct kstat *stat,
if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
(!ovl_verify_lower(dentry->d_sb) &&
(is_dir || lowerstat.nlink == 1))) {
- stat->ino = lowerstat.ino;
lower_layer = ovl_layer_lower(dentry);
+ /*
+ * Cannot use origin st_dev;st_ino because
+ * origin inode content may differ from overlay
+ * inode content.
+ */
+ if (samefs || lower_layer->fsid)
+ stat->ino = lowerstat.ino;
}
/*
@@ -875,7 +881,7 @@ struct inode *ovl_get_inode(struct super_block *sb,
struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
oip->index);
- int fsid = bylower ? oip->lowerpath->layer->fsid : 0;
+ int fsid = bylower ? lowerpath->layer->fsid : 0;
bool is_dir, metacopy = false;
unsigned long ino = 0;
int err = oip->newinode ? -EEXIST : -ENOMEM;
@@ -925,6 +931,8 @@ struct inode *ovl_get_inode(struct super_block *sb,
err = -ENOMEM;
goto out_err;
}
+ ino = realinode->i_ino;
+ fsid = lowerpath->layer->fsid;
}
ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev, ino, fsid);
ovl_inode_init(inode, upperdentry, lowerdentry, oip->lowerdata);
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index e9717c2f7d45..f47c591402d7 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -325,6 +325,14 @@ int ovl_check_origin_fh(struct ovl_fs *ofs, struct ovl_fh *fh, bool connected,
int i;
for (i = 0; i < ofs->numlower; i++) {
+ /*
+		 * If the lower fs uuid is not unique among the lower
+		 * filesystems, we cannot match fh->uuid to a layer.
+ */
+ if (ofs->lower_layers[i].fsid &&
+ ofs->lower_layers[i].fs->bad_uuid)
+ continue;
+
origin = ovl_decode_real_fh(fh, ofs->lower_layers[i].mnt,
connected);
if (origin)
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index a8279280e88d..28348c44ea5b 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -22,6 +22,8 @@ struct ovl_config {
struct ovl_sb {
struct super_block *sb;
dev_t pseudo_dev;
+ /* Unusable (conflicting) uuid */
+ bool bad_uuid;
};
struct ovl_layer {
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index 47a91c9733a5..7255e6a5838f 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -504,7 +504,13 @@ get:
if (err)
goto fail;
- WARN_ON_ONCE(dir->d_sb->s_dev != stat.dev);
+ /*
+ * Directory inode is always on overlay st_dev.
+ * Non-dir with ovl_same_dev() could be on pseudo st_dev in case
+ * of xino bits overflow.
+ */
+ WARN_ON_ONCE(S_ISDIR(stat.mode) &&
+ dir->d_sb->s_dev != stat.dev);
ino = stat.ino;
} else if (xinobits && !OVL_TYPE_UPPER(type)) {
ino = ovl_remap_lower_ino(ino, xinobits,
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index afbcb116a7f1..d6b724beb304 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -1255,17 +1255,33 @@ static bool ovl_lower_uuid_ok(struct ovl_fs *ofs, const uuid_t *uuid)
{
unsigned int i;
- if (!ofs->config.nfs_export && !(ofs->config.index && ofs->upper_mnt))
+ if (!ofs->config.nfs_export && !ofs->upper_mnt)
return true;
+ /*
+	 * We allow using a single lower with null uuid for index and
+	 * nfs_export, for example to support those features with a single
+	 * lower squashfs. To avoid regressions in setups of overlay with
+	 * re-formatted lower squashfs, do not allow decoding origin with
+	 * lower null uuid unless the user opted in to one of the new
+	 * features that require following the lower inode of a non-dir upper.
+ */
+ if (!ofs->config.index && !ofs->config.metacopy && !ofs->config.xino &&
+ uuid_is_null(uuid))
+ return false;
+
for (i = 0; i < ofs->numlowerfs; i++) {
/*
* We use uuid to associate an overlay lower file handle with a
* lower layer, so we can accept lower fs with null uuid as long
* as all lower layers with null uuid are on the same fs.
+		 * If we detect multiple lower fs with the same uuid, we
+		 * disable lower file handle decoding on all of them.
*/
- if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid))
+ if (uuid_equal(&ofs->lower_fs[i].sb->s_uuid, uuid)) {
+ ofs->lower_fs[i].bad_uuid = true;
return false;
+ }
}
return true;
}
@@ -1277,6 +1293,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
unsigned int i;
dev_t dev;
int err;
+ bool bad_uuid = false;
/* fsid 0 is reserved for upper fs even with non upper overlay */
if (ofs->upper_mnt && ofs->upper_mnt->mnt_sb == sb)
@@ -1288,11 +1305,15 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
}
if (!ovl_lower_uuid_ok(ofs, &sb->s_uuid)) {
- ofs->config.index = false;
- ofs->config.nfs_export = false;
- pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n",
- uuid_is_null(&sb->s_uuid) ? "null" : "conflicting",
- path->dentry);
+ bad_uuid = true;
+ if (ofs->config.index || ofs->config.nfs_export) {
+ ofs->config.index = false;
+ ofs->config.nfs_export = false;
+ pr_warn("overlayfs: %s uuid detected in lower fs '%pd2', falling back to index=off,nfs_export=off.\n",
+ uuid_is_null(&sb->s_uuid) ? "null" :
+ "conflicting",
+ path->dentry);
+ }
}
err = get_anon_bdev(&dev);
@@ -1303,6 +1324,7 @@ static int ovl_get_fsid(struct ovl_fs *ofs, const struct path *path)
ofs->lower_fs[ofs->numlowerfs].sb = sb;
ofs->lower_fs[ofs->numlowerfs].pseudo_dev = dev;
+ ofs->lower_fs[ofs->numlowerfs].bad_uuid = bad_uuid;
ofs->numlowerfs++;
return ofs->numlowerfs;
@@ -1334,14 +1356,23 @@ static int ovl_get_lower_layers(struct super_block *sb, struct ovl_fs *ofs,
if (err < 0)
goto out;
+ /*
+		 * Check if lower root conflicts with this overlay's layers before
+ * checking if it is in-use as upperdir/workdir of "another"
+ * mount, because we do not bother to check in ovl_is_inuse() if
+ * the upperdir/workdir is in fact in-use by our
+ * upperdir/workdir.
+ */
err = ovl_setup_trap(sb, stack[i].dentry, &trap, "lowerdir");
if (err)
goto out;
if (ovl_is_inuse(stack[i].dentry)) {
err = ovl_report_in_use(ofs, "lowerdir");
- if (err)
+ if (err) {
+ iput(trap);
goto out;
+ }
}
mnt = clone_private_mount(&stack[i]);
diff --git a/fs/pnode.c b/fs/pnode.c
index 49f6d7ff2139..1106137c747a 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -261,14 +261,13 @@ static int propagate_one(struct mount *m)
child = copy_tree(last_source, last_source->mnt.mnt_root, type);
if (IS_ERR(child))
return PTR_ERR(child);
+ read_seqlock_excl(&mount_lock);
mnt_set_mountpoint(m, mp, child);
+ if (m->mnt_master != dest_master)
+ SET_MNT_MARK(m->mnt_master);
+ read_sequnlock_excl(&mount_lock);
last_dest = m;
last_source = child;
- if (m->mnt_master != dest_master) {
- read_seqlock_excl(&mount_lock);
- SET_MNT_MARK(m->mnt_master);
- read_sequnlock_excl(&mount_lock);
- }
hlist_add_head(&child->mnt_hash, list);
return count_mounts(m->mnt_ns, child);
}
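
The propagate_one() hunk above changes the locking shape: the old code set the mountpoint unlocked and then conditionally took mount_lock just for the mark, while the new code covers both updates with a single critical section. A minimal userspace sketch of that lock-coalescing pattern, with a pthread mutex standing in for the kernel's seqlock and two flags standing in for mnt_set_mountpoint()/SET_MNT_MARK() (all names here are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t mount_lock = PTHREAD_MUTEX_INITIALIZER;
static int mountpoint_set, master_marked;

/* One critical section covers both updates; the old shape did the
 * first update unlocked and locked conditionally for the second. */
static void attach_and_mark(int need_mark)
{
	pthread_mutex_lock(&mount_lock);
	mountpoint_set = 1;		/* mnt_set_mountpoint() stand-in */
	if (need_mark)
		master_marked = 1;	/* SET_MNT_MARK() stand-in */
	pthread_mutex_unlock(&mount_lock);
}

int main(void)
{
	attach_and_mark(1);
	printf("%d %d\n", mountpoint_set, master_marked);
	return 0;
}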
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 5f8d215b3fd0..df6a5c0656df 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -441,7 +441,7 @@ const struct inode_operations proc_link_inode_operations = {
struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
{
- struct inode *inode = new_inode_pseudo(sb);
+ struct inode *inode = new_inode(sb);
if (inode) {
inode->i_ino = de->low_ino;
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 57c0a1047250..32af065397f8 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -43,7 +43,7 @@ int proc_setup_self(struct super_block *s)
inode_lock(root_inode);
self = d_alloc_name(s->s_root, "self");
if (self) {
- struct inode *inode = new_inode_pseudo(s);
+ struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = self_inum;
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index f61ae53533f5..fac9e50b33a6 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -43,7 +43,7 @@ int proc_setup_thread_self(struct super_block *s)
inode_lock(root_inode);
thread_self = d_alloc_name(s->s_root, "thread-self");
if (thread_self) {
- struct inode *inode = new_inode_pseudo(s);
+ struct inode *inode = new_inode(s);
if (inode) {
inode->i_ino = thread_self_inum;
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index 7bb96fdd38ad..53e95afab2be 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -262,7 +262,8 @@ static int vmcoredd_mmap_dumps(struct vm_area_struct *vma, unsigned long dst,
if (start < offset + dump->size) {
tsz = min(offset + (u64)dump->size - start, (u64)size);
buf = dump->buf + start - offset;
- if (remap_vmalloc_range_partial(vma, dst, buf, tsz)) {
+ if (remap_vmalloc_range_partial(vma, dst, buf, 0,
+ tsz)) {
ret = -EFAULT;
goto out_unlock;
}
@@ -620,7 +621,7 @@ static int mmap_vmcore(struct file *file, struct vm_area_struct *vma)
tsz = min(elfcorebuf_sz + elfnotes_sz - (size_t)start, size);
kaddr = elfnotes_buf + start - elfcorebuf_sz - vmcoredd_orig_sz;
if (remap_vmalloc_range_partial(vma, vma->vm_start + len,
- kaddr, tsz))
+ kaddr, 0, tsz))
goto fail;
size -= tsz;
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 7fbe8f058220..d99b5d39aa90 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -87,11 +87,11 @@ static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos)
struct pstore_private *ps = s->private;
struct pstore_ftrace_seq_data *data = v;
+ (*pos)++;
data->off += REC_SIZE;
if (data->off + REC_SIZE > ps->total_size)
return NULL;
- (*pos)++;
return data;
}
@@ -101,6 +101,9 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v)
struct pstore_ftrace_seq_data *data = v;
struct pstore_ftrace_record *rec;
+ if (!data)
+ return 0;
+
rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off);
seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %ps <- %pS\n",
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 3d7024662d29..74a60bae2b23 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -275,6 +275,9 @@ static int pstore_compress(const void *in, void *out,
{
int ret;
+ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESSION))
+ return -EINVAL;
+
ret = crypto_comp_compress(tfm, in, inlen, out, &outlen);
if (ret) {
pr_err("crypto_comp_compress failed, ret = %d!\n", ret);
@@ -661,7 +664,7 @@ static void decompress_record(struct pstore_record *record)
int unzipped_len;
char *unzipped, *workspace;
- if (!record->compressed)
+ if (!IS_ENABLED(CONFIG_PSTORE_COMPRESSION) || !record->compressed)
return;
/* Only PSTORE_TYPE_DMESG support compression. */
@@ -823,9 +826,9 @@ static int __init pstore_init(void)
ret = pstore_init_fs();
if (ret)
- return ret;
+ free_buf_for_compression();
- return 0;
+ return ret;
}
late_initcall(pstore_init);
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index db9f67d34af3..2c549b5dd4aa 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -407,6 +407,17 @@ static int notrace ramoops_pstore_write(struct pstore_record *record)
prz = cxt->dprzs[cxt->dump_write_cnt];
+ /*
+ * Since this is a new crash dump, we need to reset the buffer in
+ * case it still has an old dump present. Without this, the new dump
+ * will get appended, which would seriously confuse anything trying
+ * to check dump file contents. Specifically, ramoops_read_kmsg_hdr()
+ * expects to find a dump header in the beginning of buffer data, so
+	 * we must reset the buffer values, in order to ensure that the
+ * header will be written to the beginning of the buffer.
+ */
+ persistent_ram_zap(prz);
+
/* Build header and append record contents. */
hlen = ramoops_write_kmsg_hdr(prz, record);
if (!hlen)
@@ -572,6 +583,7 @@ static int ramoops_init_przs(const char *name,
prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig,
&cxt->ecc_info,
cxt->memtype, flags, label);
+ kfree(label);
if (IS_ERR(prz_ar[i])) {
err = PTR_ERR(prz_ar[i]);
dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n",
@@ -617,6 +629,7 @@ static int ramoops_init_prz(const char *name,
label = kasprintf(GFP_KERNEL, "ramoops:%s", name);
*prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info,
cxt->memtype, PRZ_FLAG_ZAP_OLD, label);
+ kfree(label);
if (IS_ERR(*prz)) {
int err = PTR_ERR(*prz);
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 8823f65888f0..1f4d8c06f9be 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -574,7 +574,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
/* Initialize general buffer state. */
raw_spin_lock_init(&prz->buffer_lock);
prz->flags = flags;
- prz->label = label;
+ prz->label = kstrdup(label, GFP_KERNEL);
ret = persistent_ram_buffer_map(start, size, prz, memtype);
if (ret)
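
With persistent_ram_new() now duplicating the label via kstrdup(), ownership of the kasprintf() buffer stays with the caller, which is why the ram.c hunks above add an unconditional kfree(label) after each call. A small userspace sketch of this copy-on-store rule (zone_new() and struct zone are illustrative stand-ins; strdup() plays the role of kstrdup()):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct zone { char *label; };

/* The constructor keeps its own copy of the label, so the caller may
 * free its buffer right after the call, success or failure. */
static struct zone *zone_new(const char *label)
{
	struct zone *z = malloc(sizeof(*z));

	if (!z)
		return NULL;
	z->label = label ? strdup(label) : NULL;
	return z;
}

int main(void)
{
	char *label = malloc(32);

	if (!label)
		return 1;
	snprintf(label, 32, "ramoops:%s", "dmesg");

	struct zone *z = zone_new(label);

	free(label);	/* safe: the zone owns an independent copy */
	if (z) {
		printf("%s\n", z->label);
		free(z->label);
		free(z);
	}
	return 0;
}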
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 58f15a083dd1..d7e54bc499f1 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -492,7 +492,7 @@ int dquot_release(struct dquot *dquot)
mutex_lock(&dquot->dq_lock);
/* Check whether we are not racing with some other dqget() */
- if (atomic_read(&dquot->dq_count) > 1)
+ if (dquot_is_busy(dquot))
goto out_dqlock;
if (dqopt->ops[dquot->dq_id.type]->release_dqblk) {
ret = dqopt->ops[dquot->dq_id.type]->release_dqblk(dquot);
@@ -618,7 +618,7 @@ EXPORT_SYMBOL(dquot_scan_active);
/* Write all dquot structures to quota files */
int dquot_writeback_dquots(struct super_block *sb, int type)
{
- struct list_head *dirty;
+ struct list_head dirty;
struct dquot *dquot;
struct quota_info *dqopt = sb_dqopt(sb);
int cnt;
@@ -632,9 +632,10 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
if (!sb_has_quota_active(sb, cnt))
continue;
spin_lock(&dq_list_lock);
- dirty = &dqopt->info[cnt].dqi_dirty_list;
- while (!list_empty(dirty)) {
- dquot = list_first_entry(dirty, struct dquot,
+ /* Move list away to avoid livelock. */
+ list_replace_init(&dqopt->info[cnt].dqi_dirty_list, &dirty);
+ while (!list_empty(&dirty)) {
+ dquot = list_first_entry(&dirty, struct dquot,
dq_dirty);
WARN_ON(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags));
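
The dquot_writeback_dquots() hunk drains a private copy of the dirty list rather than the live head, so entries re-dirtied concurrently land on the (now empty) original head and cannot keep the loop spinning. A self-contained userspace sketch of the list_replace_init() pattern with a minimal circular list (the types and helpers below are simplified stand-ins for <linux/list.h>):

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };
struct item { struct list_head node; int id; };

static void init_head(struct list_head *h) { h->next = h->prev = h; }
static int list_empty(struct list_head *h) { return h->next == h; }

static void add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

static void del(struct list_head *n)
{
	n->prev->next = n->next;
	n->next->prev = n->prev;
	init_head(n);
}

/* Analog of list_replace_init(): steal every entry onto a private
 * head in O(1), so concurrent re-dirtying onto the original head
 * cannot extend the drain loop - the livelock the patch avoids. */
static void replace_init(struct list_head *old, struct list_head *new)
{
	if (list_empty(old)) {
		init_head(new);
		return;
	}
	*new = *old;
	new->next->prev = new;
	new->prev->next = new;
	init_head(old);
}

int main(void)
{
	struct list_head head, dirty;
	struct item a = { .id = 1 }, b = { .id = 2 };

	init_head(&head);
	add_tail(&a.node, &head);
	add_tail(&b.node, &head);

	replace_init(&head, &dirty);	/* move list away */
	while (!list_empty(&dirty)) {
		struct item *it = (struct item *)dirty.next;

		del(&it->node);
		printf("writing dquot %d\n", it->id);
		/* an updater re-dirtying onto &head here is picked up
		 * by the next writeback pass, not this one */
	}
	return 0;
}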
@@ -980,6 +981,7 @@ static int add_dquot_ref(struct super_block *sb, int type)
* later.
*/
old_inode = inode;
+ cond_resched();
spin_lock(&sb->s_inode_list_lock);
}
spin_unlock(&sb->s_inode_list_lock);
@@ -2855,68 +2857,73 @@ EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
static int do_proc_dqstats(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- unsigned int type = (int *)table->data - dqstats.stat;
+ unsigned int type = (unsigned long *)table->data - dqstats.stat;
+ s64 value = percpu_counter_sum(&dqstats.counter[type]);
+
+ /* Filter negative values for non-monotonic counters */
+ if (value < 0 && (type == DQST_ALLOC_DQUOTS ||
+ type == DQST_FREE_DQUOTS))
+ value = 0;
/* Update global table */
- dqstats.stat[type] =
- percpu_counter_sum_positive(&dqstats.counter[type]);
- return proc_dointvec(table, write, buffer, lenp, ppos);
+ dqstats.stat[type] = value;
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
}
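
Because percpu counters keep per-CPU deltas, summing them mid-update can transiently yield a negative total even for quantities that are never logically negative; the hunk above clamps only the two genuinely non-monotonic counters. A toy illustration of the effect (the delta values are made up):

#include <stdio.h>

int main(void)
{
	/* Per-CPU deltas of one counter; a reader summing them while
	 * an increment on CPU 0 and its matching decrement on CPU 1
	 * are in flight can observe a negative transient total. */
	long cpu_delta[4] = { 5, -7, 1, 0 };
	long sum = 0;
	int i;

	for (i = 0; i < 4; i++)
		sum += cpu_delta[i];

	if (sum < 0)	/* clamp, as for DQST_ALLOC/FREE_DQUOTS */
		sum = 0;

	printf("reported: %ld\n", sum);
	return 0;
}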
static struct ctl_table fs_dqstats_table[] = {
{
.procname = "lookups",
.data = &dqstats.stat[DQST_LOOKUPS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "drops",
.data = &dqstats.stat[DQST_DROPS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "reads",
.data = &dqstats.stat[DQST_READS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "writes",
.data = &dqstats.stat[DQST_WRITES],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "cache_hits",
.data = &dqstats.stat[DQST_CACHE_HITS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "allocated_dquots",
.data = &dqstats.stat[DQST_ALLOC_DQUOTS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "free_dquots",
.data = &dqstats.stat[DQST_FREE_DQUOTS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
{
.procname = "syncs",
.data = &dqstats.stat[DQST_SYNCS],
- .maxlen = sizeof(int),
+ .maxlen = sizeof(unsigned long),
.mode = 0444,
.proc_handler = do_proc_dqstats,
},
diff --git a/fs/read_write.c b/fs/read_write.c
index 165befc4fd82..95b3f902fd19 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1768,10 +1768,9 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
* else. Assume that the offsets have already been checked for block
* alignment.
*
- * For deduplication we always scale down to the previous block because we
- * can't meaningfully compare post-EOF contents.
- *
- * For clone we only link a partial EOF block above the destination file's EOF.
+ * For clone we only link a partial EOF block above or at the destination file's
+ * EOF. For deduplication we accept a partial EOF block only if it ends at the
+ * destination file's EOF (can not link it into the middle of a file).
*
* Shorten the request if possible.
*/
@@ -1787,8 +1786,7 @@ static int generic_remap_check_len(struct inode *inode_in,
if ((*len & blkmask) == 0)
return 0;
- if ((remap_flags & REMAP_FILE_DEDUP) ||
- pos_out + *len < i_size_read(inode_out))
+ if (pos_out + *len < i_size_read(inode_out))
new_len &= ~blkmask;
if (new_len == *len)
diff --git a/fs/readdir.c b/fs/readdir.c
index d26d5ea4de7b..de2eceffdee8 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -102,10 +102,14 @@ EXPORT_SYMBOL(iterate_dir);
* filename length, and the above "soft error" worry means
* that it's probably better left alone until we have that
* issue clarified.
+ *
+ * Note the PATH_MAX check - it's arbitrary but the real
+ * kernel limit on a possible path component, not NAME_MAX,
+ * which is the technical standard limit.
*/
static int verify_dirent_name(const char *name, int len)
{
- if (!len)
+ if (len <= 0 || len >= PATH_MAX)
return -EIO;
if (memchr(name, '/', len))
return -EIO;
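
For reference, a userspace restatement of the tightened check is below; this is a sketch, with -5 standing in for -EIO:

#include <limits.h>
#include <stdio.h>
#include <string.h>

/* Reject empty and negative lengths, names at or above PATH_MAX
 * (a deliberately generous bound - NAME_MAX is the standard
 * per-component limit), and any embedded '/'. */
static int verify_dirent_name(const char *name, int len)
{
	if (len <= 0 || len >= PATH_MAX)
		return -5;
	if (memchr(name, '/', len))
		return -5;
	return 0;
}

int main(void)
{
	printf("%d\n", verify_dirent_name("ok", 2));	/* 0 */
	printf("%d\n", verify_dirent_name("a/b", 3));	/* -5 */
	printf("%d\n", verify_dirent_name("", 0));	/* -5 */
	return 0;
}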
@@ -206,7 +210,7 @@ struct linux_dirent {
struct getdents_callback {
struct dir_context ctx;
struct linux_dirent __user * current_dir;
- struct linux_dirent __user * previous;
+ int prev_reclen;
int count;
int error;
};
@@ -214,12 +218,13 @@ struct getdents_callback {
static int filldir(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
- struct linux_dirent __user * dirent;
+ struct linux_dirent __user *dirent, *prev;
struct getdents_callback *buf =
container_of(ctx, struct getdents_callback, ctx);
unsigned long d_ino;
int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2,
sizeof(long));
+ int prev_reclen;
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
@@ -232,28 +237,24 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen,
buf->error = -EOVERFLOW;
return -EOVERFLOW;
}
- dirent = buf->previous;
- if (dirent && signal_pending(current))
+ prev_reclen = buf->prev_reclen;
+ if (prev_reclen && signal_pending(current))
return -EINTR;
-
- /*
- * Note! This range-checks 'previous' (which may be NULL).
- * The real range was checked in getdents
- */
- if (!user_access_begin(dirent, sizeof(*dirent)))
- goto efault;
- if (dirent)
- unsafe_put_user(offset, &dirent->d_off, efault_end);
dirent = buf->current_dir;
+ prev = (void __user *) dirent - prev_reclen;
+ if (!user_access_begin(prev, reclen + prev_reclen))
+ goto efault;
+
+ /* This might be 'dirent->d_off', but if so it will get overwritten */
+ unsafe_put_user(offset, &prev->d_off, efault_end);
unsafe_put_user(d_ino, &dirent->d_ino, efault_end);
unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
unsafe_put_user(d_type, (char __user *) dirent + reclen - 1, efault_end);
unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
user_access_end();
- buf->previous = dirent;
- dirent = (void __user *)dirent + reclen;
- buf->current_dir = dirent;
+ buf->current_dir = (void __user *)dirent + reclen;
+ buf->prev_reclen = reclen;
buf->count -= reclen;
return 0;
efault_end:
@@ -267,7 +268,6 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
struct linux_dirent __user *, dirent, unsigned int, count)
{
struct fd f;
- struct linux_dirent __user * lastdirent;
struct getdents_callback buf = {
.ctx.actor = filldir,
.count = count,
@@ -285,8 +285,10 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
- lastdirent = buf.previous;
- if (lastdirent) {
+ if (buf.prev_reclen) {
+ struct linux_dirent __user * lastdirent;
+ lastdirent = (void __user *)buf.current_dir - buf.prev_reclen;
+
if (put_user(buf.ctx.pos, &lastdirent->d_off))
error = -EFAULT;
else
@@ -299,7 +301,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd,
struct getdents_callback64 {
struct dir_context ctx;
struct linux_dirent64 __user * current_dir;
- struct linux_dirent64 __user * previous;
+ int prev_reclen;
int count;
int error;
};
@@ -307,11 +309,12 @@ struct getdents_callback64 {
static int filldir64(struct dir_context *ctx, const char *name, int namlen,
loff_t offset, u64 ino, unsigned int d_type)
{
- struct linux_dirent64 __user *dirent;
+ struct linux_dirent64 __user *dirent, *prev;
struct getdents_callback64 *buf =
container_of(ctx, struct getdents_callback64, ctx);
int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1,
sizeof(u64));
+ int prev_reclen;
buf->error = verify_dirent_name(name, namlen);
if (unlikely(buf->error))
@@ -319,30 +322,27 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen,
buf->error = -EINVAL; /* only used if we fail.. */
if (reclen > buf->count)
return -EINVAL;
- dirent = buf->previous;
- if (dirent && signal_pending(current))
+ prev_reclen = buf->prev_reclen;
+ if (prev_reclen && signal_pending(current))
return -EINTR;
-
- /*
- * Note! This range-checks 'previous' (which may be NULL).
- * The real range was checked in getdents
- */
- if (!user_access_begin(dirent, sizeof(*dirent)))
- goto efault;
- if (dirent)
- unsafe_put_user(offset, &dirent->d_off, efault_end);
dirent = buf->current_dir;
+ prev = (void __user *)dirent - prev_reclen;
+ if (!user_access_begin(prev, reclen + prev_reclen))
+ goto efault;
+
+ /* This might be 'dirent->d_off', but if so it will get overwritten */
+ unsafe_put_user(offset, &prev->d_off, efault_end);
unsafe_put_user(ino, &dirent->d_ino, efault_end);
unsafe_put_user(reclen, &dirent->d_reclen, efault_end);
unsafe_put_user(d_type, &dirent->d_type, efault_end);
unsafe_copy_dirent_name(dirent->d_name, name, namlen, efault_end);
user_access_end();
- buf->previous = dirent;
- dirent = (void __user *)dirent + reclen;
- buf->current_dir = dirent;
+ buf->prev_reclen = reclen;
+ buf->current_dir = (void __user *)dirent + reclen;
buf->count -= reclen;
return 0;
+
efault_end:
user_access_end();
efault:
@@ -354,7 +354,6 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent,
unsigned int count)
{
struct fd f;
- struct linux_dirent64 __user * lastdirent;
struct getdents_callback64 buf = {
.ctx.actor = filldir64,
.count = count,
@@ -372,9 +371,11 @@ int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent,
error = iterate_dir(f.file, &buf.ctx);
if (error >= 0)
error = buf.error;
- lastdirent = buf.previous;
- if (lastdirent) {
+ if (buf.prev_reclen) {
+ struct linux_dirent64 __user * lastdirent;
typeof(lastdirent->d_off) d_off = buf.ctx.pos;
+
+ lastdirent = (void __user *) buf.current_dir - buf.prev_reclen;
if (__put_user(d_off, &lastdirent->d_off))
error = -EFAULT;
else
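
The filldir()/filldir64() rework stops caching a user pointer to the previous dirent and instead remembers only its length, recomputing the previous record's address from the current position; that lets one user_access_begin() window validate the back-patched d_off together with the freshly written record. A hedged userspace sketch of the same back-patching scheme (struct rec is a simplified stand-in for struct linux_dirent, and the final record's d_off is left to the caller, as getdents does):

#include <stdio.h>
#include <stddef.h>
#include <string.h>

struct rec {
	long d_off;		/* back-patched once the next entry lands */
	unsigned short d_reclen;
	char d_name[];
};

/* No pointer to the previous record is kept; it is recomputed as
 * (cur - prev_reclen), so one bounds check can cover both the
 * back-patched d_off and the new record. */
static char *emit(char *cur, int *prev_reclen, const char *name, long off)
{
	size_t reclen = sizeof(struct rec) + strlen(name) + 1;
	struct rec *r = (struct rec *)cur;

	reclen = (reclen + sizeof(long) - 1) & ~(sizeof(long) - 1);
	if (*prev_reclen) {
		struct rec *prev = (struct rec *)(cur - *prev_reclen);

		prev->d_off = off;	/* now known: where "next" begins */
	}
	r->d_reclen = (unsigned short)reclen;
	strcpy(r->d_name, name);
	*prev_reclen = (int)reclen;
	return cur + reclen;
}

int main(void)
{
	union { char bytes[256]; long align; } buf;
	char *cur = buf.bytes;
	int prev_reclen = 0;

	cur = emit(cur, &prev_reclen, ".", 1);
	cur = emit(cur, &prev_reclen, "..", 2);
	printf("first record's d_off = %ld\n",
	       ((struct rec *)buf.bytes)->d_off);	/* 2 */
	return 0;
}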
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 132ec4406ed0..6419e6dacc39 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2097,6 +2097,15 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
goto out_inserted_sd;
}
+ /*
+ * Mark it private if we're creating the privroot
+ * or something under it.
+ */
+ if (IS_PRIVATE(dir) || dentry == REISERFS_SB(sb)->priv_root) {
+ inode->i_flags |= S_PRIVATE;
+ inode->i_opflags &= ~IOP_XATTR;
+ }
+
if (reiserfs_posixacl(inode->i_sb)) {
reiserfs_write_unlock(inode->i_sb);
retval = reiserfs_inherit_default_acl(th, dir, dentry, inode);
@@ -2111,8 +2120,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
reiserfs_warning(inode->i_sb, "jdm-13090",
"ACLs aren't enabled in the fs, "
"but vfs thinks they are!");
- } else if (IS_PRIVATE(dir))
- inode->i_flags |= S_PRIVATE;
+ }
if (security->name) {
reiserfs_write_unlock(inode->i_sb);
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 97f3fc4fdd79..959a066b7bb0 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -377,10 +377,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
/*
* Propagate the private flag so we know we're
- * in the priv tree
+ * in the priv tree. Also clear IOP_XATTR
+ * since we don't have xattrs on xattr files.
*/
- if (IS_PRIVATE(dir))
+ if (IS_PRIVATE(dir)) {
inode->i_flags |= S_PRIVATE;
+ inode->i_opflags &= ~IOP_XATTR;
+ }
}
reiserfs_write_unlock(dir->i_sb);
if (retval == IO_ERROR) {
diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h
index e5ca9ed79e54..726580114d55 100644
--- a/fs/reiserfs/reiserfs.h
+++ b/fs/reiserfs/reiserfs.h
@@ -1168,6 +1168,8 @@ static inline int bmap_would_wrap(unsigned bmap_nr)
return bmap_nr > ((1LL << 16) - 1);
}
+extern const struct xattr_handler *reiserfs_xattr_handlers[];
+
/*
* this says about version of key of all items (but stat data) the
* object consists of
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 0037aea97d39..2946713cb00d 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -2250,7 +2250,8 @@ error_out:
/* also releases the path */
unfix_nodes(&s_ins_balance);
#ifdef REISERQUOTA_DEBUG
- reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
+ if (inode)
+ reiserfs_debug(th->t_super, REISERFS_DEBUG_CODE,
"reiserquota insert_item(): freeing %u id=%u type=%c",
quota_bytes, inode->i_uid, head2type(ih));
#endif
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index ab028ea0e561..7a796b5b6643 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -629,6 +629,7 @@ static void reiserfs_put_super(struct super_block *s)
reiserfs_write_unlock(s);
mutex_destroy(&REISERFS_SB(s)->lock);
destroy_workqueue(REISERFS_SB(s)->commit_wq);
+ kfree(REISERFS_SB(s)->s_jdev);
kfree(s->s_fs_info);
s->s_fs_info = NULL;
}
@@ -1947,7 +1948,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if (!sbi->s_jdev) {
SWARN(silent, s, "", "Cannot allocate memory for "
"journal device name");
- goto error;
+ goto error_unlocked;
}
}
#ifdef CONFIG_QUOTA
@@ -2046,6 +2047,8 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
if (replay_only(s))
goto error_unlocked;
+ s->s_xattr = reiserfs_xattr_handlers;
+
if (bdev_read_only(s->s_bdev) && !sb_rdonly(s)) {
SWARN(silent, s, "clm-7000",
"Detected readonly device, marking FS readonly");
@@ -2235,6 +2238,7 @@ error_unlocked:
kfree(qf_names[j]);
}
#endif
+ kfree(sbi->s_jdev);
kfree(sbi);
s->s_fs_info = NULL;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index b5b26d8a192c..28b241cd6987 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -122,13 +122,13 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
struct dentry *xaroot;
if (d_really_is_negative(privroot))
- return ERR_PTR(-ENODATA);
+ return ERR_PTR(-EOPNOTSUPP);
inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR);
xaroot = dget(REISERFS_SB(sb)->xattr_root);
if (!xaroot)
- xaroot = ERR_PTR(-ENODATA);
+ xaroot = ERR_PTR(-EOPNOTSUPP);
else if (d_really_is_negative(xaroot)) {
int err = -ENODATA;
@@ -319,8 +319,12 @@ static int reiserfs_for_each_xattr(struct inode *inode,
out_dir:
dput(dir);
out:
- /* -ENODATA isn't an error */
- if (err == -ENODATA)
+ /*
+ * -ENODATA: this object doesn't have any xattrs
+ * -EOPNOTSUPP: this file system doesn't have xattrs enabled on disk.
+	 * Neither is an error.
+ */
+ if (err == -ENODATA || err == -EOPNOTSUPP)
err = 0;
return err;
}
@@ -619,6 +623,10 @@ int reiserfs_xattr_set(struct inode *inode, const char *name,
int error, error2;
size_t jbegin_count = reiserfs_xattr_nblocks(inode, buffer_size);
+	/* Check early, so we do not start a transaction only to do nothing. */
+ if (!d_really_is_positive(REISERFS_SB(inode->i_sb)->priv_root))
+ return -EOPNOTSUPP;
+
if (!(flags & XATTR_REPLACE))
jbegin_count += reiserfs_xattr_jcreate_nblocks(inode);
@@ -841,8 +849,7 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
if (d_really_is_negative(dentry))
return -EINVAL;
- if (!dentry->d_sb->s_xattr ||
- get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
+ if (get_inode_sd_version(d_inode(dentry)) == STAT_DATA_V1)
return -EOPNOTSUPP;
dir = open_xa_dir(d_inode(dentry), XATTR_REPLACE);
@@ -882,6 +889,7 @@ static int create_privroot(struct dentry *dentry)
}
d_inode(dentry)->i_flags |= S_PRIVATE;
+ d_inode(dentry)->i_opflags &= ~IOP_XATTR;
reiserfs_info(dentry->d_sb, "Created %s - reserved for xattr "
"storage.\n", PRIVROOT_NAME);
@@ -895,7 +903,7 @@ static int create_privroot(struct dentry *dentry) { return 0; }
#endif
/* Actual operations that are exported to VFS-land */
-static const struct xattr_handler *reiserfs_xattr_handlers[] = {
+const struct xattr_handler *reiserfs_xattr_handlers[] = {
#ifdef CONFIG_REISERFS_FS_XATTR
&reiserfs_xattr_user_handler,
&reiserfs_xattr_trusted_handler,
@@ -966,8 +974,10 @@ int reiserfs_lookup_privroot(struct super_block *s)
if (!IS_ERR(dentry)) {
REISERFS_SB(s)->priv_root = dentry;
d_set_d_op(dentry, &xattr_lookup_poison_ops);
- if (d_really_is_positive(dentry))
+ if (d_really_is_positive(dentry)) {
d_inode(dentry)->i_flags |= S_PRIVATE;
+ d_inode(dentry)->i_opflags &= ~IOP_XATTR;
+ }
} else
err = PTR_ERR(dentry);
inode_unlock(d_inode(s->s_root));
@@ -996,7 +1006,6 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
}
if (d_really_is_positive(privroot)) {
- s->s_xattr = reiserfs_xattr_handlers;
inode_lock(d_inode(privroot));
if (!REISERFS_SB(s)->xattr_root) {
struct dentry *dentry;
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index aa9380bac196..05f666794561 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -320,10 +320,8 @@ reiserfs_inherit_default_acl(struct reiserfs_transaction_handle *th,
* would be useless since permissions are ignored, and a pain because
* it introduces locking cycles
*/
- if (IS_PRIVATE(dir)) {
- inode->i_flags |= S_PRIVATE;
+ if (IS_PRIVATE(inode))
goto apply_umask;
- }
err = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
if (err)
diff --git a/fs/select.c b/fs/select.c
index a4d8f6e8b63c..1fc1b247fede 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -730,7 +730,6 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
const sigset_t __user *sigmask, size_t sigsetsize,
enum poll_time_type type)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -753,12 +752,12 @@ static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp,
return -EINVAL;
}
- ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = core_sys_select(n, inp, outp, exp, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
+ restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
return ret;
@@ -1086,7 +1085,6 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
struct __kernel_timespec __user *, tsp, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1099,17 +1097,16 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds,
return -EINVAL;
}
- ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+ restore_saved_sigmask_unless(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
return ret;
@@ -1121,7 +1118,6 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
struct old_timespec32 __user *, tsp, const sigset_t __user *, sigmask,
size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1134,17 +1130,16 @@ SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds, unsigned int, nfds,
return -EINVAL;
}
- ret = set_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+ restore_saved_sigmask_unless(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
return ret;
@@ -1319,7 +1314,6 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
void __user *tsp, compat_sigset_t __user *sigmask,
compat_size_t sigsetsize, enum poll_time_type type)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1342,12 +1336,12 @@ static long do_compat_pselect(int n, compat_ulong_t __user *inp,
return -EINVAL;
}
- ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_compat_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = compat_core_sys_select(n, inp, outp, exp, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -ERESTARTNOHAND);
+ restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);
ret = poll_select_copy_remaining(&end_time, tsp, type, ret);
return ret;
@@ -1402,7 +1396,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
unsigned int, nfds, struct old_timespec32 __user *, tsp,
const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1415,17 +1408,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time32, struct pollfd __user *, ufds,
return -EINVAL;
}
- ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_compat_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+ restore_saved_sigmask_unless(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_OLD_TIMESPEC, ret);
return ret;
@@ -1437,7 +1429,6 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
unsigned int, nfds, struct __kernel_timespec __user *, tsp,
const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize)
{
- sigset_t ksigmask, sigsaved;
struct timespec64 ts, end_time, *to = NULL;
int ret;
@@ -1450,17 +1441,16 @@ COMPAT_SYSCALL_DEFINE5(ppoll_time64, struct pollfd __user *, ufds,
return -EINVAL;
}
- ret = set_compat_user_sigmask(sigmask, &ksigmask, &sigsaved, sigsetsize);
+ ret = set_compat_user_sigmask(sigmask, sigsetsize);
if (ret)
return ret;
ret = do_sys_poll(ufds, nfds, to);
- restore_user_sigmask(sigmask, &sigsaved, ret == -EINTR);
+ restore_saved_sigmask_unless(ret == -EINTR);
/* We can restart this syscall, usually */
if (ret == -EINTR)
ret = -ERESTARTNOHAND;
-
ret = poll_select_copy_remaining(&end_time, tsp, PT_TIMESPEC, ret);
return ret;
diff --git a/fs/splice.c b/fs/splice.c
index 4fe4afa7f6b1..e3059110adb6 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -947,12 +947,13 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
WARN_ON_ONCE(pipe->nrbufs != 0);
while (len) {
+ unsigned int pipe_pages;
size_t read_len;
loff_t pos = sd->pos, prev_pos = pos;
/* Don't try to read more than the pipe has space for. */
- read_len = min_t(size_t, len,
- (pipe->buffers - pipe->nrbufs) << PAGE_SHIFT);
+ pipe_pages = pipe->buffers - pipe->nrbufs;
+ read_len = min(len, (size_t)pipe_pages << PAGE_SHIFT);
ret = do_splice_to(in, &pos, pipe, read_len, flags);
if (unlikely(ret <= 0))
goto out_release;
@@ -1182,8 +1183,15 @@ static long do_splice(struct file *in, loff_t __user *off_in,
pipe_lock(opipe);
ret = wait_for_space(opipe, flags);
- if (!ret)
+ if (!ret) {
+ unsigned int pipe_pages;
+
+			/* Don't try to read more than the pipe has space for. */
+ pipe_pages = opipe->buffers - opipe->nrbufs;
+ len = min(len, (size_t)pipe_pages << PAGE_SHIFT);
+
ret = do_splice_to(in, &offset, opipe, len, flags);
+ }
pipe_unlock(opipe);
if (ret > 0)
wakeup_pipe_readers(opipe);
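
Both splice hunks compute the free slot count first and perform the page shift in size_t, presumably to keep the clamping arithmetic explicit and overflow-safe rather than shifting inside a narrower type. A minimal sketch of the clamp, assuming 4 KiB pages (the slot counts and lengths are illustrative):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SHIFT 12	/* assumption: 4 KiB pages */

/* Clamp a requested length to the pipe's free capacity, shifting in
 * size_t so a large free-slot count cannot overflow a narrower type. */
static size_t clamp_to_pipe(size_t len, unsigned int buffers,
			    unsigned int nrbufs)
{
	unsigned int pipe_pages = buffers - nrbufs;	/* free slots */
	size_t space = (size_t)pipe_pages << PAGE_SHIFT;

	return len < space ? len : space;
}

int main(void)
{
	/* 16-slot pipe with 3 slots occupied: 13 free pages */
	printf("%zu\n", clamp_to_pipe(1UL << 20, 16, 3));	/* 53248 */
	return 0;
}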
diff --git a/fs/super.c b/fs/super.c
index 2739f57515f8..691b202aa5c2 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -444,10 +444,12 @@ void generic_shutdown_super(struct super_block *sb)
sync_filesystem(sb);
sb->s_flags &= ~SB_ACTIVE;
- fsnotify_sb_delete(sb);
cgroup_writeback_umount();
+ /* evict all inodes with zero refcount */
evict_inodes(sb);
+ /* only nonzero refcount inodes can have marks */
+ fsnotify_sb_delete(sb);
if (sb->s_dio_done_wq) {
destroy_workqueue(sb->s_dio_done_wq);
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index 60f43b93d06e..046fafbe1d38 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -77,13 +77,9 @@ int ubifs_prepare_auth_node(struct ubifs_info *c, void *node,
struct shash_desc *inhash)
{
struct ubifs_auth_node *auth = node;
- u8 *hash;
+ u8 hash[UBIFS_HASH_ARR_SZ];
int err;
- hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS);
- if (!hash)
- return -ENOMEM;
-
{
SHASH_DESC_ON_STACK(hash_desc, c->hash_tfm);
@@ -92,21 +88,16 @@ int ubifs_prepare_auth_node(struct ubifs_info *c, void *node,
err = crypto_shash_final(hash_desc, hash);
if (err)
- goto out;
+ return err;
}
err = ubifs_hash_calc_hmac(c, hash, auth->hmac);
if (err)
- goto out;
+ return err;
auth->ch.node_type = UBIFS_AUTH_NODE;
ubifs_prepare_node(c, auth, ubifs_auth_node_sz(c), 0);
-
- err = 0;
-out:
- kfree(hash);
-
- return err;
+ return 0;
}
static struct shash_desc *ubifs_get_desc(const struct ubifs_info *c,
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 0b98e3c8b461..6c0e19f7a21f 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -228,6 +228,8 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry,
if (nm.hash) {
ubifs_assert(c, fname_len(&nm) == 0);
ubifs_assert(c, fname_name(&nm) == NULL);
+ if (nm.hash & ~UBIFS_S_KEY_HASH_MASK)
+ goto done; /* ENOENT */
dent_key_init_hash(c, &key, dir->i_ino, nm.hash);
err = ubifs_tnc_lookup_dh(c, &key, dent, nm.minor_hash);
} else {
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index e5f8de62fc51..31ed4e957cc4 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -786,7 +786,9 @@ static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
if (page_offset > end_index)
break;
- page = find_or_create_page(mapping, page_offset, ra_gfp_mask);
+ page = pagecache_get_page(mapping, page_offset,
+ FGP_LOCK|FGP_ACCESSED|FGP_CREAT|FGP_NOWAIT,
+ ra_gfp_mask);
if (!page)
break;
if (!PageUptodate(page))
@@ -1376,7 +1378,6 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time,
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_budget_req req = { .dirtied_ino = 1,
.dirtied_ino_d = ALIGN(ui->data_len, 8) };
- int iflags = I_DIRTY_TIME;
int err, release;
if (!IS_ENABLED(CONFIG_UBIFS_ATIME_SUPPORT))
@@ -1394,11 +1395,8 @@ int ubifs_update_time(struct inode *inode, struct timespec64 *time,
if (flags & S_MTIME)
inode->i_mtime = *time;
- if (!(inode->i_sb->s_flags & SB_LAZYTIME))
- iflags |= I_DIRTY_SYNC;
-
release = ui->dirty;
- __mark_inode_dirty(inode, iflags);
+ __mark_inode_dirty(inode, I_DIRTY_SYNC);
mutex_unlock(&ui->ui_mutex);
if (release)
ubifs_release_budget(c, &req);
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 4f1a397fda69..de65ba8a6ab4 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -120,7 +120,8 @@ static int setflags(struct inode *inode, int flags)
}
}
- ui->flags = ioctl2ubifs(flags);
+ ui->flags &= ~ioctl2ubifs(UBIFS_SUPPORTED_IOCTL_FLAGS);
+ ui->flags |= ioctl2ubifs(flags);
ubifs_set_inode_flags(inode);
inode->i_ctime = current_time(inode);
release = ui->dirty;
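
The setflags() fix is a classic read-modify-write on a flags word: clear only the bits the ioctl controls, then OR in the new values, instead of a plain assignment that wiped kernel-internal bits. A minimal sketch (IOCTL_MASK is a made-up mask, not the real UBIFS_SUPPORTED_IOCTL_FLAGS translation):

#include <stdio.h>

#define IOCTL_MASK 0x0f	/* bits user-controllable via the ioctl (assumption) */

/* Clear only the ioctl-owned bits, then OR in the new values,
 * leaving the remaining (internal) bits untouched. */
static unsigned int update_flags(unsigned int cur, unsigned int new_ioctl)
{
	cur &= ~IOCTL_MASK;
	cur |= new_ioctl & IOCTL_MASK;
	return cur;
}

int main(void)
{
	printf("%#x\n", update_flags(0xf3, 0x05));	/* 0xf5: high bits kept */
	return 0;
}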
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 4fd9683b8245..a6ae2428e4c9 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -539,7 +539,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
const struct fscrypt_name *nm, const struct inode *inode,
int deletion, int xent)
{
- int err, dlen, ilen, len, lnum, ino_offs, dent_offs;
+ int err, dlen, ilen, len, lnum, ino_offs, dent_offs, orphan_added = 0;
int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir);
int last_reference = !!(deletion && inode->i_nlink == 0);
struct ubifs_inode *ui = ubifs_inode(inode);
@@ -630,6 +630,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir,
goto out_finish;
}
ui->del_cmtno = c->cmt_no;
+ orphan_added = 1;
}
err = write_head(c, BASEHD, dent, len, &lnum, &dent_offs, sync);
@@ -702,7 +703,7 @@ out_release:
kfree(dent);
out_ro:
ubifs_ro_mode(c, err);
- if (last_reference)
+ if (orphan_added)
ubifs_delete_orphan(c, inode->i_ino);
finish_reservation(c);
return err;
@@ -899,7 +900,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
fname_name(&nm) = xent->name;
fname_len(&nm) = le16_to_cpu(xent->nlen);
- xino = ubifs_iget(c->vfs_sb, xent->inum);
+ xino = ubifs_iget(c->vfs_sb, le64_to_cpu(xent->inum));
if (IS_ERR(xino)) {
err = PTR_ERR(xino);
ubifs_err(c, "dead directory entry '%s', error %d",
@@ -1217,7 +1218,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
void *p;
union ubifs_key key;
struct ubifs_dent_node *dent, *dent2;
- int err, dlen1, dlen2, ilen, lnum, offs, len;
+ int err, dlen1, dlen2, ilen, lnum, offs, len, orphan_added = 0;
int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
int last_reference = !!(new_inode && new_inode->i_nlink == 0);
int move = (old_dir != new_dir);
@@ -1333,6 +1334,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
goto out_finish;
}
new_ui->del_cmtno = c->cmt_no;
+ orphan_added = 1;
}
err = write_head(c, BASEHD, dent, len, &lnum, &offs, sync);
@@ -1414,7 +1416,7 @@ out_release:
release_head(c, BASEHD);
out_ro:
ubifs_ro_mode(c, err);
- if (last_reference)
+ if (orphan_added)
ubifs_delete_orphan(c, new_inode->i_ino);
out_finish:
finish_reservation(c);
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index cb72688032cd..868e64226439 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -126,25 +126,10 @@ static void __orphan_drop(struct ubifs_info *c, struct ubifs_orphan *o)
kfree(o);
}
-static void orphan_delete(struct ubifs_info *c, ino_t inum)
+static void orphan_delete(struct ubifs_info *c, struct ubifs_orphan *orph)
{
- struct ubifs_orphan *orph, *child_orph, *tmp_o;
-
- spin_lock(&c->orphan_lock);
-
- orph = lookup_orphan(c, inum);
- if (!orph) {
- spin_unlock(&c->orphan_lock);
- ubifs_err(c, "missing orphan ino %lu", (unsigned long)inum);
- dump_stack();
-
- return;
- }
-
if (orph->del) {
- spin_unlock(&c->orphan_lock);
- dbg_gen("deleted twice ino %lu",
- (unsigned long)inum);
+ dbg_gen("deleted twice ino %lu", (unsigned long)orph->inum);
return;
}
@@ -152,20 +137,11 @@ static void orphan_delete(struct ubifs_info *c, ino_t inum)
orph->del = 1;
orph->dnext = c->orph_dnext;
c->orph_dnext = orph;
- spin_unlock(&c->orphan_lock);
- dbg_gen("delete later ino %lu",
- (unsigned long)inum);
+ dbg_gen("delete later ino %lu", (unsigned long)orph->inum);
return;
}
- list_for_each_entry_safe(child_orph, tmp_o, &orph->child_list, child_list) {
- list_del(&child_orph->child_list);
- __orphan_drop(c, child_orph);
- }
-
__orphan_drop(c, orph);
-
- spin_unlock(&c->orphan_lock);
}
/**
@@ -223,7 +199,27 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
*/
void ubifs_delete_orphan(struct ubifs_info *c, ino_t inum)
{
- orphan_delete(c, inum);
+ struct ubifs_orphan *orph, *child_orph, *tmp_o;
+
+ spin_lock(&c->orphan_lock);
+
+ orph = lookup_orphan(c, inum);
+ if (!orph) {
+ spin_unlock(&c->orphan_lock);
+ ubifs_err(c, "missing orphan ino %lu", (unsigned long)inum);
+ dump_stack();
+
+ return;
+ }
+
+ list_for_each_entry_safe(child_orph, tmp_o, &orph->child_list, child_list) {
+ list_del(&child_orph->child_list);
+ orphan_delete(c, child_orph);
+ }
+
+ orphan_delete(c, orph);
+
+ spin_unlock(&c->orphan_lock);
}
/**
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index b28ac4dfb407..01fcf7975047 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -601,18 +601,12 @@ static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
struct ubifs_scan_node *snod;
int n_nodes = 0;
int err;
- u8 *hash, *hmac;
+ u8 hash[UBIFS_HASH_ARR_SZ];
+ u8 hmac[UBIFS_HMAC_ARR_SZ];
if (!ubifs_authenticated(c))
return sleb->nodes_cnt;
- hash = kmalloc(crypto_shash_descsize(c->hash_tfm), GFP_NOFS);
- hmac = kmalloc(c->hmac_desc_len, GFP_NOFS);
- if (!hash || !hmac) {
- err = -ENOMEM;
- goto out;
- }
-
list_for_each_entry(snod, &sleb->nodes, list) {
n_nodes++;
@@ -662,9 +656,6 @@ static int authenticate_sleb(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
err = 0;
}
out:
- kfree(hash);
- kfree(hmac);
-
return err ? err : n_nodes - n_not_auth;
}
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 12c2afdb5804..87a4775e5076 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -161,7 +161,7 @@ static int create_default_filesystem(struct ubifs_info *c)
sup = kzalloc(ALIGN(UBIFS_SB_NODE_SZ, c->min_io_size), GFP_KERNEL);
mst = kzalloc(c->mst_node_alsz, GFP_KERNEL);
idx_node_size = ubifs_idx_node_sz(c, 1);
- idx = kzalloc(ALIGN(tmp, c->min_io_size), GFP_KERNEL);
+ idx = kzalloc(ALIGN(idx_node_size, c->min_io_size), GFP_KERNEL);
ino = kzalloc(ALIGN(UBIFS_INO_NODE_SZ, c->min_io_size), GFP_KERNEL);
cs = kzalloc(ALIGN(UBIFS_CS_NODE_SZ, c->min_io_size), GFP_KERNEL);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 6cfc494050be..4ccb448d29a6 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1564,6 +1564,7 @@ out_free:
vfree(c->ileb_buf);
vfree(c->sbuf);
kfree(c->bottom_up_buf);
+ kfree(c->sup_node);
ubifs_debugging_exit(c);
return err;
}
@@ -1606,6 +1607,7 @@ static void ubifs_umount(struct ubifs_info *c)
vfree(c->ileb_buf);
vfree(c->sbuf);
kfree(c->bottom_up_buf);
+ kfree(c->sup_node);
ubifs_debugging_exit(c);
}
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index a384a0f9ff32..234be1c4dc87 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -212,7 +212,7 @@ static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key,
/**
* layout_leb_in_gaps - layout index nodes using in-the-gaps method.
* @c: UBIFS file-system description object
- * @p: return LEB number here
+ * @p: return LEB number in @c->gap_lebs[p]
*
* This function lays out new index nodes for dirty znodes using in-the-gaps
* method of TNC commit.
@@ -221,7 +221,7 @@ static int is_idx_node_in_use(struct ubifs_info *c, union ubifs_key *key,
* This function returns the number of index nodes written into the gaps, or a
* negative error code on failure.
*/
-static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
+static int layout_leb_in_gaps(struct ubifs_info *c, int p)
{
struct ubifs_scan_leb *sleb;
struct ubifs_scan_node *snod;
@@ -236,7 +236,7 @@ static int layout_leb_in_gaps(struct ubifs_info *c, int *p)
* filled, however we do not check there at present.
*/
return lnum; /* Error code */
- *p = lnum;
+ c->gap_lebs[p] = lnum;
dbg_gc("LEB %d", lnum);
/*
* Scan the index LEB. We use the generic scan for this even though
@@ -355,7 +355,7 @@ static int get_leb_cnt(struct ubifs_info *c, int cnt)
*/
static int layout_in_gaps(struct ubifs_info *c, int cnt)
{
- int err, leb_needed_cnt, written, *p;
+ int err, leb_needed_cnt, written, p = 0, old_idx_lebs, *gap_lebs;
dbg_gc("%d znodes to write", cnt);
@@ -364,9 +364,9 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
if (!c->gap_lebs)
return -ENOMEM;
- p = c->gap_lebs;
+ old_idx_lebs = c->lst.idx_lebs;
do {
- ubifs_assert(c, p < c->gap_lebs + c->lst.idx_lebs);
+ ubifs_assert(c, p < c->lst.idx_lebs);
written = layout_leb_in_gaps(c, p);
if (written < 0) {
err = written;
@@ -392,9 +392,29 @@ static int layout_in_gaps(struct ubifs_info *c, int cnt)
leb_needed_cnt = get_leb_cnt(c, cnt);
dbg_gc("%d znodes remaining, need %d LEBs, have %d", cnt,
leb_needed_cnt, c->ileb_cnt);
+			/*
+			 * Dynamically change the size of @c->gap_lebs to prevent an
+			 * out-of-bounds access: @c->lst.idx_lebs could be increased by
+			 * @get_idx_gc_leb (called via layout_leb_in_gaps->
+			 * ubifs_find_dirty_idx_leb) during the loop. Only enlarge
+			 * @c->gap_lebs when needed.
+			 */
+ if (leb_needed_cnt > c->ileb_cnt && p >= old_idx_lebs &&
+ old_idx_lebs < c->lst.idx_lebs) {
+ old_idx_lebs = c->lst.idx_lebs;
+ gap_lebs = krealloc(c->gap_lebs, sizeof(int) *
+ (old_idx_lebs + 1), GFP_NOFS);
+ if (!gap_lebs) {
+ kfree(c->gap_lebs);
+ c->gap_lebs = NULL;
+ return -ENOMEM;
+ }
+ c->gap_lebs = gap_lebs;
+ }
} while (leb_needed_cnt > c->ileb_cnt);
- *p = -1;
+ c->gap_lebs[p] = -1;
return 0;
}
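
The layout_in_gaps() change grows @c->gap_lebs with krealloc() when the index-LEB count rises mid-loop, freeing and clearing the old pointer if the enlargement fails. A userspace analog of that grow-or-clean-up rule, with realloc() in place of krealloc() and -1 in place of -ENOMEM:

#include <stdio.h>
#include <stdlib.h>

/* Grow an int array to hold new_cnt entries plus a terminator.  As
 * in the patch, the old buffer is freed and the pointer cleared when
 * enlargement fails, so the caller never sees a dangling pointer. */
static int grow(int **arr, int new_cnt)
{
	int *tmp = realloc(*arr, sizeof(int) * (new_cnt + 1));

	if (!tmp) {
		free(*arr);
		*arr = NULL;
		return -1;	/* -ENOMEM in the kernel */
	}
	*arr = tmp;
	return 0;
}

int main(void)
{
	int *gap_lebs = malloc(sizeof(int) * 4);

	if (!gap_lebs)
		return 1;
	if (grow(&gap_lebs, 8) == 0) {
		gap_lebs[8] = -1;	/* terminator, as in layout_in_gaps() */
		printf("grown ok\n");
	}
	free(gap_lebs);
	return 0;
}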
diff --git a/fs/udf/super.c b/fs/udf/super.c
index a14346137361..d17bdacdda60 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -994,7 +994,6 @@ static int check_partition_desc(struct super_block *sb,
switch (le32_to_cpu(p->accessType)) {
case PD_ACCESS_TYPE_READ_ONLY:
case PD_ACCESS_TYPE_WRITE_ONCE:
- case PD_ACCESS_TYPE_REWRITABLE:
case PD_ACCESS_TYPE_NONE:
goto force_ro;
}
@@ -2449,17 +2448,29 @@ static unsigned int udf_count_free_table(struct super_block *sb,
static unsigned int udf_count_free(struct super_block *sb)
{
unsigned int accum = 0;
- struct udf_sb_info *sbi;
+ struct udf_sb_info *sbi = UDF_SB(sb);
struct udf_part_map *map;
+ unsigned int part = sbi->s_partition;
+ int ptype = sbi->s_partmaps[part].s_partition_type;
+
+ if (ptype == UDF_METADATA_MAP25) {
+ part = sbi->s_partmaps[part].s_type_specific.s_metadata.
+ s_phys_partition_ref;
+ } else if (ptype == UDF_VIRTUAL_MAP15 || ptype == UDF_VIRTUAL_MAP20) {
+ /*
+ * Filesystems with VAT are append-only and we cannot write to
+ * them. Let's just report 0 here.
+ */
+ return 0;
+ }
- sbi = UDF_SB(sb);
if (sbi->s_lvid_bh) {
struct logicalVolIntegrityDesc *lvid =
(struct logicalVolIntegrityDesc *)
sbi->s_lvid_bh->b_data;
- if (le32_to_cpu(lvid->numOfPartitions) > sbi->s_partition) {
+ if (le32_to_cpu(lvid->numOfPartitions) > part) {
accum = le32_to_cpu(
- lvid->freeSpaceTable[sbi->s_partition]);
+ lvid->freeSpaceTable[part]);
if (accum == 0xFFFFFFFF)
accum = 0;
}
@@ -2468,7 +2479,7 @@ static unsigned int udf_count_free(struct super_block *sb)
if (accum)
return accum;
- map = &sbi->s_partmaps[sbi->s_partition];
+ map = &sbi->s_partmaps[part];
if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP) {
accum += udf_count_free_bitmap(sb,
map->s_uspace.s_bitmap);
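From userspace the change surfaces through statfs()/statvfs(): metadata partitions now report the free space of the physical partition behind them, and append-only VAT media report zero. A trivial probe (the mount point is illustrative):

#include <stdio.h>
#include <sys/statvfs.h>

int main(void)
{
	struct statvfs sv;

	if (statvfs("/mnt/udf", &sv) != 0) {	/* illustrative mount point */
		perror("statvfs");
		return 1;
	}
	/* On VAT media this now prints 0 rather than a bogus count. */
	printf("free blocks: %llu\n", (unsigned long long)sv.f_bfree);
	return 0;
}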
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 3d247c0d92aa..15677eebef4c 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -101,7 +101,7 @@ static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 gene
struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi;
struct inode *inode;
- if (ino < UFS_ROOTINO || ino > uspi->s_ncg * uspi->s_ipg)
+ if (ino < UFS_ROOTINO || ino > (u64)uspi->s_ncg * uspi->s_ipg)
return ERR_PTR(-ESTALE);
inode = ufs_iget(sb, ino);
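The fix works because casting one operand to u64 forces the whole multiplication to 64 bits; without it, s_ncg * s_ipg is evaluated as a 32-bit product that can wrap before the comparison. A standalone demonstration of the promotion rule (values chosen only to trigger the wrap):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t ncg = 70000, ipg = 70000;	/* illustrative values */

	/* 32-bit multiply wraps: 70000 * 70000 = 4.9e9 > UINT32_MAX */
	uint64_t wrong = ncg * ipg;
	/* casting one operand first forces a 64-bit multiply */
	uint64_t right = (uint64_t)ncg * ipg;

	printf("wrong=%llu right=%llu\n",
	       (unsigned long long)wrong, (unsigned long long)right);
	return 0;
}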
diff --git a/fs/unicode/utf8-core.c b/fs/unicode/utf8-core.c
index 6afab4fdce90..71ca4d047d65 100644
--- a/fs/unicode/utf8-core.c
+++ b/fs/unicode/utf8-core.c
@@ -73,6 +73,34 @@ int utf8_strncasecmp(const struct unicode_map *um,
}
EXPORT_SYMBOL(utf8_strncasecmp);
+/*
+ * String cf is expected to be a valid UTF-8 casefolded string.
+ */
+int utf8_strncasecmp_folded(const struct unicode_map *um,
+ const struct qstr *cf,
+ const struct qstr *s1)
+{
+ const struct utf8data *data = utf8nfdicf(um->version);
+ struct utf8cursor cur1;
+ int c1, c2;
+ int i = 0;
+
+ if (utf8ncursor(&cur1, data, s1->name, s1->len) < 0)
+ return -EINVAL;
+
+ do {
+ c1 = utf8byte(&cur1);
+ c2 = cf->name[i++];
+ if (c1 < 0)
+ return -EINVAL;
+ if (c1 != c2)
+ return 1;
+ } while (c1);
+
+ return 0;
+}
+EXPORT_SYMBOL(utf8_strncasecmp_folded);
+
int utf8_casefold(const struct unicode_map *um, const struct qstr *str,
unsigned char *dest, size_t dlen)
{
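utf8_strncasecmp_folded() saves work by folding only one side: @cf is already casefolded, so each iteration folds a byte of @s1 and compares it directly. A userspace analogue of the same one-sided loop, with tolower() standing in for the kernel's UTF-8 casefolding (illustrative only; the real helper decodes UTF-8 via utf8byte()):

#include <ctype.h>
#include <stdio.h>

/* Compare s1 against cf, where cf is already lower-cased.
 * Returns 0 on match, 1 on mismatch (mirroring the kernel helper).
 */
static int strcasecmp_folded(const char *cf, const char *s1)
{
	int c1, c2;

	do {
		c1 = tolower((unsigned char)*s1++);	/* fold one side only */
		c2 = (unsigned char)*cf++;
		if (c1 != c2)
			return 1;
	} while (c1);

	return 0;
}

int main(void)
{
	printf("%d\n", strcasecmp_folded("readme", "ReadMe"));	/* 0: match */
	printf("%d\n", strcasecmp_folded("readme", "Readm!"));	/* 1: mismatch */
	return 0;
}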
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index fe6d804a38dc..2c807283115d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1832,13 +1832,12 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
goto out;
features = uffdio_api.features;
- if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES)) {
- memset(&uffdio_api, 0, sizeof(uffdio_api));
- if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
- goto out;
- ret = -EINVAL;
- goto out;
- }
+ ret = -EINVAL;
+ if (uffdio_api.api != UFFD_API || (features & ~UFFD_API_FEATURES))
+ goto err_out;
+ ret = -EPERM;
+ if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE))
+ goto err_out;
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
uffdio_api.ioctls = UFFD_API_IOCTLS;
@@ -1851,6 +1850,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
ret = 0;
out:
return ret;
+err_out:
+ memset(&uffdio_api, 0, sizeof(uffdio_api));
+ if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
+ ret = -EFAULT;
+ goto out;
}
static long userfaultfd_ioctl(struct file *file, unsigned cmd,
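The practical effect is that an unprivileged UFFDIO_API request asking for UFFD_FEATURE_EVENT_FORK now fails with EPERM instead of succeeding. A minimal probe of the new behaviour (assumes a kernel with this patch; error handling trimmed):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/userfaultfd.h>

int main(void)
{
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_EVENT_FORK,
	};
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);

	if (uffd < 0)
		return 1;
	if (ioctl(uffd, UFFDIO_API, &api) < 0)
		printf("UFFDIO_API: %s\n", strerror(errno));	/* expect EPERM unprivileged */
	else
		printf("features granted: 0x%llx\n",
		       (unsigned long long)api.features);
	close(uffd);
	return 0;
}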
diff --git a/fs/xattr.c b/fs/xattr.c
index 40b01dd1b14a..494313bce9cc 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -204,10 +204,22 @@ int __vfs_setxattr_noperm(struct dentry *dentry, const char *name,
return error;
}
-
+/**
+ * __vfs_setxattr_locked - set an extended attribute while holding the inode
+ * lock
+ *
+ * @dentry: object to perform setxattr on
+ * @name: xattr name to set
+ * @value: value to set @name to
+ * @size: size of @value
+ * @flags: flags to pass into filesystem operations
+ * @delegated_inode: on return, will contain an inode pointer that
+ * a delegation was broken on, NULL if none
+ */
int
-vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
- size_t size, int flags)
+__vfs_setxattr_locked(struct dentry *dentry, const char *name,
+ const void *value, size_t size, int flags,
+ struct inode **delegated_inode)
{
struct inode *inode = dentry->d_inode;
int error;
@@ -216,15 +228,40 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
if (error)
return error;
- inode_lock(inode);
error = security_inode_setxattr(dentry, name, value, size, flags);
if (error)
goto out;
+ error = try_break_deleg(inode, delegated_inode);
+ if (error)
+ goto out;
+
error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
out:
+ return error;
+}
+EXPORT_SYMBOL_GPL(__vfs_setxattr_locked);
+
+int
+vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
+ size_t size, int flags)
+{
+ struct inode *inode = dentry->d_inode;
+ struct inode *delegated_inode = NULL;
+ int error;
+
+retry_deleg:
+ inode_lock(inode);
+ error = __vfs_setxattr_locked(dentry, name, value, size, flags,
+ &delegated_inode);
inode_unlock(inode);
+
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
return error;
}
EXPORT_SYMBOL_GPL(vfs_setxattr);
@@ -379,8 +416,18 @@ __vfs_removexattr(struct dentry *dentry, const char *name)
}
EXPORT_SYMBOL(__vfs_removexattr);
+/**
+ * __vfs_removexattr_locked - remove an extended attribute while holding the
+ * inode lock
+ *
+ * @dentry: object to perform removexattr on
+ * @name: name of xattr to remove
+ * @delegated_inode: on return, will contain an inode pointer that
+ * a delegation was broken on, NULL if none
+ */
int
-vfs_removexattr(struct dentry *dentry, const char *name)
+__vfs_removexattr_locked(struct dentry *dentry, const char *name,
+ struct inode **delegated_inode)
{
struct inode *inode = dentry->d_inode;
int error;
@@ -389,11 +436,14 @@ vfs_removexattr(struct dentry *dentry, const char *name)
if (error)
return error;
- inode_lock(inode);
error = security_inode_removexattr(dentry, name);
if (error)
goto out;
+ error = try_break_deleg(inode, delegated_inode);
+ if (error)
+ goto out;
+
error = __vfs_removexattr(dentry, name);
if (!error) {
@@ -402,12 +452,32 @@ vfs_removexattr(struct dentry *dentry, const char *name)
}
out:
+ return error;
+}
+EXPORT_SYMBOL_GPL(__vfs_removexattr_locked);
+
+int
+vfs_removexattr(struct dentry *dentry, const char *name)
+{
+ struct inode *inode = dentry->d_inode;
+ struct inode *delegated_inode = NULL;
+ int error;
+
+retry_deleg:
+ inode_lock(inode);
+ error = __vfs_removexattr_locked(dentry, name, &delegated_inode);
inode_unlock(inode);
+
+ if (delegated_inode) {
+ error = break_deleg_wait(&delegated_inode);
+ if (!error)
+ goto retry_deleg;
+ }
+
return error;
}
EXPORT_SYMBOL_GPL(vfs_removexattr);
-
/*
* Extended attribute SET operations
*/
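Nothing changes for the syscall interface; the delegation break-and-retry loop stays inside vfs_setxattr() and vfs_removexattr(). For reference, a minimal userspace exercise of both paths (the path and attribute name are illustrative, on an xattr-capable filesystem):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/xattr.h>

int main(void)
{
	const char *path = "/tmp/xattr-demo";	/* illustrative path */
	const char *val = "hello";
	int fd = open(path, O_CREAT | O_RDWR, 0600);

	if (fd < 0)
		return 1;
	/* vfs_setxattr() runs under the inode lock, retrying if a
	 * delegation had to be broken; callers never see that loop. */
	if (setxattr(path, "user.demo", val, strlen(val), 0) != 0)
		perror("setxattr");
	else if (removexattr(path, "user.demo") != 0)
		perror("removexattr");
	close(fd);
	return 0;
}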
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index a9ff3cf82cce..3e6e7c368064 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2607,6 +2607,13 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
return __this_address;
+ if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
+ return __this_address;
+
+ if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
+ be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
+ return __this_address;
+
if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS ||
@@ -2618,6 +2625,10 @@ xfs_agf_verify(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS))
return __this_address;
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb) &&
+ be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
+ return __this_address;
+
/*
* during growfs operations, the perag is not fully initialised,
* so we can't use it for any useful checking. growfs ensures we can't
@@ -2632,6 +2643,11 @@ xfs_agf_verify(
return __this_address;
if (xfs_sb_version_hasreflink(&mp->m_sb) &&
+ be32_to_cpu(agf->agf_refcount_blocks) >
+ be32_to_cpu(agf->agf_length))
+ return __this_address;
+
+ if (xfs_sb_version_hasreflink(&mp->m_sb) &&
(be32_to_cpu(agf->agf_refcount_level) < 1 ||
be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS))
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 1f6e3965ff74..f09dcf9a96ee 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -926,8 +926,10 @@ xfs_attr_shortform_verify(
* struct xfs_attr_sf_entry has a variable length.
* Check the fixed-offset parts of the structure are
* within the data buffer.
+ * xfs_attr_sf_entry is defined with a 1-byte variable
+ * array at the end, so we must subtract that off.
*/
- if (((char *)sfep + sizeof(*sfep)) >= endp)
+ if (((char *)sfep + sizeof(*sfep) - 1) >= endp)
return __this_address;
/* Don't allow names with known bad length. */
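The subtraction is needed because xfs_attr_sf_entry declares its variable-length name as a 1-byte array, so sizeof(*sfep) already includes one byte of the name. A standalone illustration of that layout pitfall (the struct is a stand-in, not the XFS definition):

#include <stdio.h>

/* stand-in for a header with a 1-byte variable-length tail */
struct sf_entry {
	unsigned char namelen;
	unsigned char flags;
	unsigned char name[1];	/* actually namelen bytes long */
};

int main(void)
{
	/* sizeof includes the one declared name byte... */
	printf("sizeof = %zu\n", sizeof(struct sf_entry));
	/* ...so the fixed part alone is sizeof - 1, as in the fix */
	printf("fixed part = %zu\n", sizeof(struct sf_entry) - 1);
	return 0;
}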
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index d6fbe487d91a..628230bd7311 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -5322,7 +5322,7 @@ __xfs_bunmapi(
* Make sure we don't touch multiple AGF headers out of order
* in a single transaction, as that could cause AB-BA deadlocks.
*/
- if (!wasdel) {
+ if (!wasdel && !isrt) {
agno = XFS_FSB_TO_AGNO(mp, del.br_startblock);
if (prev_agno != NULLAGNUMBER && prev_agno > agno)
break;
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index a62fb950bef1..0c02d76eeb4d 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -57,7 +57,7 @@
XFS_DAREMOVE_SPACE_RES(mp, XFS_DATA_FORK)
#define XFS_IALLOC_SPACE_RES(mp) \
((mp)->m_ialloc_blks + \
- (xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1 * \
+ ((xfs_sb_version_hasfinobt(&mp->m_sb) ? 2 : 1) * \
((mp)->m_in_maxlevels - 1)))
/*
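The one-character fix matters because the conditional operator binds more loosely than multiplication, so the old expression multiplied only the else branch. A compact demonstration (values chosen for illustration):

#include <stdio.h>

int main(void)
{
	int has_finobt = 1, maxlevels = 5;

	/* old: parsed as has_finobt ? 2 : (1 * (maxlevels - 1)) */
	int old = has_finobt ? 2 : 1 * (maxlevels - 1);
	/* new: the multiplier applies to both branches */
	int new = (has_finobt ? 2 : 1) * (maxlevels - 1);

	printf("old=%d new=%d\n", old, new);	/* old=2 new=8 */
	return 0;
}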
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index a703cd58a90e..3fe938968348 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -53,9 +53,27 @@ xchk_setup_inode_bmap(
*/
if (S_ISREG(VFS_I(sc->ip)->i_mode) &&
sc->sm->sm_type == XFS_SCRUB_TYPE_BMBTD) {
+ struct address_space *mapping = VFS_I(sc->ip)->i_mapping;
+
inode_dio_wait(VFS_I(sc->ip));
- error = filemap_write_and_wait(VFS_I(sc->ip)->i_mapping);
- if (error)
+
+ /*
+ * Try to flush all incore state to disk before we examine the
+ * space mappings for the data fork. Leave accumulated errors
+ * in the mapping for the writer threads to consume.
+ *
+ * On ENOSPC or EIO writeback errors, we continue into the
+ * extent mapping checks because write failures do not
+ * necessarily imply anything about the correctness of the file
+ * metadata. The metadata and the file data could be on
+ * completely separate devices; a media failure might only
+ * affect a subset of the disk, etc. We can handle delalloc
+ * extents in the scrubber, so leaving them in memory is fine.
+ */
+ error = filemap_fdatawrite(mapping);
+ if (!error)
+ error = filemap_fdatawait_keep_errors(mapping);
+ if (error && (error != -ENOSPC && error != -EIO))
goto out;
}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index 003a772cd26c..2e50d146105d 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -14,8 +14,15 @@
static inline bool
xchk_should_terminate(
struct xfs_scrub *sc,
- int *error)
+ int *error)
{
+ /*
+ * If preemption is disabled, we need to yield to the scheduler every
+ * few seconds so that we don't run afoul of the soft lockup watchdog
+ * or RCU stall detector.
+ */
+ cond_resched();
+
if (fatal_signal_pending(current)) {
if (*error == 0)
*error = -EAGAIN;
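The shape of the fix: every iteration of a long-running scrub loop calls one cheap helper that yields the CPU and checks for a fatal signal. A userspace analogue of the same loop structure, with a SIGINT flag standing in for fatal_signal_pending() and sched_yield() for cond_resched() (illustrative only):

#include <sched.h>
#include <signal.h>
#include <stdio.h>

static volatile sig_atomic_t stop;

static void on_sigint(int sig) { (void)sig; stop = 1; }

/* mirrors xchk_should_terminate(): yield, then check for interruption */
static int should_terminate(int *error)
{
	sched_yield();
	if (stop) {
		if (*error == 0)
			*error = -1;	/* stand-in for -EAGAIN */
		return 1;
	}
	return 0;
}

int main(void)
{
	int error = 0;
	unsigned long i;

	signal(SIGINT, on_sigint);
	for (i = 0; ; i++)		/* runs until Ctrl-C */
		if (should_terminate(&error))
			break;
	printf("stopped after %lu iterations, error=%d\n", i, error);
	return 0;
}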
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 06d07f1e310b..814239fac1b5 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -1771,7 +1771,7 @@ xfs_swap_extents(
if (xfs_inode_has_cow_data(tip)) {
error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
if (error)
- return error;
+ goto out_unlock;
}
/*
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 548344e25128..a2e864c61401 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1243,8 +1243,10 @@ xfs_buf_ioend(
bp->b_ops->verify_read(bp);
}
- if (!bp->b_error)
+ if (!bp->b_error) {
+ bp->b_flags &= ~XBF_WRITE_FAIL;
bp->b_flags |= XBF_DONE;
+ }
if (bp->b_iodone)
(*(bp->b_iodone))(bp);
@@ -1304,7 +1306,7 @@ xfs_bwrite(
bp->b_flags |= XBF_WRITE;
bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
- XBF_WRITE_FAIL | XBF_DONE);
+ XBF_DONE);
error = xfs_buf_submit(bp);
if (error) {
@@ -2040,7 +2042,7 @@ xfs_buf_delwri_submit_buffers(
* synchronously. Otherwise, drop the buffer from the delwri
* queue and submit async.
*/
- bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_WRITE_FAIL);
+ bp->b_flags &= ~_XBF_DELWRI_Q;
bp->b_flags |= XBF_WRITE;
if (wait_list) {
bp->b_flags &= ~XBF_ASYNC;
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index a1af984e4913..59b2b29542f4 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1120,13 +1120,12 @@ xfs_qm_dqflush(
dqb = bp->b_addr + dqp->q_bufoffset;
ddqp = &dqb->dd_diskdq;
- /*
- * A simple sanity check in case we got a corrupted dquot.
- */
- fa = xfs_dqblk_verify(mp, dqb, be32_to_cpu(ddqp->d_id), 0);
+ /* sanity check the in-core structure before we flush */
+ fa = xfs_dquot_verify(mp, &dqp->q_core, be32_to_cpu(dqp->q_core.d_id),
+ 0);
if (fa) {
xfs_alert(mp, "corrupt dquot ID 0x%x in memory at %pS",
- be32_to_cpu(ddqp->d_id), fa);
+ be32_to_cpu(dqp->q_core.d_id), fa);
xfs_buf_relse(bp);
xfs_dqfunlock(dqp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index a76b27565a18..39966f4e3ecb 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -909,7 +909,12 @@ xfs_eofblocks_worker(
{
struct xfs_mount *mp = container_of(to_delayed_work(work),
struct xfs_mount, m_eofblocks_work);
+
+ if (!sb_start_write_trylock(mp->m_super))
+ return;
xfs_icache_free_eofblocks(mp, NULL);
+ sb_end_write(mp->m_super);
+
xfs_queue_eofblocks(mp);
}
@@ -936,7 +941,12 @@ xfs_cowblocks_worker(
{
struct xfs_mount *mp = container_of(to_delayed_work(work),
struct xfs_mount, m_cowblocks_work);
+
+ if (!sb_start_write_trylock(mp->m_super))
+ return;
xfs_icache_free_cowblocks(mp, NULL);
+ sb_end_write(mp->m_super);
+
xfs_queue_cowblocks(mp);
}
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d7dfc13f30f5..8a9719104a8f 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -2184,7 +2184,10 @@ xfs_file_ioctl(
if (error)
return error;
- return xfs_icache_free_eofblocks(mp, &keofb);
+ sb_start_write(mp->m_super);
+ error = xfs_icache_free_eofblocks(mp, &keofb);
+ sb_end_write(mp->m_super);
+ return error;
}
default:
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 2466b0f5b6c4..80c4b21988df 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1579,6 +1579,8 @@ out_free_iclog:
if (iclog->ic_bp)
xfs_buf_free(iclog->ic_bp);
kmem_free(iclog);
+ if (prev_iclog == log->l_iclog)
+ break;
}
spinlock_destroy(&log->l_icloglock);
xfs_buf_free(log->l_xbuf);
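The added check is needed because the iclogs form a circular singly linked list: without comparing against the starting element, the error-path walk would revisit freed memory. The same teardown pattern in isolation (generic names, not the XFS structures):

#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int id;
};

int main(void)
{
	struct node *head = NULL, *prev = NULL, *cur, *next;
	int i;

	/* build a 3-element circular list: 0 -> 1 -> 2 -> 0 */
	for (i = 0; i < 3; i++) {
		cur = malloc(sizeof(*cur));
		cur->id = i;
		cur->next = head ? head : cur;
		if (prev)
			prev->next = cur;
		else
			head = cur;
		prev = cur;
	}

	/* free it: stop once the walk wraps back to the head */
	cur = head;
	do {
		next = cur->next;
		printf("freeing %d\n", cur->id);
		free(cur);
		cur = next;
	} while (cur != head);

	return 0;
}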
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index a7c0c657dfaf..13ca7c16bfc7 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -202,6 +202,9 @@ xfs_fs_rm_xquota(
if (XFS_IS_QUOTA_ON(mp))
return -EINVAL;
+ if (uflags & ~(FS_USER_QUOTA | FS_GROUP_QUOTA | FS_PROJ_QUOTA))
+ return -EINVAL;
+
if (uflags & FS_USER_QUOTA)
flags |= XFS_DQ_USER;
if (uflags & FS_GROUP_QUOTA)
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 680ae7662a78..5919e1714d5f 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1023,6 +1023,7 @@ xfs_reflink_remap_extent(
xfs_filblks_t rlen;
xfs_filblks_t unmap_len;
xfs_off_t newlen;
+ int64_t qres;
int error;
unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
@@ -1045,13 +1046,19 @@ xfs_reflink_remap_extent(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- /* If we're not just clearing space, then do we have enough quota? */
- if (real_extent) {
- error = xfs_trans_reserve_quota_nblks(tp, ip,
- irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_cancel;
- }
+ /*
+ * Reserve quota for this operation. We don't know if the first unmap
+ * in the dest file will cause a bmap btree split, so we always reserve
+ * at least enough blocks for that split. If the extent being mapped
+ * in is written, we need to reserve quota for that too.
+ */
+ qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ if (real_extent)
+ qres += irec->br_blockcount;
+ error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
+ XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto out_cancel;
trace_xfs_reflink_remap(ip, irec->br_startoff,
irec->br_blockcount, irec->br_startblock);
@@ -1071,6 +1078,7 @@ xfs_reflink_remap_extent(
uirec.br_startblock = irec->br_startblock + rlen;
uirec.br_startoff = irec->br_startoff + rlen;
uirec.br_blockcount = unmap_len - rlen;
+ uirec.br_state = irec->br_state;
unmap_len = rlen;
/* If this isn't a real mapping, we're done. */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d3a4e89bf4a0..66f167aefd94 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -520,8 +520,9 @@ xfsaild(
{
struct xfs_ail *ailp = data;
long tout = 0; /* milliseconds */
+ unsigned int noreclaim_flag;
- current->flags |= PF_MEMALLOC;
+ noreclaim_flag = memalloc_noreclaim_save();
set_freezable();
while (1) {
@@ -592,6 +593,7 @@ xfsaild(
tout = xfsaild_push(ailp);
}
+ memalloc_noreclaim_restore(noreclaim_flag);
return 0;
}