aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Makefile1
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/afs/cell.c6
-rw-r--r--fs/afs/dir.c2
-rw-r--r--fs/afs/dynroot.c33
-rw-r--r--fs/afs/flock.c2
-rw-r--r--fs/afs/fs_probe.c4
-rw-r--r--fs/afs/fsclient.c2
-rw-r--r--fs/afs/inode.c1
-rw-r--r--fs/afs/internal.h3
-rw-r--r--fs/afs/misc.c1
-rw-r--r--fs/afs/rxrpc.c9
-rw-r--r--fs/afs/security.c2
-rw-r--r--fs/afs/server.c7
-rw-r--r--fs/afs/super.c2
-rw-r--r--fs/afs/vl_probe.c4
-rw-r--r--fs/afs/vl_rotate.c10
-rw-r--r--fs/afs/volume.c4
-rw-r--r--fs/afs/yfsclient.c3
-rw-r--r--fs/aio.c21
-rw-r--r--fs/attr.c22
-rw-r--r--fs/autofs/waitq.c3
-rw-r--r--fs/binfmt_aout.c2
-rw-r--r--fs/binfmt_elf_fdpic.c12
-rw-r--r--fs/binfmt_flat.c3
-rw-r--r--fs/binfmt_misc.c8
-rw-r--r--fs/block_dev.c9
-rw-r--r--fs/btrfs/backref.c116
-rw-r--r--fs/btrfs/block-group.c1
-rw-r--r--fs/btrfs/block-rsv.c5
-rw-r--r--fs/btrfs/block-rsv.h16
-rw-r--r--fs/btrfs/ctree.c34
-rw-r--r--fs/btrfs/ctree.h4
-rw-r--r--fs/btrfs/delalloc-space.c3
-rw-r--r--fs/btrfs/delayed-inode.c22
-rw-r--r--fs/btrfs/dev-replace.c29
-rw-r--r--fs/btrfs/dir-item.c122
-rw-r--r--fs/btrfs/disk-io.c58
-rw-r--r--fs/btrfs/export.c11
-rw-r--r--fs/btrfs/export.h2
-rw-r--r--fs/btrfs/extent-tree.c19
-rw-r--r--fs/btrfs/extent_io.c7
-rw-r--r--fs/btrfs/file-item.c4
-rw-r--r--fs/btrfs/free-space-cache.c7
-rw-r--r--fs/btrfs/inode.c60
-rw-r--r--fs/btrfs/ioctl.c94
-rw-r--r--fs/btrfs/print-tree.c6
-rw-r--r--fs/btrfs/qgroup.c197
-rw-r--r--fs/btrfs/qgroup.h1
-rw-r--r--fs/btrfs/raid56.c74
-rw-r--r--fs/btrfs/rcu-string.h6
-rw-r--r--fs/btrfs/ref-verify.c6
-rw-r--r--fs/btrfs/relocation.c14
-rw-r--r--fs/btrfs/root-tree.c5
-rw-r--r--fs/btrfs/scrub.c36
-rw-r--r--fs/btrfs/send.c44
-rw-r--r--fs/btrfs/super.c4
-rw-r--r--fs/btrfs/sysfs.c7
-rw-r--r--fs/btrfs/tests/btrfs-tests.c2
-rw-r--r--fs/btrfs/tests/qgroup-tests.c36
-rw-r--r--fs/btrfs/transaction.c17
-rw-r--r--fs/btrfs/tree-checker.c27
-rw-r--r--fs/btrfs/tree-log.c20
-rw-r--r--fs/btrfs/volumes.c107
-rw-r--r--fs/btrfs/volumes.h3
-rw-r--r--fs/btrfs/xattr.c3
-rw-r--r--fs/btrfs/zlib.c2
-rw-r--r--fs/buffer.c4
-rw-r--r--fs/cachefiles/bind.c3
-rw-r--r--fs/ceph/caps.c41
-rw-r--r--fs/ceph/debugfs.c13
-rw-r--r--fs/ceph/file.c10
-rw-r--r--fs/ceph/inode.c4
-rw-r--r--fs/ceph/mds_client.c11
-rw-r--r--fs/ceph/mds_client.h14
-rw-r--r--fs/ceph/snap.c48
-rw-r--r--fs/ceph/super.c14
-rw-r--r--fs/char_dev.c2
-rw-r--r--fs/cifs/cifs_spnego.c4
-rw-r--r--fs/cifs/cifsacl.c14
-rw-r--r--fs/cifs/cifsfs.c22
-rw-r--r--fs/cifs/cifsfs.h5
-rw-r--r--fs/cifs/cifsglob.h76
-rw-r--r--fs/cifs/cifsproto.h12
-rw-r--r--fs/cifs/cifssmb.c9
-rw-r--r--fs/cifs/connect.c28
-rw-r--r--fs/cifs/dfs_cache.c701
-rw-r--r--fs/cifs/dir.c5
-rw-r--r--fs/cifs/file.c24
-rw-r--r--fs/cifs/inode.c8
-rw-r--r--fs/cifs/ioctl.c6
-rw-r--r--fs/cifs/link.c19
-rw-r--r--fs/cifs/misc.c8
-rw-r--r--fs/cifs/smb1ops.c19
-rw-r--r--fs/cifs/smb2inode.c9
-rw-r--r--fs/cifs/smb2misc.c26
-rw-r--r--fs/cifs/smb2ops.c261
-rw-r--r--fs/cifs/smb2pdu.c19
-rw-r--r--fs/cifs/smb2pdu.h2
-rw-r--r--fs/cifs/smb2proto.h2
-rw-r--r--fs/cifs/smbdirect.c19
-rw-r--r--fs/cifs/transport.c27
-rw-r--r--fs/coda/upcall.c2
-rw-r--r--fs/compat_ioctl.c3
-rw-r--r--fs/configfs/dir.c2
-rw-r--r--fs/dcache.c7
-rw-r--r--fs/debugfs/file.c36
-rw-r--r--fs/debugfs/inode.c28
-rw-r--r--fs/dlm/ast.c6
-rw-r--r--fs/dlm/lock.c69
-rw-r--r--fs/dlm/netlink.c2
-rw-r--r--fs/dlm/plock.c185
-rw-r--r--fs/dlm/recover.c39
-rw-r--r--fs/ecryptfs/inode.c8
-rw-r--r--fs/erofs/decompressor.c16
-rw-r--r--fs/erofs/internal.h2
-rw-r--r--fs/erofs/zdata.c4
-rw-r--r--fs/erofs/zmap.c10
-rw-r--r--fs/eventfd.c7
-rw-r--r--fs/eventpoll.c94
-rw-r--r--fs/exec.c1
-rw-r--r--fs/ext2/ext2.h13
-rw-r--r--fs/ext2/super.c42
-rw-r--r--fs/ext2/xattr.c4
-rw-r--r--fs/ext4/acl.h5
-rw-r--r--fs/ext4/balloc.c25
-rw-r--r--fs/ext4/ext4.h12
-rw-r--r--fs/ext4/extents.c23
-rw-r--r--fs/ext4/extents_status.c343
-rw-r--r--fs/ext4/file.c6
-rw-r--r--fs/ext4/fsmap.c10
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/indirect.c17
-rw-r--r--fs/ext4/inline.c32
-rw-r--r--fs/ext4/inode.c129
-rw-r--r--fs/ext4/ioctl.c19
-rw-r--r--fs/ext4/mballoc.c272
-rw-r--r--fs/ext4/mballoc.h14
-rw-r--r--fs/ext4/migrate.c7
-rw-r--r--fs/ext4/move_extent.c6
-rw-r--r--fs/ext4/namei.c77
-rw-r--r--fs/ext4/page-io.c10
-rw-r--r--fs/ext4/resize.c78
-rw-r--r--fs/ext4/super.c97
-rw-r--r--fs/ext4/sysfs.c7
-rw-r--r--fs/ext4/verity.c7
-rw-r--r--fs/ext4/xattr.c319
-rw-r--r--fs/ext4/xattr.h20
-rw-r--r--fs/f2fs/checkpoint.c22
-rw-r--r--fs/f2fs/data.c9
-rw-r--r--fs/f2fs/extent_cache.c6
-rw-r--r--fs/f2fs/f2fs.h4
-rw-r--r--fs/f2fs/file.c16
-rw-r--r--fs/f2fs/gc.c25
-rw-r--r--fs/f2fs/inline.c28
-rw-r--r--fs/f2fs/namei.c2
-rw-r--r--fs/f2fs/node.c10
-rw-r--r--fs/f2fs/recovery.c46
-rw-r--r--fs/f2fs/segment.c2
-rw-r--r--fs/f2fs/super.c15
-rw-r--r--fs/f2fs/verity.c12
-rw-r--r--fs/f2fs/xattr.c6
-rw-r--r--fs/fat/dir.c2
-rw-r--r--fs/fat/nfs.c6
-rw-r--r--fs/fhandle.c2
-rw-r--r--fs/file.c1
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/fs_context.c3
-rw-r--r--fs/fuse/control.c8
-rw-r--r--fs/fuse/cuse.c2
-rw-r--r--fs/fuse/dir.c2
-rw-r--r--fs/fuse/file.c26
-rw-r--r--fs/fuse/fuse_i.h1
-rw-r--r--fs/fuse/inode.c13
-rw-r--r--fs/fuse/readdir.c10
-rw-r--r--fs/gfs2/aops.c7
-rw-r--r--fs/gfs2/bmap.c5
-rw-r--r--fs/gfs2/glops.c6
-rw-r--r--fs/gfs2/lops.c2
-rw-r--r--fs/gfs2/ops_fstype.c17
-rw-r--r--fs/gfs2/quota.c11
-rw-r--r--fs/gfs2/rgrp.c2
-rw-r--r--fs/gfs2/super.c34
-rw-r--r--fs/hfs/bnode.c1
-rw-r--r--fs/hfs/inode.c13
-rw-r--r--fs/hfs/trans.c2
-rw-r--r--fs/hfsplus/hfsplus_fs.h2
-rw-r--r--fs/hfsplus/inode.c36
-rw-r--r--fs/hfsplus/options.c4
-rw-r--r--fs/hfsplus/super.c4
-rw-r--r--fs/hfsplus/unicode.c2
-rw-r--r--fs/hugetlbfs/inode.c12
-rw-r--r--fs/inode.c81
-rw-r--r--fs/internal.h2
-rw-r--r--fs/io_uring.c178
-rw-r--r--fs/iomap/buffered-io.c3
-rw-r--r--fs/isofs/inode.c18
-rw-r--r--fs/isofs/namei.c4
-rw-r--r--fs/jbd2/checkpoint.c137
-rw-r--r--fs/jbd2/commit.c4
-rw-r--r--fs/jbd2/journal.c46
-rw-r--r--fs/jbd2/recovery.c8
-rw-r--r--fs/jbd2/transaction.c51
-rw-r--r--fs/jffs2/build.c5
-rw-r--r--fs/jffs2/erase.c2
-rw-r--r--fs/jffs2/file.c15
-rw-r--r--fs/jffs2/xattr.c13
-rw-r--r--fs/jffs2/xattr.h4
-rw-r--r--fs/jfs/jfs_dmap.c116
-rw-r--r--fs/jfs/jfs_dtree.c7
-rw-r--r--fs/jfs/jfs_extent.c5
-rw-r--r--fs/jfs/jfs_filsys.h2
-rw-r--r--fs/jfs/jfs_imap.c9
-rw-r--r--fs/jfs/jfs_mount.c6
-rw-r--r--fs/jfs/jfs_txnmgr.c5
-rw-r--r--fs/jfs/namei.c5
-rw-r--r--fs/kernfs/dir.c17
-rw-r--r--fs/kernfs/mount.c2
-rw-r--r--fs/libfs.c22
-rw-r--r--fs/lockd/mon.c3
-rw-r--r--fs/locks.c2
-rw-r--r--fs/mbcache.c121
-rw-r--r--fs/namei.c159
-rw-r--r--fs/nfs/blocklayout/blocklayout.c2
-rw-r--r--fs/nfs/blocklayout/dev.c4
-rw-r--r--fs/nfs/filelayout/filelayout.c8
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c1
-rw-r--r--fs/nfs/internal.h4
-rw-r--r--fs/nfs/nfs2xdr.c2
-rw-r--r--fs/nfs/nfs3xdr.c2
-rw-r--r--fs/nfs/nfs4client.c1
-rw-r--r--fs/nfs/nfs4idmap.c46
-rw-r--r--fs/nfs/nfs4proc.c83
-rw-r--r--fs/nfs/nfs4state.c18
-rw-r--r--fs/nfs/nfs4trace.h93
-rw-r--r--fs/nfs/nfs4xdr.c12
-rw-r--r--fs/nfs/nfsroot.c4
-rw-r--r--fs/nfs/pnfs_dev.c2
-rw-r--r--fs/nfs/super.c62
-rw-r--r--fs/nfsd/blocklayoutxdr.c9
-rw-r--r--fs/nfsd/flexfilelayoutxdr.c9
-rw-r--r--fs/nfsd/netns.h6
-rw-r--r--fs/nfsd/nfs3xdr.c4
-rw-r--r--fs/nfsd/nfs4callback.c8
-rw-r--r--fs/nfsd/nfs4layouts.c4
-rw-r--r--fs/nfsd/nfs4proc.c6
-rw-r--r--fs/nfsd/nfs4recover.c4
-rw-r--r--fs/nfsd/nfs4state.c72
-rw-r--r--fs/nfsd/nfs4xdr.c51
-rw-r--r--fs/nfsd/nfscache.c57
-rw-r--r--fs/nfsd/nfsctl.c16
-rw-r--r--fs/nfsd/nfsd.h3
-rw-r--r--fs/nfsd/nfssvc.c37
-rw-r--r--fs/nfsd/trace.h65
-rw-r--r--fs/nfsd/vfs.c12
-rw-r--r--fs/nilfs2/alloc.c41
-rw-r--r--fs/nilfs2/bmap.c16
-rw-r--r--fs/nilfs2/btnode.c12
-rw-r--r--fs/nilfs2/btree.c66
-rw-r--r--fs/nilfs2/cpfile.c10
-rw-r--r--fs/nilfs2/dat.c48
-rw-r--r--fs/nilfs2/dir.c2
-rw-r--r--fs/nilfs2/direct.c23
-rw-r--r--fs/nilfs2/file.c8
-rw-r--r--fs/nilfs2/gcinode.c8
-rw-r--r--fs/nilfs2/ifile.c4
-rw-r--r--fs/nilfs2/inode.c84
-rw-r--r--fs/nilfs2/ioctl.c46
-rw-r--r--fs/nilfs2/mdt.c2
-rw-r--r--fs/nilfs2/namei.c6
-rw-r--r--fs/nilfs2/nilfs.h9
-rw-r--r--fs/nilfs2/page.c21
-rw-r--r--fs/nilfs2/recovery.c39
-rw-r--r--fs/nilfs2/segbuf.c8
-rw-r--r--fs/nilfs2/segment.c124
-rw-r--r--fs/nilfs2/sufile.c80
-rw-r--r--fs/nilfs2/super.c93
-rw-r--r--fs/nilfs2/sysfs.c29
-rw-r--r--fs/nilfs2/the_nilfs.c188
-rw-r--r--fs/nilfs2/the_nilfs.h2
-rw-r--r--fs/nls/nls_base.c4
-rw-r--r--fs/notify/fanotify/fanotify_user.c22
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c11
-rw-r--r--fs/ntfs/attrib.c28
-rw-r--r--fs/ntfs/inode.c7
-rw-r--r--fs/ntfs/super.c3
-rw-r--r--fs/ocfs2/alloc.c4
-rw-r--r--fs/ocfs2/aops.c18
-rw-r--r--fs/ocfs2/dir.c14
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h4
-rw-r--r--fs/ocfs2/dlmglue.c8
-rw-r--r--fs/ocfs2/extent_map.c4
-rw-r--r--fs/ocfs2/file.c8
-rw-r--r--fs/ocfs2/journal.c2
-rw-r--r--fs/ocfs2/journal.h1
-rw-r--r--fs/ocfs2/move_extents.c34
-rw-r--r--fs/ocfs2/namei.c30
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/ocfs2/stackglue.c8
-rw-r--r--fs/ocfs2/super.c114
-rw-r--r--fs/ocfs2/xattr.c2
-rw-r--r--fs/omfs/file.c2
-rw-r--r--fs/open.c38
-rw-r--r--fs/orangefs/orangefs-debugfs.c29
-rw-r--r--fs/orangefs/orangefs-mod.c8
-rw-r--r--fs/overlayfs/copy_up.c4
-rw-r--r--fs/overlayfs/dir.c46
-rw-r--r--fs/overlayfs/export.c2
-rw-r--r--fs/overlayfs/namei.c24
-rw-r--r--fs/overlayfs/ovl_entry.h9
-rw-r--r--fs/overlayfs/super.c5
-rw-r--r--fs/pnode.c2
-rw-r--r--fs/proc/base.c3
-rw-r--r--fs/proc/proc_sysctl.c33
-rw-r--r--fs/proc/task_mmu.c6
-rw-r--r--fs/pstore/Kconfig1
-rw-r--r--fs/pstore/ram.c3
-rw-r--r--fs/pstore/ram_core.c12
-rw-r--r--fs/quota/dquot.c563
-rw-r--r--fs/quota/quota_tree.c38
-rw-r--r--fs/reiserfs/journal.c4
-rw-r--r--fs/reiserfs/namei.c4
-rw-r--r--fs/reiserfs/super.c6
-rw-r--r--fs/reiserfs/xattr_security.c10
-rw-r--r--fs/select.c2
-rw-r--r--fs/signalfd.c1
-rw-r--r--fs/squashfs/squashfs_fs.h2
-rw-r--r--fs/squashfs/squashfs_fs_sb.h2
-rw-r--r--fs/squashfs/xattr.h4
-rw-r--r--fs/squashfs/xattr_id.c2
-rw-r--r--fs/statfs.c4
-rw-r--r--fs/super.c11
-rw-r--r--fs/sync.c3
-rw-r--r--fs/sysfs/file.c2
-rw-r--r--fs/sysv/itree.c16
-rw-r--r--fs/tracefs/inode.c34
-rw-r--r--fs/ubifs/budget.c9
-rw-r--r--fs/ubifs/commit.c6
-rw-r--r--fs/ubifs/dir.c14
-rw-r--r--fs/ubifs/file.c29
-rw-r--r--fs/ubifs/journal.c4
-rw-r--r--fs/ubifs/lpt.c2
-rw-r--r--fs/ubifs/super.c17
-rw-r--r--fs/ubifs/tnc.c158
-rw-r--r--fs/ubifs/tnc_misc.c4
-rw-r--r--fs/ubifs/ubifs.h5
-rw-r--r--fs/udf/balloc.c33
-rw-r--r--fs/udf/file.c26
-rw-r--r--fs/udf/inode.c195
-rw-r--r--fs/udf/namei.c10
-rw-r--r--fs/udf/super.c1
-rw-r--r--fs/udf/truncate.c48
-rw-r--r--fs/udf/udf_i.h3
-rw-r--r--fs/udf/udf_sb.h2
-rw-r--r--fs/udf/unicode.c2
-rw-r--r--fs/userfaultfd.c4
-rw-r--r--fs/verity/enable.c24
-rw-r--r--fs/verity/signature.c16
-rw-r--r--fs/verity/verify.c12
-rw-r--r--fs/xattr.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c27
-rw-r--r--fs/xfs/libxfs/xfs_attr.c2
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c37
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h26
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c85
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.h2
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c58
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h18
-rw-r--r--fs/xfs/libxfs/xfs_btree.c35
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c24
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h17
-rw-r--r--fs/xfs/libxfs/xfs_da_format.c1
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h59
-rw-r--r--fs/xfs/libxfs/xfs_defer.c368
-rw-r--r--fs/xfs/libxfs/xfs_defer.h49
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c4
-rw-r--r--fs/xfs/libxfs/xfs_dir2.h2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c33
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c32
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c4
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c14
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h19
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c28
-rw-r--r--fs/xfs/libxfs/xfs_dquot_buf.c8
-rw-r--r--fs/xfs/libxfs/xfs_format.h50
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c70
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h1
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c54
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h8
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c10
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h9
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h10
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c4
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c6
-rw-r--r--fs/xfs/libxfs/xfs_shared.h1
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c8
-rw-r--r--fs/xfs/xfs_acl.c27
-rw-r--r--fs/xfs/xfs_aops.c4
-rw-r--r--fs/xfs/xfs_attr_inactive.c156
-rw-r--r--fs/xfs/xfs_attr_list.c5
-rw-r--r--fs/xfs/xfs_bmap_item.c243
-rw-r--r--fs/xfs/xfs_bmap_item.h3
-rw-r--r--fs/xfs/xfs_bmap_util.c105
-rw-r--r--fs/xfs/xfs_buf.c22
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_buf_item.c4
-rw-r--r--fs/xfs/xfs_dquot.c90
-rw-r--r--fs/xfs/xfs_dquot.h98
-rw-r--r--fs/xfs/xfs_dquot_item.c47
-rw-r--r--fs/xfs/xfs_dquot_item.h35
-rw-r--r--fs/xfs/xfs_error.c32
-rw-r--r--fs/xfs/xfs_error.h7
-rw-r--r--fs/xfs/xfs_export.c14
-rw-r--r--fs/xfs/xfs_extent_busy.c14
-rw-r--r--fs/xfs/xfs_extfree_item.c180
-rw-r--r--fs/xfs/xfs_extfree_item.h18
-rw-r--r--fs/xfs/xfs_file.c53
-rw-r--r--fs/xfs/xfs_fsmap.c1
-rw-r--r--fs/xfs/xfs_icache.c8
-rw-r--r--fs/xfs/xfs_icreate_item.c1
-rw-r--r--fs/xfs/xfs_inode.c153
-rw-r--r--fs/xfs/xfs_inode.h22
-rw-r--r--fs/xfs/xfs_inode_item.c55
-rw-r--r--fs/xfs/xfs_inode_item.h4
-rw-r--r--fs/xfs/xfs_ioctl.c22
-rw-r--r--fs/xfs/xfs_iomap.c17
-rw-r--r--fs/xfs/xfs_iops.c21
-rw-r--r--fs/xfs/xfs_itable.c8
-rw-r--r--fs/xfs/xfs_iwalk.c27
-rw-r--r--fs/xfs/xfs_linux.h32
-rw-r--r--fs/xfs/xfs_log.c94
-rw-r--r--fs/xfs/xfs_log.h3
-rw-r--r--fs/xfs/xfs_log_cil.c47
-rw-r--r--fs/xfs/xfs_log_priv.h53
-rw-r--r--fs/xfs/xfs_log_recover.c253
-rw-r--r--fs/xfs/xfs_message.c2
-rw-r--r--fs/xfs/xfs_message.h2
-rw-r--r--fs/xfs/xfs_mount.c261
-rw-r--r--fs/xfs/xfs_mount.h5
-rw-r--r--fs/xfs/xfs_pnfs.c4
-rw-r--r--fs/xfs/xfs_qm.c121
-rw-r--r--fs/xfs/xfs_qm_bhv.c8
-rw-r--r--fs/xfs/xfs_qm_syscalls.c142
-rw-r--r--fs/xfs/xfs_quota.h4
-rw-r--r--fs/xfs/xfs_refcount_item.c178
-rw-r--r--fs/xfs/xfs_refcount_item.h3
-rw-r--r--fs/xfs/xfs_reflink.c247
-rw-r--r--fs/xfs/xfs_rmap_item.c170
-rw-r--r--fs/xfs/xfs_rmap_item.h3
-rw-r--r--fs/xfs/xfs_stats.c14
-rw-r--r--fs/xfs/xfs_stats.h1
-rw-r--r--fs/xfs/xfs_super.c98
-rw-r--r--fs/xfs/xfs_symlink.c7
-rw-r--r--fs/xfs/xfs_trace.h75
-rw-r--r--fs/xfs/xfs_trans.c182
-rw-r--r--fs/xfs/xfs_trans.h10
-rw-r--r--fs/xfs/xfs_trans_ail.c104
-rw-r--r--fs/xfs/xfs_trans_dquot.c73
-rw-r--r--fs/xfs/xfs_trans_priv.h6
460 files changed, 9253 insertions, 5705 deletions
diff --git a/fs/Makefile b/fs/Makefile
index 7876ba238244..97375ba820dd 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -131,6 +131,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs/
obj-$(CONFIG_CEPH_FS) += ceph/
obj-$(CONFIG_PSTORE) += pstore/
obj-$(CONFIG_EFIVAR_FS) += efivarfs/
+obj-$(CONFIG_YAFFS_FS) += yaffs2/
obj-$(CONFIG_EROFS_FS) += erofs/
obj-$(CONFIG_YAFFS_FS) += yaffs2/
obj-$(CONFIG_AUFS_FS) += aufs/
diff --git a/fs/affs/file.c b/fs/affs/file.c
index ba084b0b214b..82bb38370aa9 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -878,7 +878,7 @@ affs_truncate(struct inode *inode)
if (inode->i_size > AFFS_I(inode)->mmu_private) {
struct address_space *mapping = inode->i_mapping;
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
loff_t isize = inode->i_size;
int res;
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 296b489861a9..1522fadd8d2d 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -404,10 +404,12 @@ static int afs_update_cell(struct afs_cell *cell)
if (ret == -ENOMEM)
goto out_wake;
- ret = -ENOMEM;
vllist = afs_alloc_vlserver_list(0);
- if (!vllist)
+ if (!vllist) {
+ if (ret >= 0)
+ ret = -ENOMEM;
goto out_wake;
+ }
switch (ret) {
case -ENODATA:
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 3a355a209919..43f5b972fcea 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -1023,7 +1023,7 @@ static int afs_d_revalidate_rcu(struct dentry *dentry)
static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
{
struct afs_vnode *vnode, *dir;
- struct afs_fid uninitialized_var(fid);
+ struct afs_fid fid;
struct dentry *parent;
struct inode *inode;
struct key *key;
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index f07e53ab808e..d06994990fc3 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -20,6 +20,7 @@ static int afs_probe_cell_name(struct dentry *dentry)
struct afs_net *net = afs_d2net(dentry);
const char *name = dentry->d_name.name;
size_t len = dentry->d_name.len;
+ char *result = NULL;
int ret;
/* Names prefixed with a dot are R/W mounts. */
@@ -37,9 +38,22 @@ static int afs_probe_cell_name(struct dentry *dentry)
}
ret = dns_query(net->net, "afsdb", name, len, "srv=1",
- NULL, NULL, false);
- if (ret == -ENODATA)
- ret = -EDESTADDRREQ;
+ &result, NULL, false);
+ if (ret == -ENODATA || ret == -ENOKEY || ret == 0)
+ ret = -ENOENT;
+ if (ret > 0 && ret >= sizeof(struct dns_server_list_v1_header)) {
+ struct dns_server_list_v1_header *v1 = (void *)result;
+
+ if (v1->hdr.zero == 0 &&
+ v1->hdr.content == DNS_PAYLOAD_IS_SERVER_LIST &&
+ v1->hdr.version == 1 &&
+ (v1->status != DNS_LOOKUP_GOOD &&
+ v1->status != DNS_LOOKUP_GOOD_WITH_BAD))
+ return -ENOENT;
+
+ }
+
+ kfree(result);
return ret;
}
@@ -163,20 +177,9 @@ static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags)
return 1;
}
-/*
- * Allow the VFS to enquire as to whether a dentry should be unhashed (mustn't
- * sleep)
- * - called from dput() when d_count is going to 0.
- * - return 1 to request dentry be unhashed, 0 otherwise
- */
-static int afs_dynroot_d_delete(const struct dentry *dentry)
-{
- return d_really_is_positive(dentry);
-}
-
const struct dentry_operations afs_dynroot_dentry_operations = {
.d_revalidate = afs_dynroot_d_revalidate,
- .d_delete = afs_dynroot_d_delete,
+ .d_delete = always_delete_dentry,
.d_release = afs_d_release,
.d_automount = afs_d_automount,
};
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index d5e5a6ddc847..0fa05998a24d 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -75,7 +75,7 @@ void afs_lock_op_done(struct afs_call *call)
if (call->error == 0) {
spin_lock(&vnode->lock);
trace_afs_flock_ev(vnode, NULL, afs_flock_timestamp, 0);
- vnode->locked_at = call->reply_time;
+ vnode->locked_at = call->issue_time;
afs_schedule_lock_extension(vnode);
spin_unlock(&vnode->lock);
}
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index 51ee3dd79700..e9282e025912 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -92,8 +92,8 @@ responded:
}
}
- if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
- rtt_us < server->probe.rtt) {
+ rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
+ if (rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
alist->preferred = index;
have_result = true;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 5c2729fc07e5..254580b1dc74 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -136,7 +136,7 @@ bad:
static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
{
- return ktime_divns(call->reply_time, NSEC_PER_SEC) + expiry;
+ return ktime_divns(call->issue_time, NSEC_PER_SEC) + expiry;
}
static void xdr_decode_AFSCallBack(const __be32 **_bp,
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 622363af4c1b..fd681eec49aa 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -227,6 +227,7 @@ static void afs_apply_status(struct afs_fs_cursor *fc,
set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
}
change_size = true;
+ data_changed = true;
} else if (vnode->status.type == AFS_FTYPE_DIR) {
/* Expected directory change is handled elsewhere so
* that we can locally edit the directory and save on a
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index c3ad582f9fd0..8d6582713fe7 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -159,7 +159,6 @@ struct afs_call {
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool upgrade; /* T to request service upgrade */
- bool have_reply_time; /* T if have got reply_time */
bool intr; /* T if interruptible */
bool unmarshalling_error; /* T if an unmarshalling error occurred */
u16 service_id; /* Actual service ID (after upgrade) */
@@ -173,7 +172,7 @@ struct afs_call {
} __attribute__((packed));
__be64 tmp64;
};
- ktime_t reply_time; /* Time of first reply packet */
+ ktime_t issue_time; /* Time of issue of operation */
};
struct afs_call_type {
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 5334f1bd2bca..5171d6d99031 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -69,6 +69,7 @@ int afs_abort_to_error(u32 abort_code)
/* Unified AFS error table */
case UAEPERM: return -EPERM;
case UAENOENT: return -ENOENT;
+ case UAEAGAIN: return -EAGAIN;
case UAEACCES: return -EACCES;
case UAEBUSY: return -EBUSY;
case UAEEXIST: return -EEXIST;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 6adab30a8399..30952f073fb4 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -428,6 +428,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
if (call->max_lifespan)
rxrpc_kernel_set_max_life(call->net->socket, rxcall,
call->max_lifespan);
+ call->issue_time = ktime_get_real();
/* send the request */
iov[0].iov_base = call->request;
@@ -489,7 +490,7 @@ error_kill_call:
if (call->async) {
if (cancel_work_sync(&call->async_work))
afs_put_call(call);
- afs_put_call(call);
+ afs_set_call_complete(call, ret, 0);
}
ac->error = ret;
@@ -532,12 +533,6 @@ static void afs_deliver_to_call(struct afs_call *call)
return;
}
- if (!call->have_reply_time &&
- rxrpc_kernel_get_reply_time(call->net->socket,
- call->rxcall,
- &call->reply_time))
- call->have_reply_time = true;
-
ret = call->type->deliver(call);
state = READ_ONCE(call->state);
if (ret == 0 && call->unmarshalling_error)
diff --git a/fs/afs/security.c b/fs/afs/security.c
index ce9de1e6742b..207a54ce4540 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -401,7 +401,7 @@ int afs_check_permit(struct afs_vnode *vnode, struct key *key,
int afs_permission(struct inode *inode, int mask)
{
struct afs_vnode *vnode = AFS_FS_I(inode);
- afs_access_t uninitialized_var(access);
+ afs_access_t access;
struct key *key;
int ret = 0;
diff --git a/fs/afs/server.c b/fs/afs/server.c
index d3a9288f7556..44985ca6602e 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -35,7 +35,7 @@ struct afs_server *afs_find_server(struct afs_net *net,
const struct afs_addr_list *alist;
struct afs_server *server = NULL;
unsigned int i;
- int seq = 0, diff;
+ int seq = 1, diff;
rcu_read_lock();
@@ -43,6 +43,7 @@ struct afs_server *afs_find_server(struct afs_net *net,
if (server)
afs_put_server(net, server, afs_server_trace_put_find_rsq);
server = NULL;
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
if (srx->transport.family == AF_INET6) {
@@ -98,7 +99,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
{
struct afs_server *server = NULL;
struct rb_node *p;
- int diff, seq = 0;
+ int diff, seq = 1;
_enter("%pU", uuid);
@@ -110,7 +111,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
if (server)
afs_put_server(net, server, afs_server_trace_put_uuid_rsq);
server = NULL;
-
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&net->fs_lock, &seq);
p = net->fs_servers.rb_node;
diff --git a/fs/afs/super.c b/fs/afs/super.c
index eb04dcc54328..554119068ea4 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -391,6 +391,8 @@ static int afs_validate_fc(struct fs_context *fc)
return PTR_ERR(volume);
ctx->volume = volume;
+ if (volume->type != AFSVL_RWVOL)
+ ctx->flock_mode = afs_flock_mode_local;
}
return 0;
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
index 081b7e5b13f5..163069d9fc16 100644
--- a/fs/afs/vl_probe.c
+++ b/fs/afs/vl_probe.c
@@ -92,8 +92,8 @@ responded:
}
}
- if (rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us) &&
- rtt_us < server->probe.rtt) {
+ rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
+ if (rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
alist->preferred = index;
have_result = true;
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index 9a5ce9687779..370c27cae2e6 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -58,6 +58,12 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
}
/* Status load is ordered after lookup counter load */
+ if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) {
+ pr_warn("No record of cell %s\n", cell->name);
+ vc->error = -ENOENT;
+ return false;
+ }
+
if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
vc->error = -EDESTADDRREQ;
return false;
@@ -276,6 +282,7 @@ failed:
*/
static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
{
+ struct afs_cell *cell = vc->cell;
static int count;
int i;
@@ -285,6 +292,9 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
rcu_read_lock();
pr_notice("EDESTADDR occurred\n");
+ pr_notice("CELL: %s err=%d\n", cell->name, cell->error);
+ pr_notice("DNS: src=%u st=%u lc=%x\n",
+ cell->dns_source, cell->dns_status, cell->dns_lookup_count);
pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n",
vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error);
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 4310336b9bb8..4225bc766d46 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -221,7 +221,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
{
struct afs_server_list *new, *old, *discard;
struct afs_vldb_entry *vldb;
- char idbuf[16];
+ char idbuf[24];
int ret, idsz;
_enter("");
@@ -229,7 +229,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
/* We look up an ID by passing it as a decimal string in the
* operation's name parameter.
*/
- idsz = sprintf(idbuf, "%llu", volume->vid);
+ idsz = snprintf(idbuf, sizeof(idbuf), "%llu", volume->vid);
vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz);
if (IS_ERR(vldb)) {
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 3b19b009452a..fa85b359f325 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -241,8 +241,7 @@ static void xdr_decode_YFSCallBack(const __be32 **_bp,
struct afs_callback *cb = &scb->callback;
ktime_t cb_expiry;
- cb_expiry = call->reply_time;
- cb_expiry = ktime_add(cb_expiry, xdr_to_u64(x->expiration_time) * 100);
+ cb_expiry = ktime_add(call->issue_time, xdr_to_u64(x->expiration_time) * 100);
cb->expires_at = ktime_divns(cb_expiry, NSEC_PER_SEC);
scb->have_cb = true;
*_bp += xdr_size(x);
diff --git a/fs/aio.c b/fs/aio.c
index fb92c32a6f1e..e8013c7ede27 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -336,6 +336,9 @@ static int aio_ring_mremap(struct vm_area_struct *vma)
spin_lock(&mm->ioctx_lock);
rcu_read_lock();
table = rcu_dereference(mm->ioctx_table);
+ if (!table)
+ goto out_unlock;
+
for (i = 0; i < table->nr; i++) {
struct kioctx *ctx;
@@ -349,6 +352,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma)
}
}
+out_unlock:
rcu_read_unlock();
spin_unlock(&mm->ioctx_lock);
return res;
@@ -562,13 +566,24 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
- struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
- struct kioctx *ctx = req->ki_ctx;
+ struct aio_kiocb *req;
+ struct kioctx *ctx;
unsigned long flags;
+ /*
+ * kiocb didn't come from aio or is neither a read nor a write, hence
+ * ignore it.
+ */
+ if (!(iocb->ki_flags & IOCB_AIO_RW))
+ return;
+
+ req = container_of(iocb, struct aio_kiocb, rw);
+
if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
return;
+ ctx = req->ki_ctx;
+
spin_lock_irqsave(&ctx->ctx_lock, flags);
list_add_tail(&req->ki_list, &ctx->active_reqs);
req->ki_cancel = cancel;
@@ -1451,7 +1466,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
- req->ki_flags = iocb_flags(req->ki_filp);
+ req->ki_flags = iocb_flags(req->ki_filp) | IOCB_AIO_RW;
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp));
diff --git a/fs/attr.c b/fs/attr.c
index b4bbdbd4c8ca..95bbd49f75c8 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -134,6 +134,8 @@ EXPORT_SYMBOL(setattr_prepare);
*/
int inode_newsize_ok(const struct inode *inode, loff_t offset)
{
+ if (offset < 0)
+ return -EINVAL;
if (inode->i_size < offset) {
unsigned long limit;
@@ -251,9 +253,25 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
}
if ((ia_valid & ATTR_MODE)) {
- umode_t amode = attr->ia_mode;
+ /*
+ * Don't allow changing the mode of symlinks:
+ *
+ * (1) The vfs doesn't take the mode of symlinks into account
+ * during permission checking.
+ * (2) This has never worked correctly. Most major filesystems
+ * did return EOPNOTSUPP due to interactions with POSIX ACLs
+ * but did still updated the mode of the symlink.
+ * This inconsistency led system call wrapper providers such
+ * as libc to block changing the mode of symlinks with
+ * EOPNOTSUPP already.
+ * (3) To even do this in the first place one would have to use
+ * specific file descriptors and quite some effort.
+ */
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
/* Flag setting protected by i_mutex */
- if (is_sxid(amode))
+ if (is_sxid(attr->ia_mode))
inode->i_flags &= ~S_NOSEC;
}
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index b04c528b19d3..1230bdf32989 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -32,8 +32,9 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi)
wq->status = -ENOENT; /* Magic is gone - report failure */
kfree(wq->name.name);
wq->name.name = NULL;
- wq->wait_ctr--;
wake_up_interruptible(&wq->queue);
+ if (!--wq->wait_ctr)
+ kfree(wq);
wq = nwq;
}
fput(sbi->pipe); /* Close the pipe */
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 8e8346a81723..ace587b66904 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -162,6 +162,7 @@ static int load_aout_binary(struct linux_binprm * bprm)
set_personality(PER_LINUX);
#endif
setup_new_exec(bprm);
+ install_exec_creds(bprm);
current->mm->end_code = ex.a_text +
(current->mm->start_code = N_TXTADDR(ex));
@@ -174,7 +175,6 @@ static int load_aout_binary(struct linux_binprm * bprm)
if (retval < 0)
return retval;
- install_exec_creds(bprm);
if (N_MAGIC(ex) == OMAGIC) {
unsigned long text_addr, map_size;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d86ebd0dcc3d..28aef31a6e6f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -345,14 +345,14 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
/* there's now no turning back... the old userspace image is dead,
* defunct, deceased, etc.
*/
+ SET_PERSONALITY(exec_params.hdr);
if (elf_check_fdpic(&exec_params.hdr))
- set_personality(PER_LINUX_FDPIC);
- else
- set_personality(PER_LINUX);
+ current->personality |= PER_LINUX_FDPIC;
if (elf_read_implies_exec(&exec_params.hdr, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
setup_new_exec(bprm);
+ install_exec_creds(bprm);
set_binfmt(&elf_fdpic_format);
@@ -434,9 +434,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
current->mm->start_stack = current->mm->start_brk + stack_size;
#endif
- install_exec_creds(bprm);
- if (create_elf_fdpic_tables(bprm, current->mm,
- &exec_params, &interp_params) < 0)
+ retval = create_elf_fdpic_tables(bprm, current->mm, &exec_params,
+ &interp_params);
+ if (retval < 0)
goto error;
kdebug("- start_code %lx", current->mm->start_code);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index c999bc0c0691..22a7d7547a91 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -565,6 +565,7 @@ static int load_flat_file(struct linux_binprm *bprm,
/* OK, This is the point of no return */
set_personality(PER_LINUX_32BIT);
setup_new_exec(bprm);
+ install_exec_creds(bprm);
}
/*
@@ -992,8 +993,6 @@ static int load_flat_binary(struct linux_binprm *bprm)
}
}
- install_exec_creds(bprm);
-
set_binfmt(&flat_format);
#ifdef CONFIG_MMU
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 056a68292e15..23b563ff0dd7 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -44,10 +44,10 @@ static LIST_HEAD(entries);
static int enabled = 1;
enum {Enabled, Magic};
-#define MISC_FMT_PRESERVE_ARGV0 (1 << 31)
-#define MISC_FMT_OPEN_BINARY (1 << 30)
-#define MISC_FMT_CREDENTIALS (1 << 29)
-#define MISC_FMT_OPEN_FILE (1 << 28)
+#define MISC_FMT_PRESERVE_ARGV0 (1UL << 31)
+#define MISC_FMT_OPEN_BINARY (1UL << 30)
+#define MISC_FMT_CREDENTIALS (1UL << 29)
+#define MISC_FMT_OPEN_FILE (1UL << 28)
typedef struct {
struct list_head list;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index fa329c7eddf0..e528ad860143 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -2114,21 +2114,26 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
if ((start | len) & (bdev_logical_block_size(bdev) - 1))
return -EINVAL;
- /* Invalidate the page cache, including dirty pages. */
+ /*
+ * Invalidate the page cache, including dirty pages, for valid
+ * de-allocate mode calls to fallocate().
+ */
mapping = bdev->bd_inode->i_mapping;
- truncate_inode_pages_range(mapping, start, end);
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
+ truncate_inode_pages_range(mapping, start, end);
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
+ truncate_inode_pages_range(mapping, start, end);
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
+ truncate_inode_pages_range(mapping, start, end);
error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
GFP_KERNEL, 0);
break;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index c701a19fac53..16e5e07cb5d1 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -136,6 +136,7 @@ struct share_check {
u64 root_objectid;
u64 inum;
int share_count;
+ bool have_delayed_delete_refs;
};
static inline int extent_is_shared(struct share_check *sc)
@@ -286,8 +287,10 @@ static void prelim_release(struct preftree *preftree)
struct prelim_ref *ref, *next_ref;
rbtree_postorder_for_each_entry_safe(ref, next_ref,
- &preftree->root.rb_root, rbnode)
+ &preftree->root.rb_root, rbnode) {
+ free_inode_elem_list(ref->inode_list);
free_pref(ref);
+ }
preftree->root = RB_ROOT_CACHED;
preftree->count = 0;
@@ -428,6 +431,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
u64 wanted_disk_byte = ref->wanted_disk_byte;
u64 count = 0;
u64 data_offset;
+ u8 type;
if (level != 0) {
eb = path->nodes[level];
@@ -482,6 +486,9 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
continue;
}
fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+ type = btrfs_file_extent_type(eb, fi);
+ if (type == BTRFS_FILE_EXTENT_INLINE)
+ goto next;
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
data_offset = btrfs_file_extent_offset(eb, fi);
@@ -641,6 +648,18 @@ unode_aux_to_inode_list(struct ulist_node *node)
return (struct extent_inode_elem *)(uintptr_t)node->aux;
}
+static void free_leaf_list(struct ulist *ulist)
+{
+ struct ulist_node *node;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(ulist, &uiter)))
+ free_inode_elem_list(unode_aux_to_inode_list(node));
+
+ ulist_free(ulist);
+}
+
/*
* We maintain three separate rbtrees: one for direct refs, one for
* indirect refs which have a key, and one for indirect refs which do not
@@ -755,7 +774,11 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
cond_resched();
}
out:
- ulist_free(parents);
+ /*
+ * We may have inode lists attached to refs in the parents ulist, so we
+ * must free them before freeing the ulist and its refs.
+ */
+ free_leaf_list(parents);
return ret;
}
@@ -812,16 +835,11 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
struct preftrees *preftrees, struct share_check *sc)
{
struct btrfs_delayed_ref_node *node;
- struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct btrfs_key key;
- struct btrfs_key tmp_op_key;
struct rb_node *n;
int count;
int ret = 0;
- if (extent_op && extent_op->update_key)
- btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
-
spin_lock(&head->lock);
for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
node = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -847,10 +865,16 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_key *key_ptr = NULL;
+
+ if (head->extent_op && head->extent_op->update_key) {
+ btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
+ key_ptr = &key;
+ }
ref = btrfs_delayed_node_to_tree_ref(node);
ret = add_indirect_ref(fs_info, preftrees, ref->root,
- &tmp_op_key, ref->level + 1,
+ key_ptr, ref->level + 1,
node->bytenr, count, sc,
GFP_ATOMIC);
break;
@@ -876,13 +900,22 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
key.offset = ref->offset;
/*
- * Found a inum that doesn't match our known inum, we
- * know it's shared.
+ * If we have a share check context and a reference for
+ * another inode, we can't exit immediately. This is
+ * because even if this is a BTRFS_ADD_DELAYED_REF
+ * reference we may find next a BTRFS_DROP_DELAYED_REF
+ * which cancels out this ADD reference.
+ *
+ * If this is a DROP reference and there was no previous
+ * ADD reference, then we need to signal that when we
+ * process references from the extent tree (through
+ * add_inline_refs() and add_keyed_refs()), we should
+ * not exit early if we find a reference for another
+ * inode, because one of the delayed DROP references
+ * may cancel that reference in the extent tree.
*/
- if (sc && sc->inum && ref->objectid != sc->inum) {
- ret = BACKREF_FOUND_SHARED;
- goto out;
- }
+ if (sc && count < 0)
+ sc->have_delayed_delete_refs = true;
ret = add_indirect_ref(fs_info, preftrees, ref->root,
&key, 0, node->bytenr, count, sc,
@@ -912,7 +945,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
}
if (!ret)
ret = extent_is_shared(sc);
-out:
+
spin_unlock(&head->lock);
return ret;
}
@@ -1015,7 +1048,8 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -1025,6 +1059,7 @@ static int add_inline_refs(const struct btrfs_fs_info *fs_info,
ret = add_indirect_ref(fs_info, preftrees, root,
&key, 0, bytenr, count,
sc, GFP_NOFS);
+
break;
}
default:
@@ -1114,7 +1149,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -1353,6 +1389,12 @@ again:
if (ret < 0)
goto out;
ref->inode_list = eie;
+ /*
+ * We transferred the list ownership to the ref,
+ * so set to NULL to avoid a double free in case
+ * an error happens after this.
+ */
+ eie = NULL;
}
ret = ulist_add_merge_ptr(refs, ref->parent,
ref->inode_list,
@@ -1378,6 +1420,14 @@ again:
eie->next = ref->inode_list;
}
eie = NULL;
+ /*
+ * We have transferred the inode list ownership from
+ * this ref to the ref we added to the 'refs' ulist.
+ * So set this ref's inode list to NULL to avoid
+ * use-after-free when our caller uses it or double
+ * frees in case an error happens before we return.
+ */
+ ref->inode_list = NULL;
}
cond_resched();
}
@@ -1394,24 +1444,6 @@ out:
return ret;
}
-static void free_leaf_list(struct ulist *blocks)
-{
- struct ulist_node *node = NULL;
- struct extent_inode_elem *eie;
- struct ulist_iterator uiter;
-
- ULIST_ITER_INIT(&uiter);
- while ((node = ulist_next(blocks, &uiter))) {
- if (!node->aux)
- continue;
- eie = unode_aux_to_inode_list(node);
- free_inode_elem_list(eie);
- node->aux = 0;
- }
-
- ulist_free(blocks);
-}
-
/*
* Finds all leafs with a reference to the specified combination of bytenr and
* offset. key_list_head will point to a list of corresponding keys (caller must
@@ -1537,6 +1569,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
.root_objectid = root->root_key.objectid,
.inum = inum,
.share_count = 0,
+ .have_delayed_delete_refs = false,
};
ulist_init(roots);
@@ -1571,6 +1604,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
break;
bytenr = node->val;
shared.share_count = 0;
+ shared.have_delayed_delete_refs = false;
cond_resched();
}
@@ -2257,20 +2291,14 @@ struct btrfs_data_container *init_data_container(u32 total_bytes)
size_t alloc_bytes;
alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
- data = kvmalloc(alloc_bytes, GFP_KERNEL);
+ data = kvzalloc(alloc_bytes, GFP_KERNEL);
if (!data)
return ERR_PTR(-ENOMEM);
- if (total_bytes >= sizeof(*data)) {
+ if (total_bytes >= sizeof(*data))
data->bytes_left = total_bytes - sizeof(*data);
- data->bytes_missing = 0;
- } else {
+ else
data->bytes_missing = sizeof(*data) - total_bytes;
- data->bytes_left = 0;
- }
-
- data->elem_cnt = 0;
- data->elem_missed = 0;
return data;
}
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index bcf19dfb0af3..278933cd3a09 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2938,6 +2938,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
* attempt.
*/
wait_for_alloc = true;
+ force = CHUNK_ALLOC_NO_FORCE;
spin_unlock(&space_info->lock);
mutex_lock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->chunk_mutex);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index 343400d49bd1..63205d2f4d84 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -29,7 +29,8 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
} else {
num_bytes = 0;
}
- if (block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
+ if (qgroup_to_release_ret &&
+ block_rsv->qgroup_rsv_reserved >= block_rsv->qgroup_rsv_size) {
qgroup_to_release = block_rsv->qgroup_rsv_reserved -
block_rsv->qgroup_rsv_size;
block_rsv->qgroup_rsv_reserved = block_rsv->qgroup_rsv_size;
@@ -391,7 +392,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
- if (unlikely(block_rsv->size == 0))
+ if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
goto try_reserve;
again:
ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index d1428bb73fc5..69770360917c 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -98,4 +98,20 @@ static inline void btrfs_unuse_block_rsv(struct btrfs_fs_info *fs_info,
btrfs_block_rsv_release(fs_info, block_rsv, 0);
}
+/*
+ * Get the size of a block reserve in a context where getting a stale value is
+ * acceptable, instead of accessing it directly and trigger data race warning
+ * from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->size;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 822c615840e8..608e41b61689 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -3589,6 +3589,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
if (ret) {
+ btrfs_tree_unlock(split);
+ free_extent_buffer(split);
btrfs_abort_transaction(trans, ret);
return ret;
}
@@ -5138,10 +5140,12 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
{
struct btrfs_key key;
+ struct btrfs_key orig_key;
struct btrfs_disk_key found_key;
int ret;
btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
+ orig_key = key;
if (key.offset > 0) {
key.offset--;
@@ -5158,8 +5162,36 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
btrfs_release_path(path);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
- if (ret < 0)
+ if (ret <= 0)
return ret;
+
+ /*
+ * Previous key not found. Even if we were at slot 0 of the leaf we had
+ * before releasing the path and calling btrfs_search_slot(), we now may
+ * be in a slot pointing to the same original key - this can happen if
+ * after we released the path, one of more items were moved from a
+ * sibling leaf into the front of the leaf we had due to an insertion
+ * (see push_leaf_right()).
+ * If we hit this case and our slot is > 0 and just decrement the slot
+ * so that the caller does not process the same key again, which may or
+ * may not break the caller, depending on its logic.
+ */
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
+ ret = comp_keys(&found_key, &orig_key);
+ if (ret == 0) {
+ if (path->slots[0] > 0) {
+ path->slots[0]--;
+ return 0;
+ }
+ /*
+ * At slot 0, same key as before, it means orig_key is
+ * the lowest, leftmost, key in the tree. We're done.
+ */
+ return 1;
+ }
+ }
+
btrfs_item_key(path->nodes[0], &found_key, 0);
ret = comp_keys(&found_key, &key);
/*
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index cd77c0621a55..b141a7ba4507 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -482,8 +482,6 @@ struct btrfs_swapfile_pin {
bool is_block_group;
};
-bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
-
enum {
BTRFS_FS_BARRIER,
BTRFS_FS_CLOSING_START,
@@ -2727,7 +2725,7 @@ struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 objectid, const char *name, int name_len,
+ u64 index, const char *name, int name_len,
int mod);
struct btrfs_dir_item *
btrfs_search_dir_index_item(struct btrfs_root *root,
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index f4f531c4aa96..d3f16dc33d0f 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -324,9 +324,6 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes)
} else {
if (current->journal_info)
flush = BTRFS_RESERVE_FLUSH_LIMIT;
-
- if (btrfs_transaction_in_commit(fs_info))
- schedule_timeout(1);
}
if (delalloc_lock)
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index e96890475bac..95afe5ef7500 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1137,6 +1137,9 @@ __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
if (ret)
return ret;
+ ret = btrfs_record_root_in_trans(trans, node->root);
+ if (ret)
+ return ret;
ret = btrfs_update_delayed_inode(trans, node->root, path, node);
return ret;
}
@@ -1175,20 +1178,33 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
ret = __btrfs_commit_inode_delayed_items(trans, path,
curr_node);
if (ret) {
- btrfs_release_delayed_node(curr_node);
- curr_node = NULL;
btrfs_abort_transaction(trans, ret);
break;
}
prev_node = curr_node;
curr_node = btrfs_next_delayed_node(curr_node);
+ /*
+ * See the comment below about releasing path before releasing
+ * node. If the commit of delayed items was successful the path
+ * should always be released, but in case of an error, it may
+ * point to locked extent buffers (a leaf at the very least).
+ */
+ ASSERT(path->nodes[0] == NULL);
btrfs_release_delayed_node(prev_node);
}
+ /*
+ * Release the path to avoid a potential deadlock and lockdep splat when
+ * releasing the delayed node, as that requires taking the delayed node's
+ * mutex. If another task starts running delayed items before we take
+ * the mutex, it will first lock the mutex and then it may try to lock
+ * the same btree path (leaf).
+ */
+ btrfs_free_path(path);
+
if (curr_node)
btrfs_release_delayed_node(curr_node);
- btrfs_free_path(path);
trans->block_rsv = block_rsv;
return ret;
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1cb7f5d79765..4abc0db6527e 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -125,7 +125,7 @@ no_valid_dev_replace_entry_found:
if (btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
btrfs_err(fs_info,
- "replace devid present without an active replace item");
+"replace without active item, run 'device scan --forget' on the target device");
ret = -EUCLEAN;
} else {
dev_replace->srcdev = NULL;
@@ -535,6 +535,23 @@ leave:
return ret;
}
+static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args)
+{
+ if (args->start.srcdevid == 0) {
+ if (memchr(args->start.srcdev_name, 0,
+ sizeof(args->start.srcdev_name)) == NULL)
+ return -ENAMETOOLONG;
+ } else {
+ args->start.srcdev_name[0] = 0;
+ }
+
+ if (memchr(args->start.tgtdev_name, 0,
+ sizeof(args->start.tgtdev_name)) == NULL)
+ return -ENAMETOOLONG;
+
+ return 0;
+}
+
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
@@ -547,10 +564,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
default:
return -EINVAL;
}
-
- if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
- args->start.tgtdev_name[0] == '\0')
- return -EINVAL;
+ ret = btrfs_check_replace_dev_names(args);
+ if (ret < 0)
+ return ret;
ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
args->start.srcdevid,
@@ -918,8 +934,7 @@ int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info)
up_write(&dev_replace->rwsem);
/* Scrub for replace must not be running in suspended state */
- ret = btrfs_scrub_cancel(fs_info);
- ASSERT(ret != -ENOTCONN);
+ btrfs_scrub_cancel(fs_info);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 863367c2c620..98c6faa8ce15 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -171,10 +171,40 @@ out_free:
return 0;
}
+static struct btrfs_dir_item *btrfs_lookup_match_dir(
+ struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *key, const char *name,
+ int name_len, int mod)
+{
+ const int ins_len = (mod < 0 ? -1 : 0);
+ const int cow = (mod != 0);
+ int ret;
+
+ ret = btrfs_search_slot(trans, root, key, path, ins_len, cow);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ if (ret > 0)
+ return ERR_PTR(-ENOENT);
+
+ return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+}
+
/*
- * lookup a directory item based on name. 'dir' is the objectid
- * we're searching in, and 'mod' tells us if you plan on deleting the
- * item (use mod < 0) or changing the options (use mod > 0)
+ * Lookup for a directory item by name.
+ *
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
+ *
+ * Returns: NULL if the dir item does not exists, an error pointer if an error
+ * happened, or a pointer to a dir item if a dir item exists for the given name.
*/
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
@@ -182,23 +212,18 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
const char *name, int name_len,
int mod)
{
- int ret;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
+ struct btrfs_dir_item *di;
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
-
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return di;
}
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
@@ -212,7 +237,6 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
int slot;
struct btrfs_path *path;
-
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
@@ -221,20 +245,20 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-
- /* return back any errors */
- if (ret < 0)
- goto out;
+ di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0);
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ /* Nothing found, we're safe */
+ if (ret == -ENOENT) {
+ ret = 0;
+ goto out;
+ }
- /* nothing found, we're safe */
- if (ret > 0) {
- ret = 0;
- goto out;
+ if (ret < 0)
+ goto out;
}
/* we found an item, look for our name in the item */
- di = btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
if (di) {
/* our exact name was found */
ret = -EEXIST;
@@ -261,35 +285,42 @@ out:
}
/*
- * lookup a directory item based on index. 'dir' is the objectid
- * we're searching in, and 'mod' tells us if you plan on deleting the
- * item (use mod < 0) or changing the options (use mod > 0)
+ * Lookup for a directory index item by name and index number.
+ *
+ * @trans: The transaction handle to use. Can be NULL if @mod is 0.
+ * @root: The root of the target tree.
+ * @path: Path to use for the search.
+ * @dir: The inode number (objectid) of the directory.
+ * @index: The index number.
+ * @name: The name associated to the directory entry we are looking for.
+ * @name_len: The length of the name.
+ * @mod: Used to indicate if the tree search is meant for a read only
+ * lookup, for a modification lookup or for a deletion lookup, so
+ * its value should be 0, 1 or -1, respectively.
*
- * The name is used to make sure the index really points to the name you were
- * looking for.
+ * Returns: NULL if the dir index item does not exists, an error pointer if an
+ * error happened, or a pointer to a dir item if the dir index item exists and
+ * matches the criteria (name and index number).
*/
struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
- u64 objectid, const char *name, int name_len,
+ u64 index, const char *name, int name_len,
int mod)
{
- int ret;
+ struct btrfs_dir_item *di;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
- key.offset = objectid;
+ key.offset = index;
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
- return ERR_PTR(-ENOENT);
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (di == ERR_PTR(-ENOENT))
+ return NULL;
+
+ return di;
}
struct btrfs_dir_item *
@@ -346,21 +377,18 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
const char *name, u16 name_len,
int mod)
{
- int ret;
struct btrfs_key key;
- int ins_len = mod < 0 ? -1 : 0;
- int cow = mod != 0;
+ struct btrfs_dir_item *di;
key.objectid = dir;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
- ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
- if (ret < 0)
- return ERR_PTR(ret);
- if (ret > 0)
+
+ di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
+ if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
- return btrfs_match_dir_item_name(root->fs_info, path, name, name_len);
+ return di;
}
/*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a4b3e6f6bf02..b4ed11b5f148 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2239,6 +2239,23 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type)
fs_info->csum_shash = csum_shash;
+ /*
+ * Check if the checksum implementation is a fast accelerated one.
+ * As-is this is a bit of a hack and should be replaced once the csum
+ * implementations provide that information themselves.
+ */
+ switch (csum_type) {
+ case BTRFS_CSUM_TYPE_CRC32:
+ if (!strstr(crypto_shash_driver_name(csum_shash), "generic"))
+ set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
+ break;
+ default:
+ break;
+ }
+
+ btrfs_info(fs_info, "using %s (%s) checksum algorithm",
+ btrfs_super_csum_name(csum_type),
+ crypto_shash_driver_name(csum_shash));
return 0;
}
@@ -2463,21 +2480,18 @@ static int validate_super(struct btrfs_fs_info *fs_info,
ret = -EINVAL;
}
- if (memcmp(fs_info->fs_devices->fsid, fs_info->super_copy->fsid,
- BTRFS_FSID_SIZE)) {
+ if (memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
"superblock fsid doesn't match fsid of fs_devices: %pU != %pU",
- fs_info->super_copy->fsid, fs_info->fs_devices->fsid);
+ sb->fsid, fs_info->fs_devices->fsid);
ret = -EINVAL;
}
- if (btrfs_fs_incompat(fs_info, METADATA_UUID) &&
- memcmp(fs_info->fs_devices->metadata_uuid,
- fs_info->super_copy->metadata_uuid, BTRFS_FSID_SIZE)) {
+ if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb),
+ BTRFS_FSID_SIZE) != 0) {
btrfs_err(fs_info,
"superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU",
- fs_info->super_copy->metadata_uuid,
- fs_info->fs_devices->metadata_uuid);
+ btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid);
ret = -EINVAL;
}
@@ -2970,6 +2984,20 @@ int open_ctree(struct super_block *sb,
err = -EINVAL;
goto fail_csum;
}
+ /*
+ * We have unsupported RO compat features, although RO mounted, we
+ * should not cause any metadata write, including log replay.
+ * Or we could screw up whatever the new feature requires.
+ */
+ if (unlikely(features && btrfs_super_log_root(disk_super) &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY))) {
+ btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+ features);
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
ret = btrfs_init_workqueues(fs_info, fs_devices);
if (ret) {
@@ -4075,6 +4103,11 @@ void close_ctree(struct btrfs_fs_info *fs_info)
ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
+ if (btrfs_check_quota_leak(fs_info)) {
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ btrfs_err(fs_info, "qgroup reserved space leaked");
+ }
+
btrfs_free_qgroup_config(fs_info);
ASSERT(list_empty(&fs_info->delalloc_roots));
@@ -4377,7 +4410,11 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
*/
inode = igrab(&btrfs_inode->vfs_inode);
if (inode) {
+ unsigned int nofs_flag;
+
+ nofs_flag = memalloc_nofs_save();
invalidate_inode_pages2(inode->i_mapping);
+ memalloc_nofs_restore(nofs_flag);
iput(inode);
}
spin_lock(&root->delalloc_lock);
@@ -4495,7 +4532,12 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
inode = cache->io_ctl.inode;
if (inode) {
+ unsigned int nofs_flag;
+
+ nofs_flag = memalloc_nofs_save();
invalidate_inode_pages2(inode->i_mapping);
+ memalloc_nofs_restore(nofs_flag);
+
BTRFS_I(inode)->generation = 0;
cache->io_ctl.inode = NULL;
iput(inode);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 93cceeba484c..08d1d456e2f0 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -58,7 +58,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
}
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
+ u64 root_objectid, u64 generation,
int check_generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -182,8 +182,15 @@ struct dentry *btrfs_get_parent(struct dentry *child)
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto fail;
+ if (ret == 0) {
+ /*
+ * Key with offset of -1 found, there would have to exist an
+ * inode with such number or a root with such id.
+ */
+ ret = -EUCLEAN;
+ goto fail;
+ }
- BUG_ON(ret == 0); /* Key with offset of -1 found */
if (path->slots[0] == 0) {
ret = -ENOENT;
goto fail;
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index f32f4113c976..5afb7ca42828 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -19,7 +19,7 @@ struct btrfs_fid {
} __attribute__ ((packed));
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
+ u64 root_objectid, u64 generation,
int check_generation);
struct dentry *btrfs_get_parent(struct dentry *child);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 19d2104c0462..a28b0eafb65a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -895,6 +895,11 @@ again:
err = -ENOENT;
goto out;
} else if (WARN_ON(ret)) {
+ btrfs_print_leaf(path->nodes[0]);
+ btrfs_err(fs_info,
+"extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
+ bytenr, num_bytes, parent, root_objectid, owner,
+ offset);
err = -EIO;
goto out;
}
@@ -1238,7 +1243,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
u64 bytes_left, end;
u64 aligned_start = ALIGN(start, 1 << 9);
- if (WARN_ON(start != aligned_start)) {
+ /* Adjust the range to be aligned to 512B sectors if necessary. */
+ if (start != aligned_start) {
len -= aligned_start - start;
len = round_down(len, 1 << 9);
start = aligned_start;
@@ -1676,12 +1682,12 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
parent = ref->parent;
ref_root = ref->root;
- if (node->ref_mod != 1) {
+ if (unlikely(node->ref_mod != 1)) {
btrfs_err(trans->fs_info,
- "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
+ "btree block %llu has %d references rather than 1: action %d ref_root %llu parent %llu",
node->bytenr, node->ref_mod, node->action, ref_root,
parent);
- return -EIO;
+ return -EUCLEAN;
}
if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
BUG_ON(!extent_op || !extent_op->update_flags);
@@ -3989,8 +3995,11 @@ have_block_group:
ret = 0;
}
- if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
+ if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
+ if (!cache_block_group_error)
+ cache_block_group_error = -EIO;
goto loop;
+ }
/*
* Ok we want to try and use the cluster allocator, so
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 95ddeb477797..04788940afaf 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -4024,11 +4024,12 @@ retry:
free_extent_buffer(eb);
/*
- * the filesystem may choose to bump up nr_to_write.
+ * The filesystem may choose to bump up nr_to_write.
* We have to make sure to honor the new nr_to_write
- * at any time
+ * at any time.
*/
- nr_to_write_done = wbc->nr_to_write <= 0;
+ nr_to_write_done = (wbc->sync_mode == WB_SYNC_NONE &&
+ wbc->nr_to_write <= 0);
}
pagevec_release(&pvec);
cond_resched();
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 61b82c69eed5..1a7183cdfe95 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -499,7 +499,9 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio,
sums = kvzalloc(btrfs_ordered_sum_size(fs_info,
bytes_left), GFP_KERNEL);
memalloc_nofs_restore(nofs_flag);
- BUG_ON(!sums); /* -ENOMEM */
+ if (!sums)
+ return BLK_STS_RESOURCE;
+
sums->len = bytes_left;
ordered = btrfs_lookup_ordered_extent(inode,
offset);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d2d32fed8f2e..0cb93f73acb2 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -784,15 +784,16 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
}
spin_lock(&ctl->tree_lock);
ret = link_free_space(ctl, e);
- ctl->total_bitmaps++;
- ctl->op->recalc_thresholds(ctl);
- spin_unlock(&ctl->tree_lock);
if (ret) {
+ spin_unlock(&ctl->tree_lock);
btrfs_err(fs_info,
"Duplicate entries in free space cache, dumping");
kmem_cache_free(btrfs_free_space_cachep, e);
goto free_cache;
}
+ ctl->total_bitmaps++;
+ ctl->op->recalc_thresholds(ctl);
+ spin_unlock(&ctl->tree_lock);
list_add_tail(&e->list, &bitmaps);
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7755a0362a3a..c89e85a7da7d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6830,7 +6830,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -6894,7 +6894,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -7039,7 +7039,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out_fail;
@@ -9751,8 +9751,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- root_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -9768,8 +9766,6 @@ static int btrfs_rename_exchange(struct inode *old_dir,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(dest);
- dest_log_pinned = true;
ret = btrfs_insert_inode_ref(trans, root,
old_dentry->d_name.name,
old_dentry->d_name.len,
@@ -9797,6 +9793,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode), 1);
}
+ /*
+ * Now pin the logs of the roots. We do it to ensure that no other task
+ * can sync the logs while we are in progress with the rename, because
+ * that could result in an inconsistency in case any of the inodes that
+ * are part of this rename operation were logged before.
+ *
+ * We pin the logs even if at this precise moment none of the inodes was
+ * logged before. This is because right after we checked for that, some
+ * other task fsyncing some other inode not involved with this rename
+ * operation could log that one of our inodes exists.
+ *
+ * We don't need to pin the logs before the above calls to
+ * btrfs_insert_inode_ref(), since those don't ever need to change a log.
+ */
+ if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(root);
+ root_log_pinned = true;
+ }
+ if (new_ino != BTRFS_FIRST_FREE_OBJECTID) {
+ btrfs_pin_log_trans(dest);
+ dest_log_pinned = true;
+ }
+
/* src is a subvolume */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
@@ -9911,7 +9930,7 @@ static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
u64 objectid;
u64 index;
- ret = btrfs_find_free_ino(root, &objectid);
+ ret = btrfs_find_free_objectid(root, &objectid);
if (ret)
return ret;
@@ -10046,8 +10065,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
/* force full log commit if subvolume involved. */
btrfs_set_log_full_commit(trans);
} else {
- btrfs_pin_log_trans(root);
- log_pinned = true;
ret = btrfs_insert_inode_ref(trans, dest,
new_dentry->d_name.name,
new_dentry->d_name.len,
@@ -10071,6 +10088,25 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
ret = btrfs_unlink_subvol(trans, old_dir, old_dentry);
} else {
+ /*
+ * Now pin the log. We do it to ensure that no other task can
+ * sync the log while we are in progress with the rename, as
+ * that could result in an inconsistency in case any of the
+ * inodes that are part of this rename operation were logged
+ * before.
+ *
+ * We pin the log even if at this precise moment none of the
+ * inodes was logged before. This is because right after we
+ * checked for that, some other task fsyncing some other inode
+ * not involved with this rename operation could log that one of
+ * our inodes exists.
+ *
+ * We don't need to pin the logs before the above call to
+ * btrfs_insert_inode_ref(), since that does not need to change
+ * a log.
+ */
+ btrfs_pin_log_trans(root);
+ log_pinned = true;
ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
BTRFS_I(d_inode(old_dentry)),
old_dentry->d_name.name,
@@ -10380,7 +10416,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (IS_ERR(trans))
return PTR_ERR(trans);
- err = btrfs_find_free_ino(root, &objectid);
+ err = btrfs_find_free_objectid(root, &objectid);
if (err)
goto out_unlock;
@@ -10663,7 +10699,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
if (IS_ERR(trans))
return PTR_ERR(trans);
- ret = btrfs_find_free_ino(root, &objectid);
+ ret = btrfs_find_free_objectid(root, &objectid);
if (ret)
goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 675112aa998f..674d774eb662 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1847,6 +1847,15 @@ static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
* are limited to own subvolumes only
*/
ret = -EPERM;
+ } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) {
+ /*
+ * Snapshots must be made with the src_inode referring
+ * to the subvolume inode, otherwise the permission
+ * checking above is useless because we may have
+ * permission on a lower directory but not the subvol
+ * itself.
+ */
+ ret = -EINVAL;
} else {
ret = btrfs_mksubvol(&file->f_path, name, namelen,
BTRFS_I(src_inode)->root,
@@ -2110,7 +2119,7 @@ static noinline int key_in_sk(struct btrfs_key *key,
static noinline int copy_to_sk(struct btrfs_path *path,
struct btrfs_key *key,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf,
unsigned long *sk_offset,
int *num_found)
@@ -2242,7 +2251,7 @@ out:
static noinline int search_ioctl(struct inode *inode,
struct btrfs_ioctl_search_key *sk,
- size_t *buf_size,
+ u64 *buf_size,
char __user *ubuf)
{
struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
@@ -2314,7 +2323,7 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
struct btrfs_ioctl_search_key sk;
struct inode *inode;
int ret;
- size_t buf_size;
+ u64 buf_size;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2348,8 +2357,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
struct btrfs_ioctl_search_args_v2 args;
struct inode *inode;
int ret;
- size_t buf_size;
- const size_t buf_limit = SZ_16M;
+ u64 buf_size;
+ const u64 buf_limit = SZ_16M;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -2824,6 +2833,8 @@ static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
}
}
+ btrfs_free_path(path);
+ path = NULL;
if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
ret = -EFAULT;
@@ -2914,6 +2925,8 @@ static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
}
out:
+ btrfs_free_path(path);
+
if (!ret || ret == -EOVERFLOW) {
rootrefs->num_items = found;
/* update min_treeid for next search */
@@ -2925,7 +2938,6 @@ out:
}
kfree(rootrefs);
- btrfs_free_path(path);
return ret;
}
@@ -3045,7 +3057,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
{
struct inode *inode = file_inode(file);
struct btrfs_root *root = BTRFS_I(inode)->root;
- struct btrfs_ioctl_defrag_range_args *range;
+ struct btrfs_ioctl_defrag_range_args range = {0};
int ret;
ret = mnt_want_write_file(file);
@@ -3077,33 +3089,28 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
goto out;
}
- range = kzalloc(sizeof(*range), GFP_KERNEL);
- if (!range) {
- ret = -ENOMEM;
- goto out;
- }
-
if (argp) {
- if (copy_from_user(range, argp,
- sizeof(*range))) {
+ if (copy_from_user(&range, argp, sizeof(range))) {
ret = -EFAULT;
- kfree(range);
+ goto out;
+ }
+ if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) {
+ ret = -EOPNOTSUPP;
goto out;
}
/* compression requires us to start the IO */
- if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
- range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
- range->extent_thresh = (u32)-1;
+ if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
+ range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
+ range.extent_thresh = (u32)-1;
}
} else {
/* the rest are all set to zero by kzalloc */
- range->len = (u64)-1;
+ range.len = (u64)-1;
}
ret = btrfs_defrag_file(file_inode(file), file,
- range, BTRFS_OLDEST_GENERATION, 0);
+ &range, BTRFS_OLDEST_GENERATION, 0);
if (ret > 0)
ret = 0;
- kfree(range);
break;
default:
ret = -EINVAL;
@@ -3296,13 +3303,10 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
di_args->bytes_used = btrfs_device_get_bytes_used(dev);
di_args->total_bytes = btrfs_device_get_total_bytes(dev);
memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
- if (dev->name) {
- strncpy(di_args->path, rcu_str_deref(dev->name),
- sizeof(di_args->path) - 1);
- di_args->path[sizeof(di_args->path) - 1] = 0;
- } else {
+ if (dev->name)
+ strscpy(di_args->path, rcu_str_deref(dev->name), sizeof(di_args->path));
+ else
di_args->path[0] = '\0';
- }
out:
rcu_read_unlock();
@@ -4143,7 +4147,7 @@ static void get_block_group_info(struct list_head *groups_list,
static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
void __user *arg)
{
- struct btrfs_ioctl_space_args space_args;
+ struct btrfs_ioctl_space_args space_args = { 0 };
struct btrfs_ioctl_space_info space;
struct btrfs_ioctl_space_info *dest;
struct btrfs_ioctl_space_info *dest_orig;
@@ -4339,6 +4343,11 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
if (IS_ERR(sa))
return PTR_ERR(sa);
+ if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+
if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
ret = mnt_want_write_file(file);
if (ret)
@@ -4515,6 +4524,8 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
ipath->fspath->val[i] = rel_ptr;
}
+ btrfs_free_path(path);
+ path = NULL;
ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
ipath->fspath, size);
if (ret) {
@@ -4585,21 +4596,20 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
size = min_t(u32, loi->size, SZ_16M);
}
- path = btrfs_alloc_path();
- if (!path) {
- ret = -ENOMEM;
- goto out;
- }
-
inodes = init_data_container(size);
if (IS_ERR(inodes)) {
ret = PTR_ERR(inodes);
- inodes = NULL;
- goto out;
+ goto out_loi;
}
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
build_ino_list, inodes, ignore_offset);
+ btrfs_free_path(path);
if (ret == -EINVAL)
ret = -ENOENT;
if (ret < 0)
@@ -4611,7 +4621,6 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
ret = -EFAULT;
out:
- btrfs_free_path(path);
kvfree(inodes);
out_loi:
kfree(loi);
@@ -4912,7 +4921,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
}
/* update qgroup status and info */
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
err = btrfs_run_qgroups(trans);
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
if (err < 0)
btrfs_handle_fs_error(fs_info, err,
"failed to update qgroup status and info");
@@ -4954,6 +4965,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto out;
}
+ if (sa->create && is_fstree(sa->qgroupid)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -5508,7 +5524,7 @@ static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
if (compat) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
- struct btrfs_ioctl_send_args_32 args32;
+ struct btrfs_ioctl_send_args_32 args32 = { 0 };
ret = copy_from_user(&args32, argp, sizeof(args32));
if (ret)
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index f4edadf1067f..cec6c283a691 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -109,10 +109,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
pr_cont("shared data backref parent %llu count %u\n",
offset, btrfs_shared_data_ref_count(eb, sref));
/*
- * offset is supposed to be a tree block which
- * must be aligned to nodesize.
+ * Offset is supposed to be a tree block which must be
+ * aligned to sectorsize.
*/
- if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
+ if (!IS_ALIGNED(offset, eb->fs_info->sectorsize))
pr_info(
"\t\t\t(parent %llu not aligned to sectorsize %u)\n",
offset, eb->fs_info->sectorsize);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 5a3006c75d63..d6184b6206ff 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -504,6 +504,49 @@ out:
return ret < 0 ? ret : 0;
}
+static u64 btrfs_qgroup_subvolid(u64 qgroupid)
+{
+ return (qgroupid & ((1ULL << BTRFS_QGROUP_LEVEL_SHIFT) - 1));
+}
+
+/*
+ * Called in close_ctree() when quota is still enabled. This verifies we don't
+ * leak some reserved space.
+ *
+ * Return false if no reserved space is left.
+ * Return true if some reserved space is leaked.
+ */
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info)
+{
+ struct rb_node *node;
+ bool ret = false;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return ret;
+ /*
+ * Since we're unmounting, there is no race and no need to grab qgroup
+ * lock. And here we don't go post-order to provide a more user
+ * friendly sorted result.
+ */
+ for (node = rb_first(&fs_info->qgroup_tree); node; node = rb_next(node)) {
+ struct btrfs_qgroup *qgroup;
+ int i;
+
+ qgroup = rb_entry(node, struct btrfs_qgroup, node);
+ for (i = 0; i < BTRFS_QGROUP_RSV_LAST; i++) {
+ if (qgroup->rsv.values[i]) {
+ ret = true;
+ btrfs_warn(fs_info,
+ "qgroup %llu/%llu has unreleased space, type %d rsv %llu",
+ btrfs_qgroup_level(qgroup->qgroupid),
+ btrfs_qgroup_subvolid(qgroup->qgroupid),
+ i, qgroup->rsv.values[i]);
+ }
+ }
+ }
+ return ret;
+}
+
/*
* This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
* first two are in single-threaded paths.And for the third one, we have set
@@ -1075,6 +1118,21 @@ out_add_root:
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ } else {
+ /*
+ * We have set both BTRFS_FS_QUOTA_ENABLED and
+ * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
+ * -EINPROGRESS. That can happen because someone started the
+ * rescan worker by calling quota rescan ioctl before we
+ * attempted to initialize the rescan worker. Failure due to
+ * quotas disabled in the meanwhile is not possible, because
+ * we are holding a write lock on fs_info->subvol_sem, which
+ * is also acquired when disabling quotas.
+ * Ignore such error, and any other error would need to undo
+ * everything we did in the transaction we just committed.
+ */
+ ASSERT(ret == -EINPROGRESS);
+ ret = 0;
}
out_free_path:
@@ -1106,12 +1164,23 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
int ret = 0;
/*
- * We need to have subvol_sem write locked, to prevent races between
- * concurrent tasks trying to disable quotas, because we will unlock
- * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+ * We need to have subvol_sem write locked to prevent races with
+ * snapshot creation.
*/
lockdep_assert_held_write(&fs_info->subvol_sem);
+ /*
+ * Lock the cleaner mutex to prevent races with concurrent relocation,
+ * because relocation may be building backrefs for blocks of the quota
+ * root while we are deleting the root. This is like dropping fs roots
+ * of deleted snapshots/subvolumes, we need the same protection.
+ *
+ * This also prevents races between concurrent tasks trying to disable
+ * quotas, because we will unlock and relock qgroup_ioctl_lock across
+ * BTRFS_FS_QUOTA_ENABLED changes.
+ */
+ mutex_lock(&fs_info->cleaner_mutex);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
@@ -1174,7 +1243,9 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
goto out;
}
+ spin_lock(&fs_info->trans_lock);
list_del(&quota_root->dirty_list);
+ spin_unlock(&fs_info->trans_lock);
btrfs_tree_lock(quota_root->node);
btrfs_clean_tree_block(quota_root->node);
@@ -1191,6 +1262,7 @@ out:
btrfs_end_transaction(trans);
else if (trans)
ret = btrfs_end_transaction(trans);
+ mutex_unlock(&fs_info->cleaner_mutex);
return ret;
}
@@ -1323,7 +1395,6 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
@@ -1339,9 +1410,8 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
return -ENOMEM;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
member = find_qgroup_rb(fs_info, src);
@@ -1387,7 +1457,6 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
u64 dst)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *parent;
struct btrfs_qgroup *member;
struct btrfs_qgroup_list *list;
@@ -1400,9 +1469,8 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
if (!tmp)
return -ENOMEM;
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
@@ -1467,11 +1535,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
+ quota_root = fs_info->quota_root;
qgroup = find_qgroup_rb(fs_info, qgroupid);
if (qgroup) {
ret = -EEXIST;
@@ -1493,18 +1561,25 @@ out:
return ret;
}
+static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
+{
+ return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
+ qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
+}
+
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_qgroup_list *list;
int ret = 0;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
@@ -1514,6 +1589,11 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
+ if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* Check if there are no children of this qgroup */
if (!list_empty(&qgroup->members)) {
ret = -EBUSY;
@@ -1545,7 +1625,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
struct btrfs_qgroup_limit *limit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
int ret = 0;
/* Sometimes we would want to clear the limit on this qgroup.
@@ -1555,9 +1634,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
const u64 CLEAR_VALUE = -1;
mutex_lock(&fs_info->qgroup_ioctl_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root) {
- ret = -EINVAL;
+ if (!fs_info->quota_root) {
+ ret = -ENOTCONN;
goto out;
}
@@ -2657,15 +2735,23 @@ cleanup:
}
/*
- * called from commit_transaction. Writes all changed qgroups to disk.
+ * Writes all changed qgroups to disk.
+ * Called by the transaction commit path and the qgroup assign ioctl.
*/
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_root *quota_root = fs_info->quota_root;
int ret = 0;
- if (!quota_root)
+ /*
+ * In case we are called from the qgroup assign ioctl, assert that we
+ * are holding the qgroup_ioctl_lock, otherwise we can race with a quota
+ * disable operation (ioctl) and access a freed quota root.
+ */
+ if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
+ lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
+
+ if (!fs_info->quota_root)
return ret;
spin_lock(&fs_info->qgroup_lock);
@@ -2809,14 +2895,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
dstgroup->rsv_rfer = inherit->lim.rsv_rfer;
dstgroup->rsv_excl = inherit->lim.rsv_excl;
- ret = update_qgroup_limit_item(trans, dstgroup);
- if (ret) {
- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- btrfs_info(fs_info,
- "unable to update quota limit for %llu",
- dstgroup->qgroupid);
- goto unlock;
- }
+ qgroup_dirty(fs_info, dstgroup);
}
if (srcid) {
@@ -2935,7 +3014,6 @@ static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
enum btrfs_qgroup_rsv_type type)
{
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct btrfs_fs_info *fs_info = root->fs_info;
u64 ref_root = root->root_key.objectid;
@@ -2954,8 +3032,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
enforce = false;
spin_lock(&fs_info->qgroup_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root)
+ if (!fs_info->quota_root)
goto out;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -3022,7 +3099,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes,
enum btrfs_qgroup_rsv_type type)
{
- struct btrfs_root *quota_root;
struct btrfs_qgroup *qgroup;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -3040,8 +3116,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
}
spin_lock(&fs_info->qgroup_lock);
- quota_root = fs_info->quota_root;
- if (!quota_root)
+ if (!fs_info->quota_root)
goto out;
qgroup = find_qgroup_rb(fs_info, ref_root);
@@ -3188,7 +3263,8 @@ out:
static bool rescan_should_stop(struct btrfs_fs_info *fs_info)
{
return btrfs_fs_closing(fs_info) ||
- test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+ test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state) ||
+ !test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
}
static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
@@ -3200,6 +3276,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
int err = -ENOMEM;
int ret = 0;
bool stopped = false;
+ bool did_leaf_rescans = false;
path = btrfs_alloc_path();
if (!path)
@@ -3218,11 +3295,10 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work)
err = PTR_ERR(trans);
break;
}
- if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
- err = -EINTR;
- } else {
- err = qgroup_rescan_leaf(trans, path);
- }
+
+ err = qgroup_rescan_leaf(trans, path);
+ did_leaf_rescans = true;
+
if (err > 0)
btrfs_commit_transaction(trans);
else
@@ -3236,22 +3312,29 @@ out:
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
- } else if (err < 0) {
+ } else if (err < 0 || stopped) {
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
}
mutex_unlock(&fs_info->qgroup_rescan_lock);
/*
- * only update status, since the previous part has already updated the
- * qgroup info.
+ * Only update status, since the previous part has already updated the
+ * qgroup info, and only if we did any actual work. This also prevents
+ * race with a concurrent quota disable, which has already set
+ * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
+ * btrfs_quota_disable().
*/
- trans = btrfs_start_transaction(fs_info->quota_root, 1);
- if (IS_ERR(trans)) {
- err = PTR_ERR(trans);
+ if (did_leaf_rescans) {
+ trans = btrfs_start_transaction(fs_info->quota_root, 1);
+ if (IS_ERR(trans)) {
+ err = PTR_ERR(trans);
+ trans = NULL;
+ btrfs_err(fs_info,
+ "fail to start transaction for status update: %d",
+ err);
+ }
+ } else {
trans = NULL;
- btrfs_err(fs_info,
- "fail to start transaction for status update: %d",
- err);
}
mutex_lock(&fs_info->qgroup_rescan_lock);
@@ -3924,7 +4007,6 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
int num_bytes)
{
- struct btrfs_root *quota_root = fs_info->quota_root;
struct btrfs_qgroup *qgroup;
struct ulist_node *unode;
struct ulist_iterator uiter;
@@ -3932,7 +4014,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
if (num_bytes == 0)
return;
- if (!quota_root)
+ if (!fs_info->quota_root)
return;
spin_lock(&fs_info->qgroup_lock);
@@ -3979,6 +4061,8 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
BTRFS_QGROUP_RSV_META_PREALLOC);
trace_qgroup_meta_convert(root, num_bytes);
qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
+ if (!sb_rdonly(fs_info->sb))
+ add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS);
}
/*
@@ -4267,4 +4351,5 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans)
ulist_free(entry->old_roots);
kfree(entry);
}
+ *root = RB_ROOT;
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 0a2659685ad6..94bdfb89505e 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -416,5 +416,6 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *eb);
void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
+bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info);
#endif
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 7ac679ed2b6c..226a17a335da 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -334,6 +334,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest,
{
bio_list_merge(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
+ /* Also inherit the bitmaps from @victim. */
+ bitmap_or(dest->dbitmap, victim->dbitmap, dest->dbitmap,
+ dest->stripe_npages);
dest->generic_bio_cnt += victim->generic_bio_cnt;
bio_list_init(&victim->bio_list);
}
@@ -878,6 +881,12 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
if (rbio->generic_bio_cnt)
btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+ /*
+ * Clear the data bitmap, as the rbio may be cached for later usage.
+ * do this before before unlock_stripe() so there will be no new bio
+ * for this bio.
+ */
+ bitmap_clear(rbio->dbitmap, 0, rbio->stripe_npages);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
@@ -1212,6 +1221,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
else
BUG();
+ /* We should have at least one data sector. */
+ ASSERT(bitmap_weight(rbio->dbitmap, rbio->stripe_npages));
+
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
* recalculate the parity and write the new results.
@@ -1285,6 +1297,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(pagenr, rbio->dbitmap))
+ continue;
+
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (!page)
@@ -1309,6 +1326,11 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(pagenr, rbio->dbitmap))
+ continue;
+
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (!page)
@@ -1748,6 +1770,33 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
run_plug(plug);
}
+/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
+static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
+{
+ const struct btrfs_fs_info *fs_info = rbio->fs_info;
+ const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ const u64 full_stripe_start = rbio->bbio->raid_map[0];
+ const u32 orig_len = orig_bio->bi_iter.bi_size;
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 cur_logical;
+
+ ASSERT(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * rbio->stripe_len);
+
+ bio_list_add(&rbio->bio_list, orig_bio);
+ rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
+
+ /* Update the dbitmap. */
+ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
+ cur_logical += sectorsize) {
+ int bit = ((u32)(cur_logical - full_stripe_start) >>
+ PAGE_SHIFT) % rbio->stripe_npages;
+
+ set_bit(bit, rbio->dbitmap);
+ }
+}
+
/*
* our main entry point for writes from the rest of the FS.
*/
@@ -1764,9 +1813,8 @@ int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
}
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->operation = BTRFS_RBIO_WRITE;
+ rbio_add_bio(rbio, bio);
btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
@@ -2068,9 +2116,12 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
atomic_set(&rbio->error, 0);
/*
- * read everything that hasn't failed. Thanks to the
- * stripe cache, it is possible that some or all of these
- * pages are going to be uptodate.
+ * Read everything that hasn't failed. However this time we will
+ * not trust any cached sector.
+ * As we may read out some stale data but higher layer is not reading
+ * that stale part.
+ *
+ * So here we always re-read everything in recovery path.
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
if (rbio->faila == stripe || rbio->failb == stripe) {
@@ -2079,16 +2130,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
}
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
-
- /*
- * the rmw code may have already read this
- * page in
- */
- p = rbio_stripe_page(rbio, stripe, pagenr);
- if (PageUptodate(p))
- continue;
-
ret = rbio_add_io_page(rbio, &bio_list,
rbio_stripe_page(rbio, stripe, pagenr),
stripe, pagenr, rbio->stripe_len);
@@ -2170,8 +2211,7 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
}
rbio->operation = BTRFS_RBIO_READ_REBUILD;
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ rbio_add_bio(rbio, bio);
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
diff --git a/fs/btrfs/rcu-string.h b/fs/btrfs/rcu-string.h
index a97dc74a4d3d..02f15321cecc 100644
--- a/fs/btrfs/rcu-string.h
+++ b/fs/btrfs/rcu-string.h
@@ -18,7 +18,11 @@ static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
(len * sizeof(char)), mask);
if (!ret)
return ret;
- strncpy(ret->str, src, len);
+ /* Warn if the source got unexpectedly truncated. */
+ if (WARN_ON(strscpy(ret->str, src, len) < 0)) {
+ kfree(ret);
+ return NULL;
+ }
return ret;
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index bbd63535965c..d59e89ef3251 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -888,8 +888,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
out_unlock:
spin_unlock(&fs_info->ref_verify_lock);
out:
- if (ret)
+ if (ret) {
+ btrfs_free_ref_cache(fs_info);
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ }
return ret;
}
@@ -1018,8 +1020,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
}
}
if (ret) {
- btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
btrfs_free_ref_cache(fs_info);
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
}
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index ba68b0b41dff..e603cc8c141e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2511,7 +2511,7 @@ again:
list_splice(&reloc_roots, &rc->reloc_roots);
if (!err)
- btrfs_commit_transaction(trans);
+ err = btrfs_commit_transaction(trans);
else
btrfs_end_transaction(trans);
return err;
@@ -4102,8 +4102,12 @@ int prepare_to_relocate(struct reloc_control *rc)
*/
return PTR_ERR(trans);
}
- btrfs_commit_transaction(trans);
- return 0;
+
+ ret = btrfs_commit_transaction(trans);
+ if (ret)
+ unset_reloc_control(rc);
+
+ return ret;
}
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
@@ -4263,7 +4267,9 @@ restart:
err = PTR_ERR(trans);
goto out_free;
}
- btrfs_commit_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
+ if (ret && !err)
+ err = ret;
out_free:
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 0d07ebe511e7..ba4e198811a4 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -371,9 +371,10 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- if (ret < 0)
+ if (ret < 0) {
+ err = ret;
goto out;
- if (ret == 0) {
+ } else if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index e5db948daa12..45809f75692e 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3849,6 +3849,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
+ bool need_commit = false;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
@@ -3961,6 +3962,12 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
*/
nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
+ u64 old_super_errors;
+
+ spin_lock(&sctx->stat_lock);
+ old_super_errors = sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+
btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
@@ -3969,6 +3976,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->fs_devices->device_list_mutex);
ret = scrub_supers(sctx, dev);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ spin_lock(&sctx->stat_lock);
+ /*
+ * Super block errors found, but we can not commit transaction
+ * at current context, since btrfs_commit_transaction() needs
+ * to pause the current running scrub (hold by ourselves).
+ */
+ if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
+ need_commit = true;
+ spin_unlock(&sctx->stat_lock);
}
if (!ret)
@@ -3995,6 +4012,25 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
+ /*
+ * We found some super block errors before, now try to force a
+ * transaction commit, as scrub has finished.
+ */
+ if (need_commit) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "scrub: failed to start transaction to fix super block errors: %d", ret);
+ return ret;
+ }
+ ret = btrfs_commit_transaction(trans);
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "scrub: failed to commit transaction to fix super block errors: %d", ret);
+ }
return ret;
out:
scrub_workers_put(fs_info);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index e258fc484cea..576c027909f8 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -973,7 +973,15 @@ static int iterate_inode_ref(struct btrfs_root *root, struct btrfs_path *path,
ret = PTR_ERR(start);
goto out;
}
- BUG_ON(start < p->buf);
+ if (unlikely(start < p->buf)) {
+ btrfs_err(root->fs_info,
+ "send: path ref buffer underflow for key (%llu %u %llu)",
+ found_key->objectid,
+ found_key->type,
+ found_key->offset);
+ ret = -EINVAL;
+ goto out;
+ }
}
p->start = start;
} else {
@@ -5405,6 +5413,7 @@ static int clone_range(struct send_ctx *sctx,
u64 ext_len;
u64 clone_len;
u64 clone_data_offset;
+ bool crossed_src_i_size = false;
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(clone_root->root, path);
@@ -5461,8 +5470,10 @@ static int clone_range(struct send_ctx *sctx,
if (key.offset >= clone_src_i_size)
break;
- if (key.offset + ext_len > clone_src_i_size)
+ if (key.offset + ext_len > clone_src_i_size) {
ext_len = clone_src_i_size - key.offset;
+ crossed_src_i_size = true;
+ }
clone_data_offset = btrfs_file_extent_offset(leaf, ei);
if (btrfs_file_extent_disk_bytenr(leaf, ei) == disk_byte) {
@@ -5522,6 +5533,25 @@ static int clone_range(struct send_ctx *sctx,
ret = send_clone(sctx, offset, clone_len,
clone_root);
}
+ } else if (crossed_src_i_size && clone_len < len) {
+ /*
+ * If we are at i_size of the clone source inode and we
+ * can not clone from it, terminate the loop. This is
+ * to avoid sending two write operations, one with a
+ * length matching clone_len and the final one after
+ * this loop with a length of len - clone_len.
+ *
+ * When using encoded writes (BTRFS_SEND_FLAG_COMPRESSED
+ * was passed to the send ioctl), this helps avoid
+ * sending an encoded write for an offset that is not
+ * sector size aligned, in case the i_size of the source
+ * inode is not sector size aligned. That will make the
+ * receiver fallback to decompression of the data and
+ * writing it using regular buffered IO, therefore while
+ * not incorrect, it's not optimal due decompression and
+ * possible re-compression at the receiver.
+ */
+ break;
} else {
ret = send_extent_data(sctx, offset, clone_len);
}
@@ -7325,10 +7355,10 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
/*
* Check that we don't overflow at later allocations, we request
* clone_sources_count + 1 items, and compare to unsigned long inside
- * access_ok.
+ * access_ok. Also set an upper limit for allocation size so this can't
+ * easily exhaust memory. Max number of clone sources is about 200K.
*/
- if (arg->clone_sources_count >
- ULONG_MAX / sizeof(struct clone_root) - 1) {
+ if (arg->clone_sources_count > SZ_8M / sizeof(struct clone_root)) {
ret = -EINVAL;
goto out;
}
@@ -7341,7 +7371,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
}
if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
- ret = -EINVAL;
+ ret = -EOPNOTSUPP;
goto out;
}
@@ -7359,7 +7389,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg)
sctx->flags = arg->flags;
sctx->send_filp = fget(arg->send_fd);
- if (!sctx->send_filp) {
+ if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) {
ret = -EBADF;
goto out;
}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1a69bdb96fb2..ea8b5b2d859d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1567,8 +1567,6 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
} else {
snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
btrfs_sb(s)->bdev_holder = fs_type;
- if (!strstr(crc32c_impl(), "generic"))
- set_bit(BTRFS_FS_CSUM_IMPL_FAST, &fs_info->flags);
error = btrfs_fill_super(s, fs_devices, data);
}
if (!error)
@@ -2137,7 +2135,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
* calculated f_bavail.
*/
if (!mixed && block_rsv->space_info->full &&
- total_free_meta - thresh < block_rsv->size)
+ (total_free_meta < thresh || total_free_meta - thresh < block_rsv->size))
buf->f_bavail = 0;
buf->f_type = BTRFS_SUPER_MAGIC;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 5c299e1f2297..356f2274c8eb 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1154,8 +1154,11 @@ int __init btrfs_init_sysfs(void)
#ifdef CONFIG_BTRFS_DEBUG
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
- if (ret)
- goto out2;
+ if (ret) {
+ sysfs_unmerge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
+ goto out_remove_group;
+ }
#endif
return 0;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 98f9684e7ffc..8e413c82190e 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -189,7 +189,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
void btrfs_free_dummy_root(struct btrfs_root *root)
{
- if (!root)
+ if (IS_ERR_OR_NULL(root))
return;
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ac035a6fa003..f6169bece7c0 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -230,21 +230,21 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -256,31 +256,33 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return ret;
}
+ /* btrfs_qgroup_account_extent() always frees the ulists passed to it. */
+ old_roots = NULL;
+ new_roots = NULL;
+
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, nodesize)) {
test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
- old_roots = NULL;
- new_roots = NULL;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = remove_extent_item(root, nodesize, nodesize);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return -EINVAL;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -331,21 +333,21 @@ static int test_multiple_refs(struct btrfs_root *root,
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -366,21 +368,21 @@ static int test_multiple_refs(struct btrfs_root *root,
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = add_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -407,21 +409,21 @@ static int test_multiple_refs(struct btrfs_root *root,
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = remove_extent_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e6cb95b81787..89ffc0255406 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -190,10 +190,11 @@ loop:
spin_unlock(&fs_info->trans_lock);
/*
- * If we are ATTACH, we just want to catch the current transaction,
- * and commit it. If there is no transaction, just return ENOENT.
+ * If we are ATTACH or TRANS_JOIN_NOSTART, we just want to catch the
+ * current transaction, and commit it. If there is no transaction, just
+ * return ENOENT.
*/
- if (type == TRANS_ATTACH)
+ if (type == TRANS_ATTACH || type == TRANS_JOIN_NOSTART)
return -ENOENT;
/*
@@ -706,8 +707,13 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
trans = start_transaction(root, 0, TRANS_ATTACH,
BTRFS_RESERVE_NO_FLUSH, true);
- if (trans == ERR_PTR(-ENOENT))
- btrfs_wait_for_commit(root->fs_info, 0);
+ if (trans == ERR_PTR(-ENOENT)) {
+ int ret;
+
+ ret = btrfs_wait_for_commit(root->fs_info, 0);
+ if (ret)
+ return ERR_PTR(ret);
+ }
return trans;
}
@@ -771,6 +777,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid)
}
wait_for_commit(cur_trans);
+ ret = cur_trans->aborted;
btrfs_put_transaction(cur_trans);
out:
return ret;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 368c43c6cbd0..597362eaf300 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1019,7 +1019,8 @@ static void extent_err(const struct extent_buffer *eb, int slot,
}
static int check_extent_item(struct extent_buffer *leaf,
- struct btrfs_key *key, int slot)
+ struct btrfs_key *key, int slot,
+ struct btrfs_key *prev_key)
{
struct btrfs_fs_info *fs_info = leaf->fs_info;
struct btrfs_extent_item *ei;
@@ -1164,7 +1165,7 @@ static int check_extent_item(struct extent_buffer *leaf,
if (ptr + btrfs_extent_inline_ref_size(inline_type) > end) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
- ptr, inline_type, end);
+ ptr, btrfs_extent_inline_ref_size(inline_type), end);
return -EUCLEAN;
}
@@ -1230,6 +1231,26 @@ static int check_extent_item(struct extent_buffer *leaf,
total_refs, inline_refs);
return -EUCLEAN;
}
+
+ if ((prev_key->type == BTRFS_EXTENT_ITEM_KEY) ||
+ (prev_key->type == BTRFS_METADATA_ITEM_KEY)) {
+ u64 prev_end = prev_key->objectid;
+
+ if (prev_key->type == BTRFS_METADATA_ITEM_KEY)
+ prev_end += fs_info->nodesize;
+ else
+ prev_end += prev_key->offset;
+
+ if (unlikely(prev_end > key->objectid)) {
+ extent_err(leaf, slot,
+ "previous extent [%llu %u %llu] overlaps current extent [%llu %u %llu]",
+ prev_key->objectid, prev_key->type,
+ prev_key->offset, key->objectid, key->type,
+ key->offset);
+ return -EUCLEAN;
+ }
+ }
+
return 0;
}
@@ -1343,7 +1364,7 @@ static int check_leaf_item(struct extent_buffer *leaf,
break;
case BTRFS_EXTENT_ITEM_KEY:
case BTRFS_METADATA_ITEM_KEY:
- ret = check_extent_item(leaf, key, slot);
+ ret = check_extent_item(leaf, key, slot, prev_key);
break;
case BTRFS_TREE_BLOCK_REF_KEY:
case BTRFS_SHARED_DATA_REF_KEY:
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index b7bfecfc2ea3..f75333d7b78a 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -918,8 +918,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
index, name, name_len, 0);
if (IS_ERR(di)) {
- if (PTR_ERR(di) != -ENOENT)
- ret = PTR_ERR(di);
+ ret = PTR_ERR(di);
goto out;
} else if (di) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
@@ -1100,7 +1099,9 @@ again:
extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
inode_objectid, parent_objectid, 0,
0);
- if (!IS_ERR_OR_NULL(extref)) {
+ if (IS_ERR(extref)) {
+ return PTR_ERR(extref);
+ } else if (extref) {
u32 item_size;
u32 cur_offset = 0;
unsigned long base;
@@ -1169,8 +1170,7 @@ next:
di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
ref_index, name, namelen, 0);
if (IS_ERR(di)) {
- if (PTR_ERR(di) != -ENOENT)
- return PTR_ERR(di);
+ return PTR_ERR(di);
} else if (di) {
ret = drop_one_dir_item(trans, root, path, dir, di);
if (ret)
@@ -2020,9 +2020,6 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
goto out;
}
- if (dst_di == ERR_PTR(-ENOENT))
- dst_di = NULL;
-
if (IS_ERR(dst_di)) {
ret = PTR_ERR(dst_di);
goto out;
@@ -2307,7 +2304,7 @@ again:
dir_key->offset,
name, name_len, 0);
}
- if (!log_di || log_di == ERR_PTR(-ENOENT)) {
+ if (!log_di) {
btrfs_dir_item_key_to_cpu(eb, di, &location);
btrfs_release_path(path);
btrfs_release_path(log_path);
@@ -3520,8 +3517,7 @@ out_unlock:
if (err == -ENOSPC) {
btrfs_set_log_full_commit(trans);
err = 0;
- } else if (err < 0 && err != -ENOENT) {
- /* ENOENT can be returned if the entry hasn't been fsynced yet */
+ } else if (err < 0) {
btrfs_abort_transaction(trans, err);
}
@@ -4287,7 +4283,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
int slot;
int ins_nr = 0;
- int start_slot;
+ int start_slot = 0;
int ret;
if (!(inode->flags & BTRFS_INODE_PREALLOC))
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c7706a769de1..d7014b2b28d6 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -354,6 +354,7 @@ void btrfs_free_device(struct btrfs_device *device)
static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
{
struct btrfs_device *device;
+
WARN_ON(fs_devices->opened);
while (!list_empty(&fs_devices->devices)) {
device = list_entry(fs_devices->devices.next,
@@ -713,15 +714,47 @@ static void pending_bios_fn(struct btrfs_work *work)
run_scheduled_bios(device);
}
-static bool device_path_matched(const char *path, struct btrfs_device *device)
+/*
+ * Check if the device in the path matches the device in the given struct device.
+ *
+ * Returns:
+ * true If it is the same device.
+ * false If it is not the same device or on error.
+ */
+static bool device_matched(const struct btrfs_device *device, const char *path)
{
- int found;
+ char *device_name;
+ struct block_device *bdev_old;
+ struct block_device *bdev_new;
+
+ /*
+ * If we are looking for a device with the matching dev_t, then skip
+ * device without a name (a missing device).
+ */
+ if (!device->name)
+ return false;
+
+ device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
+ if (!device_name)
+ return false;
rcu_read_lock();
- found = strcmp(rcu_str_deref(device->name), path);
+ scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
rcu_read_unlock();
- return found == 0;
+ bdev_old = lookup_bdev(device_name);
+ kfree(device_name);
+ if (IS_ERR(bdev_old))
+ return false;
+
+ bdev_new = lookup_bdev(path);
+ if (IS_ERR(bdev_new))
+ return false;
+
+ if (bdev_old == bdev_new)
+ return true;
+
+ return false;
}
/*
@@ -754,9 +787,7 @@ static int btrfs_free_stale_devices(const char *path,
&fs_devices->devices, dev_list) {
if (skip_device && skip_device == device)
continue;
- if (path && !device->name)
- continue;
- if (path && !device_path_matched(path, device))
+ if (path && !device_matched(device, path))
continue;
if (fs_devices->opened) {
/* for an already deleted device return 0 */
@@ -864,6 +895,14 @@ error_brelse:
return -EINVAL;
}
+u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
+{
+ bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
+ BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
+
+ return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
+}
+
/*
* Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
* being created with a disk that has already completed its fsid change.
@@ -1371,6 +1410,17 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
if (!fs_devices->opened) {
seed_devices = fs_devices->seed;
fs_devices->seed = NULL;
+
+ /*
+ * If the struct btrfs_fs_devices is not assembled with any
+ * other device, it can be re-initialized during the next mount
+ * without the needing device-scan step. Therefore, it can be
+ * fully freed.
+ */
+ if (fs_devices->num_devices == 1) {
+ list_del(&fs_devices->fs_list);
+ free_fs_devices(fs_devices);
+ }
}
mutex_unlock(&uuid_mutex);
@@ -1537,8 +1587,17 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
*/
bytenr = btrfs_sb_offset(0);
- flags |= FMODE_EXCL;
+ /*
+ * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may
+ * initiate the device scan which may race with the user's mount
+ * or mkfs command, resulting in failure.
+ * Since the device scan is solely for reading purposes, there is
+ * no need for FMODE_EXCL. Additionally, the devices are read again
+ * during the mount process. It is ok to get some inconsistent
+ * values temporarily, as the device paths of the fsid are the only
+ * required information for assembling the volume.
+ */
bdev = blkdev_get_by_path(path, flags, holder);
if (IS_ERR(bdev))
return ERR_CAST(bdev);
@@ -1579,7 +1638,7 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
if (in_range(physical_start, *start, len) ||
in_range(*start, physical_start,
- physical_end - physical_start)) {
+ physical_end + 1 - physical_start)) {
*start = physical_end + 1;
return true;
}
@@ -1671,7 +1730,7 @@ again:
goto out;
}
- while (1) {
+ while (search_start < search_end) {
l = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(l)) {
@@ -1694,6 +1753,9 @@ again:
if (key.type != BTRFS_DEV_EXTENT_KEY)
goto next;
+ if (key.offset > search_end)
+ break;
+
if (key.offset > search_start) {
hole_size = key.offset - search_start;
@@ -1764,6 +1826,7 @@ next:
else
ret = 0;
+ ASSERT(max_hole_start + max_hole_size <= search_end);
out:
btrfs_free_path(path);
*start = max_hole_start;
@@ -3027,15 +3090,16 @@ struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
read_unlock(&em_tree->lock);
if (!em) {
- btrfs_crit(fs_info, "unable to find logical %llu length %llu",
+ btrfs_crit(fs_info,
+ "unable to find chunk map for logical %llu length %llu",
logical, length);
return ERR_PTR(-EINVAL);
}
- if (em->start > logical || em->start + em->len < logical) {
+ if (em->start > logical || em->start + em->len <= logical) {
btrfs_crit(fs_info,
- "found a bad mapping, wanted %llu-%llu, found %llu-%llu",
- logical, length, em->start, em->start + em->len);
+ "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
+ logical, logical + length, em->start, em->start + em->len);
free_extent_map(em);
return ERR_PTR(-EINVAL);
}
@@ -3204,7 +3268,17 @@ again:
mutex_unlock(&fs_info->delete_unused_bgs_mutex);
goto error;
}
- BUG_ON(ret == 0); /* Corruption */
+ if (ret == 0) {
+ /*
+ * On the first search we would find chunk tree with
+ * offset -1, which is not possible. On subsequent
+ * loops this would find an existing item on an invalid
+ * offset (one less than the previous one, wrong
+ * alignment and size).
+ */
+ ret = -EUCLEAN;
+ goto error;
+ }
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
@@ -4503,8 +4577,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
}
}
- BUG_ON(fs_info->balance_ctl ||
- test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
+ ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
atomic_dec(&fs_info->balance_cancel_req);
mutex_unlock(&fs_info->balance_mutex);
return 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index aa6a6d7b2978..762c0a375498 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -581,4 +581,7 @@ int btrfs_bg_type_to_factor(u64 flags);
const char *btrfs_bg_type_to_raid_name(u64 flags);
int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info);
+bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr);
+u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb);
+
#endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 48858510739b..cd7ddf24157a 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -387,6 +387,9 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
const char *name, const void *buffer,
size_t size, int flags)
{
+ if (btrfs_root_readonly(BTRFS_I(inode)->root))
+ return -EROFS;
+
name = xattr_full_name(handler, name);
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index df1aace5df50..9385d0bb276d 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -74,7 +74,7 @@ static struct list_head *zlib_alloc_workspace(unsigned int level)
workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
zlib_inflate_workspacesize());
- workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL);
+ workspace->strm.workspace = kvzalloc(workspacesize, GFP_KERNEL);
workspace->level = level;
workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
if (!workspace->strm.workspace || !workspace->buf)
diff --git a/fs/buffer.c b/fs/buffer.c
index 0d7bd7712076..4ec88d08d04e 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2318,7 +2318,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
int err;
err = inode_newsize_ok(inode, size);
@@ -2344,7 +2344,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
unsigned int blocksize = i_blocksize(inode);
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
pgoff_t index, curidx;
loff_t curpos;
unsigned zerofrom, offset, len;
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index dfb14dbddf51..3b39552c2365 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -245,6 +245,8 @@ error_open_root:
kmem_cache_free(cachefiles_object_jar, fsdef);
error_root_object:
cachefiles_end_secure(cache, saved_cred);
+ put_cred(cache->cache_cred);
+ cache->cache_cred = NULL;
pr_err("Failed to register: %d\n", ret);
return ret;
}
@@ -265,6 +267,7 @@ void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
dput(cache->graveyard);
mntput(cache->mnt);
+ put_cred(cache->cache_cred);
kfree(cache->rootdirname);
kfree(cache->secctx);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 06e9b26bf277..45b8f6741f8d 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1558,6 +1558,7 @@ void ceph_flush_snaps(struct ceph_inode_info *ci,
struct inode *inode = &ci->vfs_inode;
struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
struct ceph_mds_session *session = NULL;
+ bool need_put = false;
int mds;
dout("ceph_flush_snaps %p\n", inode);
@@ -1609,8 +1610,13 @@ out:
}
/* we flushed them all; remove this inode from the queue */
spin_lock(&mdsc->snap_flush_lock);
+ if (!list_empty(&ci->i_snap_flush_item))
+ need_put = true;
list_del_init(&ci->i_snap_flush_item);
spin_unlock(&mdsc->snap_flush_lock);
+
+ if (need_put)
+ iput(inode);
}
/*
@@ -2784,7 +2790,19 @@ int ceph_get_caps(struct file *filp, int need, int want,
if (ret == -EAGAIN)
continue;
if (!ret) {
+ struct ceph_mds_client *mdsc = fsc->mdsc;
+ struct cap_wait cw;
DEFINE_WAIT_FUNC(wait, woken_wake_function);
+
+ cw.ino = inode->i_ino;
+ cw.tgid = current->tgid;
+ cw.need = need;
+ cw.want = want;
+
+ spin_lock(&mdsc->caps_list_lock);
+ list_add(&cw.list, &mdsc->cap_wait_list);
+ spin_unlock(&mdsc->caps_list_lock);
+
add_wait_queue(&ci->i_cap_wq, &wait);
flags |= NON_BLOCKING;
@@ -2798,6 +2816,11 @@ int ceph_get_caps(struct file *filp, int need, int want,
}
remove_wait_queue(&ci->i_cap_wq, &wait);
+
+ spin_lock(&mdsc->caps_list_lock);
+ list_del(&cw.list);
+ spin_unlock(&mdsc->caps_list_lock);
+
if (ret == -EAGAIN)
continue;
}
@@ -3334,6 +3357,15 @@ static void handle_cap_grant(struct inode *inode,
}
BUG_ON(cap->issued & ~cap->implemented);
+ /* don't let check_caps skip sending a response to MDS for revoke msgs */
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_REVOKE) {
+ cap->mds_wanted = 0;
+ if (cap == ci->i_auth_cap)
+ check_caps = 1; /* check auth cap only */
+ else
+ check_caps = 2; /* check all caps */
+ }
+
if (extra_info->inline_version > 0 &&
extra_info->inline_version >= ci->i_inline_version) {
ci->i_inline_version = extra_info->inline_version;
@@ -4272,12 +4304,14 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
struct inode *dir,
int mds, int drop, int unless)
{
- struct dentry *parent = NULL;
struct ceph_mds_request_release *rel = *p;
struct ceph_dentry_info *di = ceph_dentry(dentry);
int force = 0;
int ret;
+ /* This shouldn't happen */
+ BUG_ON(!dir);
+
/*
* force an record for the directory caps if we have a dentry lease.
* this is racy (can't take i_ceph_lock and d_lock together), but it
@@ -4287,14 +4321,9 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
spin_lock(&dentry->d_lock);
if (di->lease_session && di->lease_session->s_mds == mds)
force = 1;
- if (!dir) {
- parent = dget(dentry->d_parent);
- dir = d_inode(parent);
- }
spin_unlock(&dentry->d_lock);
ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
- dput(parent);
spin_lock(&dentry->d_lock);
if (ret && di->lease_session && di->lease_session->s_mds == mds) {
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index facb387c2735..c281f32b54f7 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -139,6 +139,7 @@ static int caps_show(struct seq_file *s, void *p)
struct ceph_fs_client *fsc = s->private;
struct ceph_mds_client *mdsc = fsc->mdsc;
int total, avail, used, reserved, min, i;
+ struct cap_wait *cw;
ceph_reservation_status(fsc, &total, &avail, &used, &reserved, &min);
seq_printf(s, "total\t\t%d\n"
@@ -166,6 +167,18 @@ static int caps_show(struct seq_file *s, void *p)
}
mutex_unlock(&mdsc->mutex);
+ seq_printf(s, "\n\nWaiters:\n--------\n");
+ seq_printf(s, "tgid ino need want\n");
+ seq_printf(s, "-----------------------------------------------------\n");
+
+ spin_lock(&mdsc->caps_list_lock);
+ list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
+ seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
+ ceph_cap_string(cw->need),
+ ceph_cap_string(cw->want));
+ }
+ spin_unlock(&mdsc->caps_list_lock);
+
return 0;
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index aa1eac6d89f2..83122fc5f813 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -452,6 +452,12 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
if (dentry->d_name.len > NAME_MAX)
return -ENAMETOOLONG;
+ /*
+ * Do not truncate the file, since atomic_open is called before the
+ * permission check. The caller will do the truncation afterward.
+ */
+ flags &= ~O_TRUNC;
+
if (flags & O_CREAT) {
if (ceph_quota_is_max_files_exceeded(dir))
return -EDQUOT;
@@ -490,9 +496,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
req->r_parent = dir;
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
- err = ceph_mdsc_do_request(mdsc,
- (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
- req);
+ err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req);
err = ceph_handle_snapdir(req, dentry, err);
if (err)
goto out_req;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index af85a7237604..a08ddd4e26d9 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -619,9 +619,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
ci->i_truncate_seq = truncate_seq;
/* the MDS should have revoked these caps */
- WARN_ON_ONCE(issued & (CEPH_CAP_FILE_EXCL |
- CEPH_CAP_FILE_RD |
- CEPH_CAP_FILE_WR |
+ WARN_ON_ONCE(issued & (CEPH_CAP_FILE_RD |
CEPH_CAP_FILE_LAZYIO));
/*
* If we hold relevant caps, or in the case where we're
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 37fb71797b34..f7acf9680c9b 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3151,6 +3151,12 @@ static void handle_session(struct ceph_mds_session *session,
break;
case CEPH_SESSION_FLUSHMSG:
+ /* flush cap releases */
+ spin_lock(&session->s_cap_lock);
+ if (session->s_num_cap_releases)
+ ceph_flush_cap_releases(mdsc, session);
+ spin_unlock(&session->s_cap_lock);
+
send_flushmsg_ack(mdsc, session, seq);
break;
@@ -4068,7 +4074,7 @@ static void delayed_work(struct work_struct *work)
dout("mdsc delayed_work\n");
- if (mdsc->stopping)
+ if (mdsc->stopping >= CEPH_MDSC_STOPPING_FLUSHED)
return;
mutex_lock(&mdsc->mutex);
@@ -4168,6 +4174,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
mdsc->last_renew_caps = jiffies;
INIT_LIST_HEAD(&mdsc->cap_delay_list);
+ INIT_LIST_HEAD(&mdsc->cap_wait_list);
spin_lock_init(&mdsc->cap_delay_lock);
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
@@ -4239,7 +4246,7 @@ static void wait_requests(struct ceph_mds_client *mdsc)
void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
{
dout("pre_umount\n");
- mdsc->stopping = 1;
+ mdsc->stopping = CEPH_MDSC_STOPPING_BEGIN;
lock_unlock_sessions(mdsc);
ceph_flush_dirty_caps(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 5cd131b41d84..4fbbc33023c9 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -340,6 +340,19 @@ struct ceph_quotarealm_inode {
struct inode *inode;
};
+struct cap_wait {
+ struct list_head list;
+ unsigned long ino;
+ pid_t tgid;
+ int need;
+ int want;
+};
+
+enum {
+ CEPH_MDSC_STOPPING_BEGIN = 1,
+ CEPH_MDSC_STOPPING_FLUSHED = 2,
+};
+
/*
* mds client state
*/
@@ -416,6 +429,7 @@ struct ceph_mds_client {
spinlock_t caps_list_lock;
struct list_head caps_list; /* unused (reserved or
unreserved) */
+ struct list_head cap_wait_list;
int caps_total_count; /* total caps allocated */
int caps_use_count; /* in use */
int caps_use_max; /* max used caps */
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index e1b9b224fcb2..4df068bdba68 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -644,8 +644,10 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->size);
spin_lock(&mdsc->snap_flush_lock);
- if (list_empty(&ci->i_snap_flush_item))
+ if (list_empty(&ci->i_snap_flush_item)) {
+ ihold(inode);
list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+ }
spin_unlock(&mdsc->snap_flush_lock);
return 1; /* caller may want to ceph_flush_snaps */
}
@@ -694,9 +696,10 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
struct ceph_mds_snap_realm *ri; /* encoded */
__le64 *snaps; /* encoded */
__le64 *prior_parent_snaps; /* encoded */
- struct ceph_snap_realm *realm = NULL;
+ struct ceph_snap_realm *realm;
struct ceph_snap_realm *first_realm = NULL;
- int invalidate = 0;
+ struct ceph_snap_realm *realm_to_rebuild = NULL;
+ int rebuild_snapcs;
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
@@ -704,6 +707,8 @@ int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
dout("update_snap_trace deletion=%d\n", deletion);
more:
+ realm = NULL;
+ rebuild_snapcs = 0;
ceph_decode_need(&p, e, sizeof(*ri), bad);
ri = p;
p += sizeof(*ri);
@@ -727,7 +732,7 @@ more:
err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
if (err < 0)
goto fail;
- invalidate += err;
+ rebuild_snapcs += err;
if (le64_to_cpu(ri->seq) > realm->seq) {
dout("update_snap_trace updating %llx %p %lld -> %lld\n",
@@ -752,22 +757,30 @@ more:
if (realm->seq > mdsc->last_snap_seq)
mdsc->last_snap_seq = realm->seq;
- invalidate = 1;
+ rebuild_snapcs = 1;
} else if (!realm->cached_context) {
dout("update_snap_trace %llx %p seq %lld new\n",
realm->ino, realm, realm->seq);
- invalidate = 1;
+ rebuild_snapcs = 1;
} else {
dout("update_snap_trace %llx %p seq %lld unchanged\n",
realm->ino, realm, realm->seq);
}
- dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
- realm, invalidate, p, e);
+ dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
+ realm, rebuild_snapcs, p, e);
+
+ /*
+ * this will always track the uppest parent realm from which
+ * we need to rebuild the snapshot contexts _downward_ in
+ * hierarchy.
+ */
+ if (rebuild_snapcs)
+ realm_to_rebuild = realm;
- /* invalidate when we reach the _end_ (root) of the trace */
- if (invalidate && p >= e)
- rebuild_snap_realms(realm, &dirty_realms);
+ /* rebuild_snapcs when we reach the _end_ (root) of the trace */
+ if (realm_to_rebuild && p >= e)
+ rebuild_snap_realms(realm_to_rebuild, &dirty_realms);
if (!first_realm)
first_realm = realm;
@@ -994,6 +1007,19 @@ skip_inode:
continue;
adjust_snap_realm_parent(mdsc, child, realm->ino);
}
+ } else {
+ /*
+ * In the non-split case both 'num_split_inos' and
+ * 'num_split_realms' should be 0, making this a no-op.
+ * However the MDS happens to populate 'split_realms' list
+ * in one of the UPDATE op cases by mistake.
+ *
+ * Skip both lists just in case to ensure that 'p' is
+ * positioned at the start of realm info, as expected by
+ * ceph_update_snap_trace().
+ */
+ p += sizeof(u64) * num_split_inos;
+ p += sizeof(u64) * num_split_realms;
}
/*
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index d40658d5e808..0e38678d5add 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1174,14 +1174,23 @@ out_final:
static void ceph_kill_sb(struct super_block *s)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(s);
- dev_t dev = s->s_dev;
dout("kill_sb %p\n", s);
ceph_mdsc_pre_umount(fsc->mdsc);
flush_fs_workqueues(fsc);
- generic_shutdown_super(s);
+ /*
+ * Though the kill_anon_super() will finally trigger the
+ * sync_filesystem() anyway, we still need to do it here
+ * and then bump the stage of shutdown to stop the work
+ * queue as earlier as possible.
+ */
+ sync_filesystem(s);
+
+ fsc->mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHED;
+
+ kill_anon_super(s);
fsc->client->extra_mon_dispatch = NULL;
ceph_fs_debugfs_cleanup(fsc);
@@ -1189,7 +1198,6 @@ static void ceph_kill_sb(struct super_block *s)
ceph_fscache_unregister_fs(fsc);
destroy_fs_client(fsc);
- free_anon_bdev(dev);
}
static struct file_system_type ceph_fs_type = {
diff --git a/fs/char_dev.c b/fs/char_dev.c
index c5e6eff5a381..36479b72d278 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -544,7 +544,7 @@ int cdev_device_add(struct cdev *cdev, struct device *dev)
}
rc = device_add(dev);
- if (rc)
+ if (rc && dev->devt)
cdev_del(cdev);
return rc;
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 7f01c6e60791..6eb65988321f 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -76,8 +76,8 @@ struct key_type cifs_spnego_key_type = {
* strlen(";sec=ntlmsspi") */
#define MAX_MECH_STR_LEN 13
-/* strlen of "host=" */
-#define HOST_KEY_LEN 5
+/* strlen of ";host=" */
+#define HOST_KEY_LEN 6
/* strlen of ";ip4=" or ";ip6=" */
#define IP_KEY_LEN 5
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 1f55072aa302..1766d6d077f2 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -1056,7 +1056,7 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
struct cifs_ntsd *pntsd = NULL;
int oplock = 0;
unsigned int xid;
- int rc, create_options = 0;
+ int rc;
struct cifs_tcon *tcon;
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
struct cifs_fid fid;
@@ -1068,13 +1068,10 @@ static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
tcon = tlink_tcon(tlink);
xid = get_xid();
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = READ_CONTROL;
- oparms.create_options = create_options;
+ oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.disposition = FILE_OPEN;
oparms.path = path;
oparms.fid = &fid;
@@ -1119,7 +1116,7 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
{
int oplock = 0;
unsigned int xid;
- int rc, access_flags, create_options = 0;
+ int rc, access_flags;
struct cifs_tcon *tcon;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = cifs_sb_tlink(cifs_sb);
@@ -1132,9 +1129,6 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
tcon = tlink_tcon(tlink);
xid = get_xid();
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP)
access_flags = WRITE_OWNER;
else
@@ -1143,7 +1137,7 @@ int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = access_flags;
- oparms.create_options = create_options;
+ oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.disposition = FILE_OPEN;
oparms.path = path;
oparms.fid = &fid;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 79a18692b84c..12f632287d16 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -275,7 +275,7 @@ cifs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_ffree = 0; /* unlimited */
if (server->ops->queryfs)
- rc = server->ops->queryfs(xid, tcon, buf);
+ rc = server->ops->queryfs(xid, tcon, cifs_sb, buf);
free_xid(xid);
return rc;
@@ -609,9 +609,15 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
- /* Only display max_credits if it was overridden on mount */
+ /* Only display the following if overridden on mount */
if (tcon->ses->server->max_credits != SMB2_MAX_CREDITS_AVAILABLE)
seq_printf(s, ",max_credits=%u", tcon->ses->server->max_credits);
+ if (tcon->ses->server->tcp_nodelay)
+ seq_puts(s, ",tcpnodelay");
+ if (tcon->ses->server->noautotune)
+ seq_puts(s, ",noautotune");
+ if (tcon->ses->server->noblocksnd)
+ seq_puts(s, ",noblocksend");
if (tcon->snapshot_time)
seq_printf(s, ",snapshot=%llu", tcon->snapshot_time);
@@ -732,11 +738,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
struct inode *dir = d_inode(dentry);
struct dentry *child;
- if (!dir) {
- dput(dentry);
- dentry = ERR_PTR(-ENOENT);
- break;
- }
if (!S_ISDIR(dir->i_mode)) {
dput(dentry);
dentry = ERR_PTR(-ENOTDIR);
@@ -753,7 +754,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
while (*s && *s != sep)
s++;
- child = lookup_one_len_unlocked(p, dentry, s - p);
+ child = lookup_positive_unlocked(p, dentry, s - p);
dput(dentry);
dentry = child;
} while (!IS_ERR(dentry));
@@ -1061,6 +1062,7 @@ const struct inode_operations cifs_file_inode_ops = {
const struct inode_operations cifs_symlink_inode_ops = {
.get_link = cifs_get_link,
+ .setattr = cifs_setattr,
.permission = cifs_permission,
.listxattr = cifs_listxattr,
};
@@ -1077,7 +1079,9 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
unsigned int xid;
int rc;
- if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
+ if (remap_flags & REMAP_FILE_DEDUP)
+ return -EOPNOTSUPP;
+ if (remap_flags & ~REMAP_FILE_ADVISORY)
return -EINVAL;
cifs_dbg(FYI, "clone range\n");
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index bc4ca94137f2..7b155e19df4e 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -125,7 +125,10 @@ extern const struct dentry_operations cifs_ci_dentry_ops;
#ifdef CONFIG_CIFS_DFS_UPCALL
extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
#else
-#define cifs_dfs_d_automount NULL
+static inline struct vfsmount *cifs_dfs_d_automount(struct path *path)
+{
+ return ERR_PTR(-EREMOTE);
+}
#endif
/* Functions related to symlinks */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 414936989255..253321adc266 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -22,6 +22,8 @@
#include <linux/in.h>
#include <linux/in6.h>
#include <linux/slab.h>
+#include <linux/scatterlist.h>
+#include <linux/mm.h>
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include "cifs_fs_sb.h"
@@ -30,6 +32,7 @@
#include <linux/scatterlist.h>
#include <uapi/linux/cifs/cifs_mount.h>
#include "smb2pdu.h"
+#include "smb2glob.h"
#define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */
@@ -297,7 +300,8 @@ struct smb_version_operations {
const char *, struct dfs_info3_param **,
unsigned int *, const struct nls_table *, int);
/* informational QFS call */
- void (*qfs_tcon)(const unsigned int, struct cifs_tcon *);
+ void (*qfs_tcon)(const unsigned int, struct cifs_tcon *,
+ struct cifs_sb_info *);
/* check if a path is accessible or not */
int (*is_path_accessible)(const unsigned int, struct cifs_tcon *,
struct cifs_sb_info *, const char *);
@@ -405,7 +409,7 @@ struct smb_version_operations {
struct cifsInodeInfo *);
/* query remote filesystem */
int (*queryfs)(const unsigned int, struct cifs_tcon *,
- struct kstatfs *);
+ struct cifs_sb_info *, struct kstatfs *);
/* send mandatory brlock to the server */
int (*mand_lock)(const unsigned int, struct cifsFileInfo *, __u64,
__u64, __u32, int, int, bool);
@@ -486,6 +490,7 @@ struct smb_version_operations {
/* ioctl passthrough for query_info */
int (*ioctl_query_info)(const unsigned int xid,
struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
__le16 *path, int is_dir,
unsigned long p);
/* make unix special files (block, char, fifo, socket) */
@@ -1955,4 +1960,71 @@ extern struct smb_version_values smb302_values;
#define ALT_SMB311_VERSION_STRING "3.11"
extern struct smb_version_operations smb311_operations;
extern struct smb_version_values smb311_values;
+
+static inline unsigned int cifs_get_num_sgs(const struct smb_rqst *rqst,
+ int num_rqst,
+ const u8 *sig)
+{
+ unsigned int len, skip;
+ unsigned int nents = 0;
+ unsigned long addr;
+ int i, j;
+
+ /* Assumes the first rqst has a transform header as the first iov.
+ * I.e.
+ * rqst[0].rq_iov[0] is transform header
+ * rqst[0].rq_iov[1+] data to be encrypted/decrypted
+ * rqst[1+].rq_iov[0+] data to be encrypted/decrypted
+ */
+ for (i = 0; i < num_rqst; i++) {
+ /*
+ * The first rqst has a transform header where the
+ * first 20 bytes are not part of the encrypted blob.
+ */
+ for (j = 0; j < rqst[i].rq_nvec; j++) {
+ struct kvec *iov = &rqst[i].rq_iov[j];
+
+ skip = (i == 0) && (j == 0) ? 20 : 0;
+ addr = (unsigned long)iov->iov_base + skip;
+ if (unlikely(is_vmalloc_addr((void *)addr))) {
+ len = iov->iov_len - skip;
+ nents += DIV_ROUND_UP(offset_in_page(addr) + len,
+ PAGE_SIZE);
+ } else {
+ nents++;
+ }
+ }
+ nents += rqst[i].rq_npages;
+ }
+ nents += DIV_ROUND_UP(offset_in_page(sig) + SMB2_SIGNATURE_SIZE, PAGE_SIZE);
+ return nents;
+}
+
+/* We can not use the normal sg_set_buf() as we will sometimes pass a
+ * stack object as buf.
+ */
+static inline struct scatterlist *cifs_sg_set_buf(struct scatterlist *sg,
+ const void *buf,
+ unsigned int buflen)
+{
+ unsigned long addr = (unsigned long)buf;
+ unsigned int off = offset_in_page(addr);
+
+ addr &= PAGE_MASK;
+ if (unlikely(is_vmalloc_addr((void *)addr))) {
+ do {
+ unsigned int len = min_t(unsigned int, buflen, PAGE_SIZE - off);
+
+ sg_set_page(sg++, vmalloc_to_page((void *)addr), len, off);
+
+ off = 0;
+ addr += PAGE_SIZE;
+ buflen -= len;
+ } while (buflen);
+ } else {
+ sg_set_page(sg++, virt_to_page(addr), buflen, off);
+ }
+ return sg;
+}
+
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index f18da99a6b55..2dde83a96968 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -583,8 +583,8 @@ int cifs_alloc_hash(const char *name, struct crypto_shash **shash,
struct sdesc **sdesc);
void cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc);
-extern void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
- unsigned int *len, unsigned int *offset);
+void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page,
+ unsigned int *len, unsigned int *offset);
void extract_unc_hostname(const char *unc, const char **h, size_t *len);
int copy_path_name(char *dst, const char *src);
@@ -600,4 +600,12 @@ static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses,
}
#endif
+static inline int cifs_create_options(struct cifs_sb_info *cifs_sb, int options)
+{
+ if (cifs_sb && (backup_cred(cifs_sb)))
+ return options | CREATE_OPEN_BACKUP_INTENT;
+ else
+ return options;
+}
+
#endif /* _CIFSPROTO_H */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 4b8632eda2bd..f924f05b1829 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -4933,8 +4933,13 @@ CIFSGetDFSRefer(const unsigned int xid, struct cifs_ses *ses,
return -ENODEV;
getDFSRetry:
- rc = smb_init(SMB_COM_TRANSACTION2, 15, ses->tcon_ipc, (void **) &pSMB,
- (void **) &pSMBr);
+ /*
+ * Use smb_init_no_reconnect() instead of smb_init() as
+ * CIFSGetDFSRefer() may be called from cifs_reconnect_tcon() and thus
+ * causing an infinite recursion.
+ */
+ rc = smb_init_no_reconnect(SMB_COM_TRANSACTION2, 15, ses->tcon_ipc,
+ (void **)&pSMB, (void **)&pSMBr);
if (rc)
return rc;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 86bdebd2ece6..d8d9d9061544 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -469,8 +469,7 @@ static void reconn_inval_dfs_target(struct TCP_Server_Info *server,
}
static inline int reconn_setup_dfs_targets(struct cifs_sb_info *cifs_sb,
- struct dfs_cache_tgt_list *tl,
- struct dfs_cache_tgt_iterator **it)
+ struct dfs_cache_tgt_list *tl)
{
if (!cifs_sb->origin_fullpath)
return -EOPNOTSUPP;
@@ -514,11 +513,13 @@ cifs_reconnect(struct TCP_Server_Info *server)
sb = NULL;
} else {
cifs_sb = CIFS_SB(sb);
-
- rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list, &tgt_it);
- if (rc && (rc != -EOPNOTSUPP)) {
- cifs_server_dbg(VFS, "%s: no target servers for DFS failover\n",
- __func__);
+ rc = reconn_setup_dfs_targets(cifs_sb, &tgt_list);
+ if (rc) {
+ cifs_sb = NULL;
+ if (rc != -EOPNOTSUPP) {
+ cifs_server_dbg(VFS, "%s: no target servers for DFS failover\n",
+ __func__);
+ }
} else {
server->nr_targets = dfs_cache_get_nr_tgts(&tgt_list);
}
@@ -791,9 +792,6 @@ cifs_readv_from_socket(struct TCP_Server_Info *server, struct msghdr *smb_msg)
int length = 0;
int total_read;
- smb_msg->msg_control = NULL;
- smb_msg->msg_controllen = 0;
-
for (total_read = 0; msg_data_left(smb_msg); total_read += length) {
try_to_freeze();
@@ -844,7 +842,7 @@ int
cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
unsigned int to_read)
{
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
struct kvec iov = {.iov_base = buf, .iov_len = to_read};
iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read);
@@ -855,7 +853,7 @@ int
cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
unsigned int page_offset, unsigned int to_read)
{
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
struct bio_vec bv = {
.bv_page = page, .bv_len = to_read, .bv_offset = page_offset};
iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read);
@@ -3238,7 +3236,7 @@ cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
struct cifs_ses *
cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
{
- int rc = -ENOMEM;
+ int rc = 0;
unsigned int xid;
struct cifs_ses *ses;
struct sockaddr_in *addr = (struct sockaddr_in *)&server->dstaddr;
@@ -3280,6 +3278,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
return ses;
}
+ rc = -ENOMEM;
+
cifs_dbg(FYI, "Existing smb sess not found\n");
ses = sesInfoAlloc();
if (ses == NULL)
@@ -4320,7 +4320,7 @@ static int mount_get_conns(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
/* do not care if a following call succeed - informational */
if (!tcon->pipe && server->ops->qfs_tcon) {
- server->ops->qfs_tcon(*xid, tcon);
+ server->ops->qfs_tcon(*xid, tcon, cifs_sb);
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RO_CACHE) {
if (tcon->fsDevInfo.DeviceCharacteristics &
cpu_to_le32(FILE_READ_ONLY_DEVICE))
diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c
index cf6cec59696c..cf9267cf42e7 100644
--- a/fs/cifs/dfs_cache.c
+++ b/fs/cifs/dfs_cache.c
@@ -22,67 +22,69 @@
#include "dfs_cache.h"
-#define DFS_CACHE_HTABLE_SIZE 32
-#define DFS_CACHE_MAX_ENTRIES 64
+#define CACHE_HTABLE_SIZE 32
+#define CACHE_MAX_ENTRIES 64
#define IS_INTERLINK_SET(v) ((v) & (DFSREF_REFERRAL_SERVER | \
DFSREF_STORAGE_SERVER))
-struct dfs_cache_tgt {
- char *t_name;
- struct list_head t_list;
+struct cache_dfs_tgt {
+ char *name;
+ struct list_head list;
};
-struct dfs_cache_entry {
- struct hlist_node ce_hlist;
- const char *ce_path;
- int ce_ttl;
- int ce_srvtype;
- int ce_flags;
- struct timespec64 ce_etime;
- int ce_path_consumed;
- int ce_numtgts;
- struct list_head ce_tlist;
- struct dfs_cache_tgt *ce_tgthint;
- struct rcu_head ce_rcu;
+struct cache_entry {
+ struct hlist_node hlist;
+ const char *path;
+ int ttl;
+ int srvtype;
+ int flags;
+ struct timespec64 etime;
+ int path_consumed;
+ int numtgts;
+ struct list_head tlist;
+ struct cache_dfs_tgt *tgthint;
+ struct rcu_head rcu;
};
-static struct kmem_cache *dfs_cache_slab __read_mostly;
-
-struct dfs_cache_vol_info {
- char *vi_fullpath;
- struct smb_vol vi_vol;
- char *vi_mntdata;
- struct list_head vi_list;
+struct vol_info {
+ char *fullpath;
+ spinlock_t smb_vol_lock;
+ struct smb_vol smb_vol;
+ char *mntdata;
+ struct list_head list;
+ struct list_head rlist;
+ struct kref refcnt;
};
-struct dfs_cache {
- struct mutex dc_lock;
- struct nls_table *dc_nlsc;
- struct list_head dc_vol_list;
- int dc_ttl;
- struct delayed_work dc_refresh;
-};
+static struct kmem_cache *cache_slab __read_mostly;
+static struct workqueue_struct *dfscache_wq __read_mostly;
+
+static int cache_ttl;
+static DEFINE_SPINLOCK(cache_ttl_lock);
-static struct dfs_cache dfs_cache;
+static struct nls_table *cache_nlsc;
/*
* Number of entries in the cache
*/
-static size_t dfs_cache_count;
+static size_t cache_count;
-static DEFINE_MUTEX(dfs_cache_list_lock);
-static struct hlist_head dfs_cache_htable[DFS_CACHE_HTABLE_SIZE];
+static struct hlist_head cache_htable[CACHE_HTABLE_SIZE];
+static DEFINE_MUTEX(list_lock);
+
+static LIST_HEAD(vol_list);
+static DEFINE_SPINLOCK(vol_list_lock);
static void refresh_cache_worker(struct work_struct *work);
-static inline bool is_path_valid(const char *path)
-{
- return path && (strchr(path + 1, '\\') || strchr(path + 1, '/'));
-}
+static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker);
-static inline int get_normalized_path(const char *path, char **npath)
+static int get_normalized_path(const char *path, char **npath)
{
+ if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/'))
+ return -EINVAL;
+
if (*path == '\\') {
*npath = (char *)path;
} else {
@@ -100,42 +102,42 @@ static inline void free_normalized_path(const char *path, char *npath)
kfree(npath);
}
-static inline bool cache_entry_expired(const struct dfs_cache_entry *ce)
+static inline bool cache_entry_expired(const struct cache_entry *ce)
{
struct timespec64 ts;
ktime_get_coarse_real_ts64(&ts);
- return timespec64_compare(&ts, &ce->ce_etime) >= 0;
+ return timespec64_compare(&ts, &ce->etime) >= 0;
}
-static inline void free_tgts(struct dfs_cache_entry *ce)
+static inline void free_tgts(struct cache_entry *ce)
{
- struct dfs_cache_tgt *t, *n;
+ struct cache_dfs_tgt *t, *n;
- list_for_each_entry_safe(t, n, &ce->ce_tlist, t_list) {
- list_del(&t->t_list);
- kfree(t->t_name);
+ list_for_each_entry_safe(t, n, &ce->tlist, list) {
+ list_del(&t->list);
+ kfree(t->name);
kfree(t);
}
}
static void free_cache_entry(struct rcu_head *rcu)
{
- struct dfs_cache_entry *ce = container_of(rcu, struct dfs_cache_entry,
- ce_rcu);
- kmem_cache_free(dfs_cache_slab, ce);
+ struct cache_entry *ce = container_of(rcu, struct cache_entry, rcu);
+
+ kmem_cache_free(cache_slab, ce);
}
-static inline void flush_cache_ent(struct dfs_cache_entry *ce)
+static inline void flush_cache_ent(struct cache_entry *ce)
{
- if (hlist_unhashed(&ce->ce_hlist))
+ if (hlist_unhashed(&ce->hlist))
return;
- hlist_del_init_rcu(&ce->ce_hlist);
- kfree_const(ce->ce_path);
+ hlist_del_init_rcu(&ce->hlist);
+ kfree(ce->path);
free_tgts(ce);
- dfs_cache_count--;
- call_rcu(&ce->ce_rcu, free_cache_entry);
+ cache_count--;
+ call_rcu(&ce->rcu, free_cache_entry);
}
static void flush_cache_ents(void)
@@ -143,11 +145,11 @@ static void flush_cache_ents(void)
int i;
rcu_read_lock();
- for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++) {
- struct hlist_head *l = &dfs_cache_htable[i];
- struct dfs_cache_entry *ce;
+ for (i = 0; i < CACHE_HTABLE_SIZE; i++) {
+ struct hlist_head *l = &cache_htable[i];
+ struct cache_entry *ce;
- hlist_for_each_entry_rcu(ce, l, ce_hlist)
+ hlist_for_each_entry_rcu(ce, l, hlist)
flush_cache_ent(ce);
}
rcu_read_unlock();
@@ -159,35 +161,35 @@ static void flush_cache_ents(void)
static int dfscache_proc_show(struct seq_file *m, void *v)
{
int bucket;
- struct dfs_cache_entry *ce;
- struct dfs_cache_tgt *t;
+ struct cache_entry *ce;
+ struct cache_dfs_tgt *t;
seq_puts(m, "DFS cache\n---------\n");
- mutex_lock(&dfs_cache_list_lock);
+ mutex_lock(&list_lock);
rcu_read_lock();
- hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) {
+ hash_for_each_rcu(cache_htable, bucket, ce, hlist) {
seq_printf(m,
"cache entry: path=%s,type=%s,ttl=%d,etime=%ld,"
"interlink=%s,path_consumed=%d,expired=%s\n",
- ce->ce_path,
- ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link",
- ce->ce_ttl, ce->ce_etime.tv_nsec,
- IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no",
- ce->ce_path_consumed,
+ ce->path,
+ ce->srvtype == DFS_TYPE_ROOT ? "root" : "link",
+ ce->ttl, ce->etime.tv_nsec,
+ IS_INTERLINK_SET(ce->flags) ? "yes" : "no",
+ ce->path_consumed,
cache_entry_expired(ce) ? "yes" : "no");
- list_for_each_entry(t, &ce->ce_tlist, t_list) {
+ list_for_each_entry(t, &ce->tlist, list) {
seq_printf(m, " %s%s\n",
- t->t_name,
- ce->ce_tgthint == t ? " (target hint)" : "");
+ t->name,
+ ce->tgthint == t ? " (target hint)" : "");
}
}
rcu_read_unlock();
- mutex_unlock(&dfs_cache_list_lock);
+ mutex_unlock(&list_lock);
return 0;
}
@@ -205,9 +207,9 @@ static ssize_t dfscache_proc_write(struct file *file, const char __user *buffer,
return -EINVAL;
cifs_dbg(FYI, "clearing dfs cache");
- mutex_lock(&dfs_cache_list_lock);
+ mutex_lock(&list_lock);
flush_cache_ents();
- mutex_unlock(&dfs_cache_list_lock);
+ mutex_unlock(&list_lock);
return count;
}
@@ -226,25 +228,25 @@ const struct file_operations dfscache_proc_fops = {
};
#ifdef CONFIG_CIFS_DEBUG2
-static inline void dump_tgts(const struct dfs_cache_entry *ce)
+static inline void dump_tgts(const struct cache_entry *ce)
{
- struct dfs_cache_tgt *t;
+ struct cache_dfs_tgt *t;
cifs_dbg(FYI, "target list:\n");
- list_for_each_entry(t, &ce->ce_tlist, t_list) {
- cifs_dbg(FYI, " %s%s\n", t->t_name,
- ce->ce_tgthint == t ? " (target hint)" : "");
+ list_for_each_entry(t, &ce->tlist, list) {
+ cifs_dbg(FYI, " %s%s\n", t->name,
+ ce->tgthint == t ? " (target hint)" : "");
}
}
-static inline void dump_ce(const struct dfs_cache_entry *ce)
+static inline void dump_ce(const struct cache_entry *ce)
{
cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,"
- "interlink=%s,path_consumed=%d,expired=%s\n", ce->ce_path,
- ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ce_ttl,
- ce->ce_etime.tv_nsec,
- IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no",
- ce->ce_path_consumed,
+ "interlink=%s,path_consumed=%d,expired=%s\n", ce->path,
+ ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl,
+ ce->etime.tv_nsec,
+ IS_INTERLINK_SET(ce->flags) ? "yes" : "no",
+ ce->path_consumed,
cache_entry_expired(ce) ? "yes" : "no");
dump_tgts(ce);
}
@@ -284,25 +286,33 @@ static inline void dump_refs(const struct dfs_info3_param *refs, int numrefs)
*/
int dfs_cache_init(void)
{
+ int rc;
int i;
- dfs_cache_slab = kmem_cache_create("cifs_dfs_cache",
- sizeof(struct dfs_cache_entry), 0,
- SLAB_HWCACHE_ALIGN, NULL);
- if (!dfs_cache_slab)
+ dfscache_wq = alloc_workqueue("cifs-dfscache",
+ WQ_FREEZABLE | WQ_MEM_RECLAIM, 1);
+ if (!dfscache_wq)
return -ENOMEM;
- for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++)
- INIT_HLIST_HEAD(&dfs_cache_htable[i]);
+ cache_slab = kmem_cache_create("cifs_dfs_cache",
+ sizeof(struct cache_entry), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!cache_slab) {
+ rc = -ENOMEM;
+ goto out_destroy_wq;
+ }
- INIT_LIST_HEAD(&dfs_cache.dc_vol_list);
- mutex_init(&dfs_cache.dc_lock);
- INIT_DELAYED_WORK(&dfs_cache.dc_refresh, refresh_cache_worker);
- dfs_cache.dc_ttl = -1;
- dfs_cache.dc_nlsc = load_nls_default();
+ for (i = 0; i < CACHE_HTABLE_SIZE; i++)
+ INIT_HLIST_HEAD(&cache_htable[i]);
+
+ cache_nlsc = load_nls_default();
cifs_dbg(FYI, "%s: initialized DFS referral cache\n", __func__);
return 0;
+
+out_destroy_wq:
+ destroy_workqueue(dfscache_wq);
+ return rc;
}
static inline unsigned int cache_entry_hash(const void *data, int size)
@@ -310,7 +320,7 @@ static inline unsigned int cache_entry_hash(const void *data, int size)
unsigned int h;
h = jhash(data, size, 0);
- return h & (DFS_CACHE_HTABLE_SIZE - 1);
+ return h & (CACHE_HTABLE_SIZE - 1);
}
/* Check whether second path component of @path is SYSVOL or NETLOGON */
@@ -325,11 +335,11 @@ static inline bool is_sysvol_or_netlogon(const char *path)
}
/* Return target hint of a DFS cache entry */
-static inline char *get_tgt_name(const struct dfs_cache_entry *ce)
+static inline char *get_tgt_name(const struct cache_entry *ce)
{
- struct dfs_cache_tgt *t = ce->ce_tgthint;
+ struct cache_dfs_tgt *t = ce->tgthint;
- return t ? t->t_name : ERR_PTR(-ENOENT);
+ return t ? t->name : ERR_PTR(-ENOENT);
}
/* Return expire time out of a new entry's TTL */
@@ -346,19 +356,19 @@ static inline struct timespec64 get_expire_time(int ttl)
}
/* Allocate a new DFS target */
-static inline struct dfs_cache_tgt *alloc_tgt(const char *name)
+static inline struct cache_dfs_tgt *alloc_tgt(const char *name)
{
- struct dfs_cache_tgt *t;
+ struct cache_dfs_tgt *t;
t = kmalloc(sizeof(*t), GFP_KERNEL);
if (!t)
return ERR_PTR(-ENOMEM);
- t->t_name = kstrndup(name, strlen(name), GFP_KERNEL);
- if (!t->t_name) {
+ t->name = kstrndup(name, strlen(name), GFP_KERNEL);
+ if (!t->name) {
kfree(t);
return ERR_PTR(-ENOMEM);
}
- INIT_LIST_HEAD(&t->t_list);
+ INIT_LIST_HEAD(&t->list);
return t;
}
@@ -367,63 +377,63 @@ static inline struct dfs_cache_tgt *alloc_tgt(const char *name)
* target hint.
*/
static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs,
- struct dfs_cache_entry *ce, const char *tgthint)
+ struct cache_entry *ce, const char *tgthint)
{
int i;
- ce->ce_ttl = refs[0].ttl;
- ce->ce_etime = get_expire_time(ce->ce_ttl);
- ce->ce_srvtype = refs[0].server_type;
- ce->ce_flags = refs[0].ref_flag;
- ce->ce_path_consumed = refs[0].path_consumed;
+ ce->ttl = refs[0].ttl;
+ ce->etime = get_expire_time(ce->ttl);
+ ce->srvtype = refs[0].server_type;
+ ce->flags = refs[0].ref_flag;
+ ce->path_consumed = refs[0].path_consumed;
for (i = 0; i < numrefs; i++) {
- struct dfs_cache_tgt *t;
+ struct cache_dfs_tgt *t;
t = alloc_tgt(refs[i].node_name);
if (IS_ERR(t)) {
free_tgts(ce);
return PTR_ERR(t);
}
- if (tgthint && !strcasecmp(t->t_name, tgthint)) {
- list_add(&t->t_list, &ce->ce_tlist);
+ if (tgthint && !strcasecmp(t->name, tgthint)) {
+ list_add(&t->list, &ce->tlist);
tgthint = NULL;
} else {
- list_add_tail(&t->t_list, &ce->ce_tlist);
+ list_add_tail(&t->list, &ce->tlist);
}
- ce->ce_numtgts++;
+ ce->numtgts++;
}
- ce->ce_tgthint = list_first_entry_or_null(&ce->ce_tlist,
- struct dfs_cache_tgt, t_list);
+ ce->tgthint = list_first_entry_or_null(&ce->tlist,
+ struct cache_dfs_tgt, list);
return 0;
}
/* Allocate a new cache entry */
-static struct dfs_cache_entry *
-alloc_cache_entry(const char *path, const struct dfs_info3_param *refs,
- int numrefs)
+static struct cache_entry *alloc_cache_entry(const char *path,
+ const struct dfs_info3_param *refs,
+ int numrefs)
{
- struct dfs_cache_entry *ce;
+ struct cache_entry *ce;
int rc;
- ce = kmem_cache_zalloc(dfs_cache_slab, GFP_KERNEL);
+ ce = kmem_cache_zalloc(cache_slab, GFP_KERNEL);
if (!ce)
return ERR_PTR(-ENOMEM);
- ce->ce_path = kstrdup_const(path, GFP_KERNEL);
- if (!ce->ce_path) {
- kmem_cache_free(dfs_cache_slab, ce);
+ ce->path = kstrndup(path, strlen(path), GFP_KERNEL);
+ if (!ce->path) {
+ kmem_cache_free(cache_slab, ce);
return ERR_PTR(-ENOMEM);
}
- INIT_HLIST_NODE(&ce->ce_hlist);
- INIT_LIST_HEAD(&ce->ce_tlist);
+ INIT_HLIST_NODE(&ce->hlist);
+ INIT_LIST_HEAD(&ce->tlist);
rc = copy_ref_data(refs, numrefs, ce, NULL);
if (rc) {
- kfree_const(ce->ce_path);
- kmem_cache_free(dfs_cache_slab, ce);
+ kfree(ce->path);
+ kmem_cache_free(cache_slab, ce);
ce = ERR_PTR(rc);
}
return ce;
@@ -432,13 +442,13 @@ alloc_cache_entry(const char *path, const struct dfs_info3_param *refs,
static void remove_oldest_entry(void)
{
int bucket;
- struct dfs_cache_entry *ce;
- struct dfs_cache_entry *to_del = NULL;
+ struct cache_entry *ce;
+ struct cache_entry *to_del = NULL;
rcu_read_lock();
- hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) {
- if (!to_del || timespec64_compare(&ce->ce_etime,
- &to_del->ce_etime) < 0)
+ hash_for_each_rcu(cache_htable, bucket, ce, hlist) {
+ if (!to_del || timespec64_compare(&ce->etime,
+ &to_del->etime) < 0)
to_del = ce;
}
if (!to_del) {
@@ -453,94 +463,96 @@ out:
}
/* Add a new DFS cache entry */
-static inline struct dfs_cache_entry *
+static inline struct cache_entry *
add_cache_entry(unsigned int hash, const char *path,
const struct dfs_info3_param *refs, int numrefs)
{
- struct dfs_cache_entry *ce;
+ struct cache_entry *ce;
ce = alloc_cache_entry(path, refs, numrefs);
if (IS_ERR(ce))
return ce;
- hlist_add_head_rcu(&ce->ce_hlist, &dfs_cache_htable[hash]);
+ hlist_add_head_rcu(&ce->hlist, &cache_htable[hash]);
- mutex_lock(&dfs_cache.dc_lock);
- if (dfs_cache.dc_ttl < 0) {
- dfs_cache.dc_ttl = ce->ce_ttl;
- queue_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh,
- dfs_cache.dc_ttl * HZ);
+ spin_lock(&cache_ttl_lock);
+ if (!cache_ttl) {
+ cache_ttl = ce->ttl;
+ queue_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ);
} else {
- dfs_cache.dc_ttl = min_t(int, dfs_cache.dc_ttl, ce->ce_ttl);
- mod_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh,
- dfs_cache.dc_ttl * HZ);
+ cache_ttl = min_t(int, cache_ttl, ce->ttl);
+ mod_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ);
}
- mutex_unlock(&dfs_cache.dc_lock);
+ spin_unlock(&cache_ttl_lock);
return ce;
}
-static struct dfs_cache_entry *__find_cache_entry(unsigned int hash,
- const char *path)
+/*
+ * Find a DFS cache entry in hash table and optionally check prefix path against
+ * @path.
+ * Use whole path components in the match.
+ * Return ERR_PTR(-ENOENT) if the entry is not found.
+ */
+static struct cache_entry *lookup_cache_entry(const char *path,
+ unsigned int *hash)
{
- struct dfs_cache_entry *ce;
+ struct cache_entry *ce;
+ unsigned int h;
bool found = false;
- rcu_read_lock();
- hlist_for_each_entry_rcu(ce, &dfs_cache_htable[hash], ce_hlist) {
- if (!strcasecmp(path, ce->ce_path)) {
-#ifdef CONFIG_CIFS_DEBUG2
- char *name = get_tgt_name(ce);
+ h = cache_entry_hash(path, strlen(path));
- if (IS_ERR(name)) {
- rcu_read_unlock();
- return ERR_CAST(name);
- }
- cifs_dbg(FYI, "%s: cache hit\n", __func__);
- cifs_dbg(FYI, "%s: target hint: %s\n", __func__, name);
-#endif
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(ce, &cache_htable[h], hlist) {
+ if (!strcasecmp(path, ce->path)) {
found = true;
+ dump_ce(ce);
break;
}
}
rcu_read_unlock();
- return found ? ce : ERR_PTR(-ENOENT);
-}
-/*
- * Find a DFS cache entry in hash table and optionally check prefix path against
- * @path.
- * Use whole path components in the match.
- * Return ERR_PTR(-ENOENT) if the entry is not found.
- */
-static inline struct dfs_cache_entry *find_cache_entry(const char *path,
- unsigned int *hash)
-{
- *hash = cache_entry_hash(path, strlen(path));
- return __find_cache_entry(*hash, path);
+ if (!found)
+ ce = ERR_PTR(-ENOENT);
+ if (hash)
+ *hash = h;
+
+ return ce;
}
static inline void destroy_slab_cache(void)
{
rcu_barrier();
- kmem_cache_destroy(dfs_cache_slab);
+ kmem_cache_destroy(cache_slab);
}
-static inline void free_vol(struct dfs_cache_vol_info *vi)
+static void __vol_release(struct vol_info *vi)
{
- list_del(&vi->vi_list);
- kfree(vi->vi_fullpath);
- kfree(vi->vi_mntdata);
- cifs_cleanup_volume_info_contents(&vi->vi_vol);
+ kfree(vi->fullpath);
+ kfree(vi->mntdata);
+ cifs_cleanup_volume_info_contents(&vi->smb_vol);
kfree(vi);
}
+static void vol_release(struct kref *kref)
+{
+ struct vol_info *vi = container_of(kref, struct vol_info, refcnt);
+
+ spin_lock(&vol_list_lock);
+ list_del(&vi->list);
+ spin_unlock(&vol_list_lock);
+ __vol_release(vi);
+}
+
static inline void free_vol_list(void)
{
- struct dfs_cache_vol_info *vi, *nvi;
+ struct vol_info *vi, *nvi;
- list_for_each_entry_safe(vi, nvi, &dfs_cache.dc_vol_list, vi_list)
- free_vol(vi);
+ list_for_each_entry_safe(vi, nvi, &vol_list, list) {
+ list_del_init(&vi->list);
+ __vol_release(vi);
+ }
}
/**
@@ -548,40 +560,38 @@ static inline void free_vol_list(void)
*/
void dfs_cache_destroy(void)
{
- cancel_delayed_work_sync(&dfs_cache.dc_refresh);
- unload_nls(dfs_cache.dc_nlsc);
+ cancel_delayed_work_sync(&refresh_task);
+ unload_nls(cache_nlsc);
free_vol_list();
- mutex_destroy(&dfs_cache.dc_lock);
-
flush_cache_ents();
destroy_slab_cache();
- mutex_destroy(&dfs_cache_list_lock);
+ destroy_workqueue(dfscache_wq);
cifs_dbg(FYI, "%s: destroyed DFS referral cache\n", __func__);
}
-static inline struct dfs_cache_entry *
+static inline struct cache_entry *
__update_cache_entry(const char *path, const struct dfs_info3_param *refs,
int numrefs)
{
int rc;
unsigned int h;
- struct dfs_cache_entry *ce;
+ struct cache_entry *ce;
char *s, *th = NULL;
- ce = find_cache_entry(path, &h);
+ ce = lookup_cache_entry(path, &h);
if (IS_ERR(ce))
return ce;
- if (ce->ce_tgthint) {
- s = ce->ce_tgthint->t_name;
+ if (ce->tgthint) {
+ s = ce->tgthint->name;
th = kstrndup(s, strlen(s), GFP_KERNEL);
if (!th)
return ERR_PTR(-ENOMEM);
}
free_tgts(ce);
- ce->ce_numtgts = 0;
+ ce->numtgts = 0;
rc = copy_ref_data(refs, numrefs, ce, th);
kfree(th);
@@ -593,10 +603,10 @@ __update_cache_entry(const char *path, const struct dfs_info3_param *refs,
}
/* Update an expired cache entry by getting a new DFS referral from server */
-static struct dfs_cache_entry *
+static struct cache_entry *
update_cache_entry(const unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_codepage, int remap,
- const char *path, struct dfs_cache_entry *ce)
+ const char *path, struct cache_entry *ce)
{
int rc;
struct dfs_info3_param *refs = NULL;
@@ -636,20 +646,20 @@ update_cache_entry(const unsigned int xid, struct cifs_ses *ses,
* For interlinks, __cifs_dfs_mount() and expand_dfs_referral() are supposed to
* handle them properly.
*/
-static struct dfs_cache_entry *
+static struct cache_entry *
do_dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
const struct nls_table *nls_codepage, int remap,
const char *path, bool noreq)
{
int rc;
unsigned int h;
- struct dfs_cache_entry *ce;
+ struct cache_entry *ce;
struct dfs_info3_param *nrefs;
int numnrefs;
cifs_dbg(FYI, "%s: search path: %s\n", __func__, path);
- ce = find_cache_entry(path, &h);
+ ce = lookup_cache_entry(path, &h);
if (IS_ERR(ce)) {
cifs_dbg(FYI, "%s: cache miss\n", __func__);
/*
@@ -690,9 +700,9 @@ do_dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
cifs_dbg(FYI, "%s: new cache entry\n", __func__);
- if (dfs_cache_count >= DFS_CACHE_MAX_ENTRIES) {
+ if (cache_count >= CACHE_MAX_ENTRIES) {
cifs_dbg(FYI, "%s: reached max cache size (%d)",
- __func__, DFS_CACHE_MAX_ENTRIES);
+ __func__, CACHE_MAX_ENTRIES);
remove_oldest_entry();
}
ce = add_cache_entry(h, path, nrefs, numnrefs);
@@ -701,7 +711,7 @@ do_dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
if (IS_ERR(ce))
return ce;
- dfs_cache_count++;
+ cache_count++;
}
dump_ce(ce);
@@ -723,7 +733,7 @@ do_dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
}
/* Set up a new DFS referral from a given cache entry */
-static int setup_ref(const char *path, const struct dfs_cache_entry *ce,
+static int setup_ref(const char *path, const struct cache_entry *ce,
struct dfs_info3_param *ref, const char *tgt)
{
int rc;
@@ -736,7 +746,7 @@ static int setup_ref(const char *path, const struct dfs_cache_entry *ce,
if (!ref->path_name)
return -ENOMEM;
- ref->path_consumed = ce->ce_path_consumed;
+ ref->path_consumed = ce->path_consumed;
ref->node_name = kstrndup(tgt, strlen(tgt), GFP_KERNEL);
if (!ref->node_name) {
@@ -744,9 +754,9 @@ static int setup_ref(const char *path, const struct dfs_cache_entry *ce,
goto err_free_path;
}
- ref->ttl = ce->ce_ttl;
- ref->server_type = ce->ce_srvtype;
- ref->ref_flag = ce->ce_flags;
+ ref->ttl = ce->ttl;
+ ref->server_type = ce->srvtype;
+ ref->ref_flag = ce->flags;
return 0;
@@ -757,25 +767,25 @@ err_free_path:
}
/* Return target list of a DFS cache entry */
-static int get_tgt_list(const struct dfs_cache_entry *ce,
+static int get_tgt_list(const struct cache_entry *ce,
struct dfs_cache_tgt_list *tl)
{
int rc;
struct list_head *head = &tl->tl_list;
- struct dfs_cache_tgt *t;
+ struct cache_dfs_tgt *t;
struct dfs_cache_tgt_iterator *it, *nit;
memset(tl, 0, sizeof(*tl));
INIT_LIST_HEAD(head);
- list_for_each_entry(t, &ce->ce_tlist, t_list) {
+ list_for_each_entry(t, &ce->tlist, list) {
it = kzalloc(sizeof(*it), GFP_KERNEL);
if (!it) {
rc = -ENOMEM;
goto err_free_it;
}
- it->it_name = kstrndup(t->t_name, strlen(t->t_name),
+ it->it_name = kstrndup(t->name, strlen(t->name),
GFP_KERNEL);
if (!it->it_name) {
kfree(it);
@@ -783,12 +793,12 @@ static int get_tgt_list(const struct dfs_cache_entry *ce,
goto err_free_it;
}
- if (ce->ce_tgthint == t)
+ if (ce->tgthint == t)
list_add(&it->it_list, head);
else
list_add_tail(&it->it_list, head);
}
- tl->tl_numtgts = ce->ce_numtgts;
+ tl->tl_numtgts = ce->numtgts;
return 0;
@@ -829,16 +839,13 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
{
int rc;
char *npath;
- struct dfs_cache_entry *ce;
-
- if (unlikely(!is_path_valid(path)))
- return -EINVAL;
+ struct cache_entry *ce;
rc = get_normalized_path(path, &npath);
if (rc)
return rc;
- mutex_lock(&dfs_cache_list_lock);
+ mutex_lock(&list_lock);
ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
if (!IS_ERR(ce)) {
if (ref)
@@ -850,7 +857,7 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses,
} else {
rc = PTR_ERR(ce);
}
- mutex_unlock(&dfs_cache_list_lock);
+ mutex_unlock(&list_lock);
free_normalized_path(path, npath);
return rc;
}
@@ -876,16 +883,13 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
{
int rc;
char *npath;
- struct dfs_cache_entry *ce;
-
- if (unlikely(!is_path_valid(path)))
- return -EINVAL;
+ struct cache_entry *ce;
rc = get_normalized_path(path, &npath);
if (rc)
return rc;
- mutex_lock(&dfs_cache_list_lock);
+ mutex_lock(&list_lock);
ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true);
if (IS_ERR(ce)) {
rc = PTR_ERR(ce);
@@ -899,7 +903,7 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref,
if (!rc && tgt_list)
rc = get_tgt_list(ce, tgt_list);
out:
- mutex_unlock(&dfs_cache_list_lock);
+ mutex_unlock(&list_lock);
free_normalized_path(path, npath);
return rc;
}
@@ -929,11 +933,8 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
{
int rc;
char *npath;
- struct dfs_cache_entry *ce;
- struct dfs_cache_tgt *t;
-
- if (unlikely(!is_path_valid(path)))
- return -EINVAL;
+ struct cache_entry *ce;
+ struct cache_dfs_tgt *t;
rc = get_normalized_path(path, &npath);
if (rc)
@@ -941,7 +942,7 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
- mutex_lock(&dfs_cache_list_lock);
+ mutex_lock(&list_lock);
ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false);
if (IS_ERR(ce)) {
rc = PTR_ERR(ce);
@@ -950,14 +951,14 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
rc = 0;
- t = ce->ce_tgthint;
+ t = ce->tgthint;
- if (likely(!strcasecmp(it->it_name, t->t_name)))
+ if (likely(!strcasecmp(it->it_name, t->name)))
goto out;
- list_for_each_entry(t, &ce->ce_tlist, t_list) {
- if (!strcasecmp(t->t_name, it->it_name)) {
- ce->ce_tgthint = t;
+ list_for_each_entry(t, &ce->tlist, list) {
+ if (!strcasecmp(t->name, it->it_name)) {
+ ce->tgthint = t;
cifs_dbg(FYI, "%s: new target hint: %s\n", __func__,
it->it_name);
break;
@@ -965,7 +966,7 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses,
}
out:
- mutex_unlock(&dfs_cache_list_lock);
+ mutex_unlock(&list_lock);
free_normalized_path(path, npath);
return rc;
}
@@ -989,10 +990,10 @@ int dfs_cache_noreq_update_tgthint(const char *path,
{
int rc;
char *npath;
- struct dfs_cache_entry *ce;
- struct dfs_cache_tgt *t;
+ struct cache_entry *ce;
+ struct cache_dfs_tgt *t;
- if (unlikely(!is_path_valid(path)) || !it)
+ if (!it)
return -EINVAL;
rc = get_normalized_path(path, &npath);
@@ -1001,7 +1002,7 @@ int dfs_cache_noreq_update_tgthint(const char *path,
cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
- mutex_lock(&dfs_cache_list_lock);
+ mutex_lock(&list_lock);
ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true);
if (IS_ERR(ce)) {
@@ -1011,14 +1012,14 @@ int dfs_cache_noreq_update_tgthint(const char *path,
rc = 0;
- t = ce->ce_tgthint;
+ t = ce->tgthint;
- if (unlikely(!strcasecmp(it->it_name, t->t_name)))
+ if (unlikely(!strcasecmp(it->it_name, t->name)))
goto out;
- list_for_each_entry(t, &ce->ce_tlist, t_list) {
- if (!strcasecmp(t->t_name, it->it_name)) {
- ce->ce_tgthint = t;
+ list_for_each_entry(t, &ce->tlist, list) {
+ if (!strcasecmp(t->name, it->it_name)) {
+ ce->tgthint = t;
cifs_dbg(FYI, "%s: new target hint: %s\n", __func__,
it->it_name);
break;
@@ -1026,7 +1027,7 @@ int dfs_cache_noreq_update_tgthint(const char *path,
}
out:
- mutex_unlock(&dfs_cache_list_lock);
+ mutex_unlock(&list_lock);
free_normalized_path(path, npath);
return rc;
}
@@ -1047,13 +1048,11 @@ int dfs_cache_get_tgt_referral(const char *path,
{
int rc;
char *npath;
- struct dfs_cache_entry *ce;
+ struct cache_entry *ce;
unsigned int h;
if (!it || !ref)
return -EINVAL;
- if (unlikely(!is_path_valid(path)))
- return -EINVAL;
rc = get_normalized_path(path, &npath);
if (rc)
@@ -1061,9 +1060,9 @@ int dfs_cache_get_tgt_referral(const char *path,
cifs_dbg(FYI, "%s: path: %s\n", __func__, npath);
- mutex_lock(&dfs_cache_list_lock);
+ mutex_lock(&list_lock);
- ce = find_cache_entry(npath, &h);
+ ce = lookup_cache_entry(npath, &h);
if (IS_ERR(ce)) {
rc = PTR_ERR(ce);
goto out;
@@ -1074,7 +1073,7 @@ int dfs_cache_get_tgt_referral(const char *path,
rc = setup_ref(path, ce, ref, it->it_name);
out:
- mutex_unlock(&dfs_cache_list_lock);
+ mutex_unlock(&list_lock);
free_normalized_path(path, npath);
return rc;
}
@@ -1085,7 +1084,7 @@ static int dup_vol(struct smb_vol *vol, struct smb_vol *new)
if (vol->username) {
new->username = kstrndup(vol->username, strlen(vol->username),
- GFP_KERNEL);
+ GFP_KERNEL);
if (!new->username)
return -ENOMEM;
}
@@ -1103,7 +1102,7 @@ static int dup_vol(struct smb_vol *vol, struct smb_vol *new)
}
if (vol->domainname) {
new->domainname = kstrndup(vol->domainname,
- strlen(vol->domainname), GFP_KERNEL);
+ strlen(vol->domainname), GFP_KERNEL);
if (!new->domainname)
goto err_free_unc;
}
@@ -1150,7 +1149,7 @@ err_free_username:
int dfs_cache_add_vol(char *mntdata, struct smb_vol *vol, const char *fullpath)
{
int rc;
- struct dfs_cache_vol_info *vi;
+ struct vol_info *vi;
if (!vol || !fullpath || !mntdata)
return -EINVAL;
@@ -1161,38 +1160,41 @@ int dfs_cache_add_vol(char *mntdata, struct smb_vol *vol, const char *fullpath)
if (!vi)
return -ENOMEM;
- vi->vi_fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
- if (!vi->vi_fullpath) {
+ vi->fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL);
+ if (!vi->fullpath) {
rc = -ENOMEM;
goto err_free_vi;
}
- rc = dup_vol(vol, &vi->vi_vol);
+ rc = dup_vol(vol, &vi->smb_vol);
if (rc)
goto err_free_fullpath;
- vi->vi_mntdata = mntdata;
+ vi->mntdata = mntdata;
+ spin_lock_init(&vi->smb_vol_lock);
+ kref_init(&vi->refcnt);
+
+ spin_lock(&vol_list_lock);
+ list_add_tail(&vi->list, &vol_list);
+ spin_unlock(&vol_list_lock);
- mutex_lock(&dfs_cache.dc_lock);
- list_add_tail(&vi->vi_list, &dfs_cache.dc_vol_list);
- mutex_unlock(&dfs_cache.dc_lock);
return 0;
err_free_fullpath:
- kfree(vi->vi_fullpath);
+ kfree(vi->fullpath);
err_free_vi:
kfree(vi);
return rc;
}
-static inline struct dfs_cache_vol_info *find_vol(const char *fullpath)
+/* Must be called with vol_list_lock held */
+static struct vol_info *find_vol(const char *fullpath)
{
- struct dfs_cache_vol_info *vi;
+ struct vol_info *vi;
- list_for_each_entry(vi, &dfs_cache.dc_vol_list, vi_list) {
- cifs_dbg(FYI, "%s: vi->vi_fullpath: %s\n", __func__,
- vi->vi_fullpath);
- if (!strcasecmp(vi->vi_fullpath, fullpath))
+ list_for_each_entry(vi, &vol_list, list) {
+ cifs_dbg(FYI, "%s: vi->fullpath: %s\n", __func__, vi->fullpath);
+ if (!strcasecmp(vi->fullpath, fullpath))
return vi;
}
return ERR_PTR(-ENOENT);
@@ -1208,30 +1210,31 @@ static inline struct dfs_cache_vol_info *find_vol(const char *fullpath)
*/
int dfs_cache_update_vol(const char *fullpath, struct TCP_Server_Info *server)
{
- int rc;
- struct dfs_cache_vol_info *vi;
+ struct vol_info *vi;
if (!fullpath || !server)
return -EINVAL;
cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
- mutex_lock(&dfs_cache.dc_lock);
-
+ spin_lock(&vol_list_lock);
vi = find_vol(fullpath);
if (IS_ERR(vi)) {
- rc = PTR_ERR(vi);
- goto out;
+ spin_unlock(&vol_list_lock);
+ return PTR_ERR(vi);
}
+ kref_get(&vi->refcnt);
+ spin_unlock(&vol_list_lock);
cifs_dbg(FYI, "%s: updating volume info\n", __func__);
- memcpy(&vi->vi_vol.dstaddr, &server->dstaddr,
- sizeof(vi->vi_vol.dstaddr));
- rc = 0;
+ spin_lock(&vi->smb_vol_lock);
+ memcpy(&vi->smb_vol.dstaddr, &server->dstaddr,
+ sizeof(vi->smb_vol.dstaddr));
+ spin_unlock(&vi->smb_vol_lock);
-out:
- mutex_unlock(&dfs_cache.dc_lock);
- return rc;
+ kref_put(&vi->refcnt, vol_release);
+
+ return 0;
}
/**
@@ -1241,18 +1244,18 @@ out:
*/
void dfs_cache_del_vol(const char *fullpath)
{
- struct dfs_cache_vol_info *vi;
+ struct vol_info *vi;
if (!fullpath || !*fullpath)
return;
cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath);
- mutex_lock(&dfs_cache.dc_lock);
+ spin_lock(&vol_list_lock);
vi = find_vol(fullpath);
- if (!IS_ERR(vi))
- free_vol(vi);
- mutex_unlock(&dfs_cache.dc_lock);
+ spin_unlock(&vol_list_lock);
+
+ kref_put(&vi->refcnt, vol_release);
}
/* Get all tcons that are within a DFS namespace and can be refreshed */
@@ -1280,7 +1283,7 @@ static void get_tcons(struct TCP_Server_Info *server, struct list_head *head)
spin_unlock(&cifs_tcp_ses_lock);
}
-static inline bool is_dfs_link(const char *path)
+static bool is_dfs_link(const char *path)
{
char *s;
@@ -1290,7 +1293,7 @@ static inline bool is_dfs_link(const char *path)
return !!strchr(s + 1, '\\');
}
-static inline char *get_dfs_root(const char *path)
+static char *get_dfs_root(const char *path)
{
char *s, *npath;
@@ -1309,9 +1312,34 @@ static inline char *get_dfs_root(const char *path)
return npath;
}
+static inline void put_tcp_server(struct TCP_Server_Info *server)
+{
+ cifs_put_tcp_session(server, 0);
+}
+
+static struct TCP_Server_Info *get_tcp_server(struct smb_vol *vol)
+{
+ struct TCP_Server_Info *server;
+
+ server = cifs_find_tcp_session(vol);
+ if (IS_ERR_OR_NULL(server))
+ return NULL;
+
+ spin_lock(&GlobalMid_Lock);
+ if (server->tcpStatus != CifsGood) {
+ spin_unlock(&GlobalMid_Lock);
+ put_tcp_server(server);
+ return NULL;
+ }
+ spin_unlock(&GlobalMid_Lock);
+
+ return server;
+}
+
/* Find root SMB session out of a DFS link path */
-static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi,
- struct cifs_tcon *tcon, const char *path)
+static struct cifs_ses *find_root_ses(struct vol_info *vi,
+ struct cifs_tcon *tcon,
+ const char *path)
{
char *rpath;
int rc;
@@ -1333,8 +1361,7 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi,
goto out;
}
- mdata = cifs_compose_mount_options(vi->vi_mntdata, rpath, &ref,
- &devname);
+ mdata = cifs_compose_mount_options(vi->mntdata, rpath, &ref, &devname);
free_dfs_info_param(&ref);
if (IS_ERR(mdata)) {
@@ -1351,13 +1378,8 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi,
goto out;
}
- server = cifs_find_tcp_session(&vol);
- if (IS_ERR_OR_NULL(server)) {
- ses = ERR_PTR(-EHOSTDOWN);
- goto out;
- }
- if (server->tcpStatus != CifsGood) {
- cifs_put_tcp_session(server, 0);
+ server = get_tcp_server(&vol);
+ if (!server) {
ses = ERR_PTR(-EHOSTDOWN);
goto out;
}
@@ -1373,14 +1395,13 @@ out:
}
/* Refresh DFS cache entry from a given tcon */
-static void do_refresh_tcon(struct dfs_cache *dc, struct dfs_cache_vol_info *vi,
- struct cifs_tcon *tcon)
+static void refresh_tcon(struct vol_info *vi, struct cifs_tcon *tcon)
{
int rc = 0;
unsigned int xid;
char *path, *npath;
unsigned int h;
- struct dfs_cache_entry *ce;
+ struct cache_entry *ce;
struct dfs_info3_param *refs = NULL;
int numrefs = 0;
struct cifs_ses *root_ses = NULL, *ses;
@@ -1393,9 +1414,9 @@ static void do_refresh_tcon(struct dfs_cache *dc, struct dfs_cache_vol_info *vi,
if (rc)
goto out;
- mutex_lock(&dfs_cache_list_lock);
- ce = find_cache_entry(npath, &h);
- mutex_unlock(&dfs_cache_list_lock);
+ mutex_lock(&list_lock);
+ ce = lookup_cache_entry(npath, &h);
+ mutex_unlock(&list_lock);
if (IS_ERR(ce)) {
rc = PTR_ERR(ce);
@@ -1421,12 +1442,12 @@ static void do_refresh_tcon(struct dfs_cache *dc, struct dfs_cache_vol_info *vi,
rc = -EOPNOTSUPP;
} else {
rc = ses->server->ops->get_dfs_refer(xid, ses, path, &refs,
- &numrefs, dc->dc_nlsc,
+ &numrefs, cache_nlsc,
tcon->remap);
if (!rc) {
- mutex_lock(&dfs_cache_list_lock);
+ mutex_lock(&list_lock);
ce = __update_cache_entry(npath, refs, numrefs);
- mutex_unlock(&dfs_cache_list_lock);
+ mutex_unlock(&list_lock);
dump_refs(refs, numrefs);
free_dfs_info_array(refs, numrefs);
if (IS_ERR(ce))
@@ -1448,30 +1469,52 @@ out:
*/
static void refresh_cache_worker(struct work_struct *work)
{
- struct dfs_cache *dc = container_of(work, struct dfs_cache,
- dc_refresh.work);
- struct dfs_cache_vol_info *vi;
+ struct vol_info *vi, *nvi;
struct TCP_Server_Info *server;
- LIST_HEAD(list);
+ LIST_HEAD(vols);
+ LIST_HEAD(tcons);
struct cifs_tcon *tcon, *ntcon;
- mutex_lock(&dc->dc_lock);
-
- list_for_each_entry(vi, &dc->dc_vol_list, vi_list) {
- server = cifs_find_tcp_session(&vi->vi_vol);
- if (IS_ERR_OR_NULL(server))
+ /*
+ * Find SMB volumes that are eligible (server->tcpStatus == CifsGood)
+ * for refreshing.
+ */
+ spin_lock(&vol_list_lock);
+ list_for_each_entry(vi, &vol_list, list) {
+ server = get_tcp_server(&vi->smb_vol);
+ if (!server)
continue;
- if (server->tcpStatus != CifsGood)
- goto next;
- get_tcons(server, &list);
- list_for_each_entry_safe(tcon, ntcon, &list, ulist) {
- do_refresh_tcon(dc, vi, tcon);
+
+ kref_get(&vi->refcnt);
+ list_add_tail(&vi->rlist, &vols);
+ put_tcp_server(server);
+ }
+ spin_unlock(&vol_list_lock);
+
+ /* Walk through all TCONs and refresh any expired cache entry */
+ list_for_each_entry_safe(vi, nvi, &vols, rlist) {
+ spin_lock(&vi->smb_vol_lock);
+ server = get_tcp_server(&vi->smb_vol);
+ spin_unlock(&vi->smb_vol_lock);
+
+ if (!server)
+ goto next_vol;
+
+ get_tcons(server, &tcons);
+ list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) {
+ refresh_tcon(vi, tcon);
list_del_init(&tcon->ulist);
cifs_put_tcon(tcon);
}
-next:
- cifs_put_tcp_session(server, 0);
+
+ put_tcp_server(server);
+
+next_vol:
+ list_del_init(&vi->rlist);
+ kref_put(&vi->refcnt, vol_release);
}
- queue_delayed_work(cifsiod_wq, &dc->dc_refresh, dc->dc_ttl * HZ);
- mutex_unlock(&dc->dc_lock);
+
+ spin_lock(&cache_ttl_lock);
+ queue_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ);
+ spin_unlock(&cache_ttl_lock);
}
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 9ae9a514676c..548047a709bf 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -357,13 +357,10 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid,
if (!tcon->unix_ext && (mode & S_IWUGO) == 0)
create_options |= CREATE_OPTION_READONLY;
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = desired_access;
- oparms.create_options = create_options;
+ oparms.create_options = cifs_create_options(cifs_sb, create_options);
oparms.disposition = disposition;
oparms.path = full_path;
oparms.fid = fid;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 03c85beecec1..a0b99c5e0721 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -223,9 +223,6 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
if (!buf)
return -ENOMEM;
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
/* O_SYNC also has bit for O_DSYNC so following check picks up either */
if (f_flags & O_SYNC)
create_options |= CREATE_WRITE_THROUGH;
@@ -236,7 +233,7 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb,
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = desired_access;
- oparms.create_options = create_options;
+ oparms.create_options = cifs_create_options(cifs_sb, create_options);
oparms.disposition = disposition;
oparms.path = full_path;
oparms.fid = fid;
@@ -751,9 +748,6 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
desired_access = cifs_convert_flags(cfile->f_flags);
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
/* O_SYNC also has bit for O_DSYNC so following check picks up either */
if (cfile->f_flags & O_SYNC)
create_options |= CREATE_WRITE_THROUGH;
@@ -767,7 +761,7 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush)
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = desired_access;
- oparms.create_options = create_options;
+ oparms.create_options = cifs_create_options(cifs_sb, create_options);
oparms.disposition = disposition;
oparms.path = full_path;
oparms.fid = &cfile->fid;
@@ -3194,6 +3188,9 @@ static ssize_t __cifs_writev(
ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from)
{
+ struct file *file = iocb->ki_filp;
+
+ cifs_revalidate_mapping(file->f_inode);
return __cifs_writev(iocb, from, true);
}
@@ -3874,6 +3871,15 @@ static ssize_t __cifs_readv(
len = ctx->len;
}
+ if (direct) {
+ rc = filemap_write_and_wait_range(file->f_inode->i_mapping,
+ offset, offset + len - 1);
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return -EAGAIN;
+ }
+ }
+
/* grab a lock here due to read response handlers can access ctx */
mutex_lock(&ctx->aio_mutex);
@@ -4504,9 +4510,9 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
io_error:
kunmap(page);
- unlock_page(page);
read_complete:
+ unlock_page(page);
return rc;
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index fd9e289f3e72..af0980c720c7 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -472,9 +472,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = GENERIC_READ;
- oparms.create_options = CREATE_NOT_DIR;
- if (backup_cred(cifs_sb))
- oparms.create_options |= CREATE_OPEN_BACKUP_INTENT;
+ oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
oparms.disposition = FILE_OPEN;
oparms.path = path;
oparms.fid = &fid;
@@ -1225,7 +1223,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry,
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = DELETE | FILE_WRITE_ATTRIBUTES;
- oparms.create_options = CREATE_NOT_DIR;
+ oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
oparms.disposition = FILE_OPEN;
oparms.path = full_path;
oparms.fid = &fid;
@@ -1763,7 +1761,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
oparms.cifs_sb = cifs_sb;
/* open the file to be renamed -- we need DELETE perms */
oparms.desired_access = DELETE;
- oparms.create_options = CREATE_NOT_DIR;
+ oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
oparms.disposition = FILE_OPEN;
oparms.path = from_path;
oparms.fid = &fid;
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 1a01e108d75e..bc08d856ee05 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -65,7 +65,7 @@ static long cifs_ioctl_query_info(unsigned int xid, struct file *filep,
if (tcon->ses->server->ops->ioctl_query_info)
rc = tcon->ses->server->ops->ioctl_query_info(
- xid, tcon, utf16_path,
+ xid, tcon, cifs_sb, utf16_path,
filep->private_data ? 0 : 1, p);
else
rc = -EOPNOTSUPP;
@@ -191,7 +191,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
rc = put_user(ExtAttrBits &
FS_FL_USER_VISIBLE,
(int __user *)arg);
- if (rc != EOPNOTSUPP)
+ if (rc != -EOPNOTSUPP)
break;
}
#endif /* CONFIG_CIFS_POSIX */
@@ -220,7 +220,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
* pSMBFile->fid.netfid,
* extAttrBits,
* &ExtAttrMask);
- * if (rc != EOPNOTSUPP)
+ * if (rc != -EOPNOTSUPP)
* break;
*/
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index a24bcbbb5033..02949a2f2860 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -318,7 +318,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = GENERIC_READ;
- oparms.create_options = CREATE_NOT_DIR;
+ oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
oparms.disposition = FILE_OPEN;
oparms.path = path;
oparms.fid = &fid;
@@ -356,15 +356,11 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid fid;
struct cifs_open_parms oparms;
struct cifs_io_parms io_parms;
- int create_options = CREATE_NOT_DIR;
-
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = GENERIC_WRITE;
- oparms.create_options = create_options;
+ oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
oparms.disposition = FILE_CREATE;
oparms.path = path;
oparms.fid = &fid;
@@ -405,9 +401,7 @@ smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = GENERIC_READ;
- oparms.create_options = CREATE_NOT_DIR;
- if (backup_cred(cifs_sb))
- oparms.create_options |= CREATE_OPEN_BACKUP_INTENT;
+ oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
oparms.disposition = FILE_OPEN;
oparms.fid = &fid;
oparms.reconnect = false;
@@ -460,14 +454,10 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid fid;
struct cifs_open_parms oparms;
struct cifs_io_parms io_parms;
- int create_options = CREATE_NOT_DIR;
__le16 *utf16_path;
__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
struct kvec iov[2];
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
cifs_dbg(FYI, "%s: path: %s\n", __func__, path);
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
@@ -477,10 +467,11 @@ smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = GENERIC_WRITE;
- oparms.create_options = create_options;
+ oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
oparms.disposition = FILE_CREATE;
oparms.fid = &fid;
oparms.reconnect = false;
+ oparms.mode = 0644;
rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL,
NULL);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 40ca394fd5de..db1fcdedf289 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -349,6 +349,10 @@ checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server)
cifs_dbg(VFS, "Length less than smb header size\n");
}
return -EIO;
+ } else if (total_read < sizeof(*smb) + 2 * smb->WordCount) {
+ cifs_dbg(VFS, "%s: can't read BCC due to invalid WordCount(%u)\n",
+ __func__, smb->WordCount);
+ return -EIO;
}
/* otherwise, there is enough to get to the BCC */
@@ -972,8 +976,8 @@ cifs_free_hash(struct crypto_shash **shash, struct sdesc **sdesc)
* Input: rqst - a smb_rqst, page - a page index for rqst
* Output: *len - the length for this page, *offset - the offset for this page
*/
-void rqst_page_get_length(struct smb_rqst *rqst, unsigned int page,
- unsigned int *len, unsigned int *offset)
+void rqst_page_get_length(const struct smb_rqst *rqst, unsigned int page,
+ unsigned int *len, unsigned int *offset)
{
*len = rqst->rq_pagesz;
*offset = (page == 0) ? rqst->rq_offset : 0;
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index e523c05a4487..b130efaf8feb 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -504,7 +504,8 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb_vol *volume_info)
}
static void
-cifs_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+cifs_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb)
{
CIFSSMBQFSDeviceInfo(xid, tcon);
CIFSSMBQFSAttributeInfo(xid, tcon);
@@ -565,7 +566,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.create_options = 0;
+ oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.disposition = FILE_OPEN;
oparms.path = full_path;
oparms.fid = &fid;
@@ -793,7 +794,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES;
- oparms.create_options = CREATE_NOT_DIR;
+ oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR);
oparms.disposition = FILE_OPEN;
oparms.path = full_path;
oparms.fid = &fid;
@@ -872,7 +873,7 @@ cifs_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
static int
cifs_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
- struct kstatfs *buf)
+ struct cifs_sb_info *cifs_sb, struct kstatfs *buf)
{
int rc = -EOPNOTSUPP;
@@ -970,7 +971,8 @@ cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.create_options = OPEN_REPARSE_POINT;
+ oparms.create_options = cifs_create_options(cifs_sb,
+ OPEN_REPARSE_POINT);
oparms.disposition = FILE_OPEN;
oparms.path = full_path;
oparms.fid = &fid;
@@ -1029,7 +1031,6 @@ cifs_make_node(unsigned int xid, struct inode *inode,
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct inode *newinode = NULL;
int rc = -EPERM;
- int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
FILE_ALL_INFO *buf = NULL;
struct cifs_io_parms io_parms;
__u32 oplock = 0;
@@ -1090,13 +1091,11 @@ cifs_make_node(unsigned int xid, struct inode *inode,
goto out;
}
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = GENERIC_WRITE;
- oparms.create_options = create_options;
+ oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR |
+ CREATE_OPTION_SPECIAL);
oparms.disposition = FILE_CREATE;
oparms.path = full_path;
oparms.fid = &fid;
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index f2a6f7f28340..c9abda93d65b 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -98,9 +98,7 @@ smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.desired_access = desired_access;
oparms.disposition = create_disposition;
- oparms.create_options = create_options;
- if (backup_cred(cifs_sb))
- oparms.create_options |= CREATE_OPEN_BACKUP_INTENT;
+ oparms.create_options = cifs_create_options(cifs_sb, create_options);
oparms.fid = &fid;
oparms.reconnect = false;
oparms.mode = mode;
@@ -456,7 +454,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
/* If it is a root and its handle is cached then use it */
if (!strlen(full_path) && !no_cached_open) {
- rc = open_shroot(xid, tcon, &fid);
+ rc = open_shroot(xid, tcon, cifs_sb, &fid);
if (rc)
goto out;
@@ -473,9 +471,6 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
goto out;
}
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_READ_ATTRIBUTES, FILE_OPEN, create_options,
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 7177720e822e..d3d5d2c6c401 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -302,6 +302,9 @@ static const bool has_smb2_data_area[NUMBER_OF_SMB2_COMMANDS] = {
char *
smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
{
+ const int max_off = 4096;
+ const int max_len = 128 * 1024;
+
*off = 0;
*len = 0;
@@ -369,29 +372,20 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_sync_hdr *shdr)
* Invalid length or offset probably means data area is invalid, but
* we have little choice but to ignore the data area in this case.
*/
- if (*off > 4096) {
- cifs_dbg(VFS, "offset %d too large, data area ignored\n", *off);
- *len = 0;
- *off = 0;
- } else if (*off < 0) {
- cifs_dbg(VFS, "negative offset %d to data invalid ignore data area\n",
- *off);
+ if (unlikely(*off < 0 || *off > max_off ||
+ *len < 0 || *len > max_len)) {
+ cifs_dbg(VFS, "%s: invalid data area (off=%d len=%d)\n",
+ __func__, *off, *len);
*off = 0;
*len = 0;
- } else if (*len < 0) {
- cifs_dbg(VFS, "negative data length %d invalid, data area ignored\n",
- *len);
- *len = 0;
- } else if (*len > 128 * 1024) {
- cifs_dbg(VFS, "data area larger than 128K: %d\n", *len);
+ } else if (*off == 0) {
*len = 0;
}
/* return pointer to beginning of data area, ie offset from SMB start */
- if ((*off != 0) && (*len != 0))
+ if (*off > 0 && *len > 0)
return (char *)shdr + *off;
- else
- return NULL;
+ return NULL;
}
/*
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 57164563eec6..5fea15a5f341 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -79,6 +79,7 @@ smb2_add_credits(struct TCP_Server_Info *server,
*val = 65000; /* Don't get near 64K credits, avoid srv bugs */
printk_once(KERN_WARNING "server overflowed SMB3 credits\n");
}
+ WARN_ON_ONCE(server->in_flight == 0);
server->in_flight--;
if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP)
rc = change_conf(server);
@@ -587,7 +588,7 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon)
if (rc == -EOPNOTSUPP) {
cifs_dbg(FYI,
"server does not support query network interfaces\n");
- goto out;
+ ret_data_len = 0;
} else if (rc != 0) {
cifs_tcon_dbg(VFS, "error %d on ioctl to get interface list\n", rc);
goto out;
@@ -644,7 +645,8 @@ smb2_cached_lease_break(struct work_struct *work)
/*
* Open the directory at the root of a share
*/
-int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
+int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb, struct cifs_fid *pfid)
{
struct cifs_ses *ses = tcon->ses;
struct TCP_Server_Info *server = ses->server;
@@ -696,7 +698,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid)
rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
oparms.tcon = tcon;
- oparms.create_options = 0;
+ oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.desired_access = FILE_READ_ATTRIBUTES;
oparms.disposition = FILE_OPEN;
oparms.fid = pfid;
@@ -812,7 +814,8 @@ oshr_free:
}
static void
-smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb)
{
int rc;
__le16 srch_path = 0; /* Null - open root of share */
@@ -821,18 +824,19 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
struct cifs_fid fid;
bool no_cached_open = tcon->nohandlecache;
- oparms.tcon = tcon;
- oparms.desired_access = FILE_READ_ATTRIBUTES;
- oparms.disposition = FILE_OPEN;
- oparms.create_options = 0;
- oparms.fid = &fid;
- oparms.reconnect = false;
+ oparms = (struct cifs_open_parms) {
+ .tcon = tcon,
+ .desired_access = FILE_READ_ATTRIBUTES,
+ .disposition = FILE_OPEN,
+ .create_options = cifs_create_options(cifs_sb, 0),
+ .fid = &fid,
+ };
if (no_cached_open)
rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL,
NULL);
else
- rc = open_shroot(xid, tcon, &fid);
+ rc = open_shroot(xid, tcon, cifs_sb, &fid);
if (rc)
return;
@@ -854,7 +858,8 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
}
static void
-smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
+smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb)
{
int rc;
__le16 srch_path = 0; /* Null - open root of share */
@@ -865,7 +870,7 @@ smb2_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon)
oparms.tcon = tcon;
oparms.desired_access = FILE_READ_ATTRIBUTES;
oparms.disposition = FILE_OPEN;
- oparms.create_options = 0;
+ oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.fid = &fid;
oparms.reconnect = false;
@@ -900,10 +905,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.desired_access = FILE_READ_ATTRIBUTES;
oparms.disposition = FILE_OPEN;
- if (backup_cred(cifs_sb))
- oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
- else
- oparms.create_options = 0;
+ oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.fid = &fid;
oparms.reconnect = false;
@@ -960,9 +962,7 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size,
size_t name_len, value_len, user_name_len;
while (src_size > 0) {
- name = &src->ea_data[0];
name_len = (size_t)src->ea_name_length;
- value = &src->ea_data[src->ea_name_length + 1];
value_len = (size_t)le16_to_cpu(src->ea_value_length);
if (name_len == 0)
@@ -974,6 +974,9 @@ move_smb2_ea_to_cifs(char *dst, size_t dst_size,
goto out;
}
+ name = &src->ea_data[0];
+ value = &src->ea_data[src->ea_name_length + 1];
+
if (ea_name) {
if (ea_name_len == name_len &&
memcmp(ea_name, name, name_len) == 0) {
@@ -1178,10 +1181,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.desired_access = FILE_WRITE_EA;
oparms.disposition = FILE_OPEN;
- if (backup_cred(cifs_sb))
- oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
- else
- oparms.create_options = 0;
+ oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.fid = &fid;
oparms.reconnect = false;
@@ -1215,6 +1215,8 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
COMPOUND_FID, current->tgid,
FILE_FULL_EA_INFORMATION,
SMB2_O_INFO_FILE, 0, data, size);
+ if (rc)
+ goto sea_exit;
smb2_set_next_command(tcon, &rqst[1]);
smb2_set_related(&rqst[1]);
@@ -1411,6 +1413,7 @@ req_res_key_exit:
static int
smb2_ioctl_query_info(const unsigned int xid,
struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
__le16 *path, int is_dir,
unsigned long p)
{
@@ -1436,6 +1439,7 @@ smb2_ioctl_query_info(const unsigned int xid,
struct kvec close_iov[1];
unsigned int size[2];
void *data[2];
+ int create_options = is_dir ? CREATE_NOT_FILE : CREATE_NOT_DIR;
memset(rqst, 0, sizeof(rqst));
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
@@ -1471,10 +1475,7 @@ smb2_ioctl_query_info(const unsigned int xid,
memset(&oparms, 0, sizeof(oparms));
oparms.tcon = tcon;
oparms.disposition = FILE_OPEN;
- if (is_dir)
- oparms.create_options = CREATE_NOT_FILE;
- else
- oparms.create_options = CREATE_NOT_DIR;
+ oparms.create_options = cifs_create_options(cifs_sb, create_options);
oparms.fid = &fid;
oparms.reconnect = false;
@@ -1678,7 +1679,7 @@ smb2_copychunk_range(const unsigned int xid,
pcchunk->SourceOffset = cpu_to_le64(src_off);
pcchunk->TargetOffset = cpu_to_le64(dest_off);
pcchunk->Length =
- cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk));
+ cpu_to_le32(min_t(u64, len, tcon->max_bytes_chunk));
/* Request server copy to target from src identified by key */
kfree(retbuf);
@@ -2071,10 +2072,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
oparms.disposition = FILE_OPEN;
- if (backup_cred(cifs_sb))
- oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
- else
- oparms.create_options = 0;
+ oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.fid = fid;
oparms.reconnect = false;
@@ -2275,10 +2273,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.desired_access = desired_access;
oparms.disposition = FILE_OPEN;
- if (cifs_sb && backup_cred(cifs_sb))
- oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
- else
- oparms.create_options = 0;
+ oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.fid = &fid;
oparms.reconnect = false;
@@ -2334,7 +2329,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
static int
smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
- struct kstatfs *buf)
+ struct cifs_sb_info *cifs_sb, struct kstatfs *buf)
{
struct smb2_query_info_rsp *rsp;
struct smb2_fs_full_size_info *info = NULL;
@@ -2349,7 +2344,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
FS_FULL_SIZE_INFORMATION,
SMB2_O_INFO_FILESYSTEM,
sizeof(struct smb2_fs_full_size_info),
- &rsp_iov, &buftype, NULL);
+ &rsp_iov, &buftype, cifs_sb);
if (rc)
goto qfs_exit;
@@ -2371,7 +2366,7 @@ qfs_exit:
static int
smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
- struct kstatfs *buf)
+ struct cifs_sb_info *cifs_sb, struct kstatfs *buf)
{
int rc;
__le16 srch_path = 0; /* Null - open root of share */
@@ -2380,12 +2375,12 @@ smb311_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid fid;
if (!tcon->posix_extensions)
- return smb2_queryfs(xid, tcon, buf);
+ return smb2_queryfs(xid, tcon, cifs_sb, buf);
oparms.tcon = tcon;
oparms.desired_access = FILE_READ_ATTRIBUTES;
oparms.disposition = FILE_OPEN;
- oparms.create_options = 0;
+ oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.fid = &fid;
oparms.reconnect = false;
@@ -2503,6 +2498,8 @@ smb2_get_dfs_refer(const unsigned int xid, struct cifs_ses *ses,
(char **)&dfs_rsp, &dfs_rsp_size);
} while (rc == -EAGAIN);
+ if (!rc && !dfs_rsp)
+ rc = -EIO;
if (rc) {
if ((rc != -ENOENT) && (rc != -EOPNOTSUPP))
cifs_tcon_dbg(VFS, "ioctl error in %s rc=%d\n", __func__, rc);
@@ -2654,6 +2651,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
struct smb2_create_rsp *create_rsp;
struct smb2_ioctl_rsp *ioctl_rsp;
struct reparse_data_buffer *reparse_buf;
+ int create_options = is_reparse_point ? OPEN_REPARSE_POINT : 0;
u32 plen;
cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
@@ -2680,14 +2678,7 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.desired_access = FILE_READ_ATTRIBUTES;
oparms.disposition = FILE_OPEN;
-
- if (backup_cred(cifs_sb))
- oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
- else
- oparms.create_options = 0;
- if (is_reparse_point)
- oparms.create_options = OPEN_REPARSE_POINT;
-
+ oparms.create_options = cifs_create_options(cifs_sb, create_options);
oparms.fid = &fid;
oparms.reconnect = false;
@@ -2866,11 +2857,6 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
tcon = tlink_tcon(tlink);
xid = get_xid();
- if (backup_cred(cifs_sb))
- oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
- else
- oparms.create_options = 0;
-
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path) {
rc = -ENOMEM;
@@ -2881,6 +2867,7 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb,
oparms.tcon = tcon;
oparms.desired_access = READ_CONTROL;
oparms.disposition = FILE_OPEN;
+ oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.fid = &fid;
oparms.reconnect = false;
@@ -2922,11 +2909,6 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
tcon = tlink_tcon(tlink);
xid = get_xid();
- if (backup_cred(cifs_sb))
- oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
- else
- oparms.create_options = 0;
-
if (aclflag == CIFS_ACL_OWNER || aclflag == CIFS_ACL_GROUP)
access_flags = WRITE_OWNER;
else
@@ -2941,6 +2923,7 @@ set_smb2_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
oparms.tcon = tcon;
oparms.desired_access = access_flags;
+ oparms.create_options = cifs_create_options(cifs_sb, 0);
oparms.disposition = FILE_OPEN;
oparms.path = path;
oparms.fid = &fid;
@@ -3050,7 +3033,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len)
{
- struct inode *inode;
+ struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
struct file_zero_data_information fsctl_buf;
long rc;
@@ -3059,14 +3042,12 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
xid = get_xid();
- inode = d_inode(cfile->dentry);
-
+ inode_lock(inode);
/* Need to make file sparse, if not already, before freeing range. */
/* Consider adding equivalent for compressed since it could also work */
if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) {
rc = -EOPNOTSUPP;
- free_xid(xid);
- return rc;
+ goto out;
}
/*
@@ -3085,6 +3066,8 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
true /* is_fctl */, (char *)&fsctl_buf,
sizeof(struct file_zero_data_information),
CIFSMaxBufSize, NULL, NULL);
+out:
+ inode_unlock(inode);
free_xid(xid);
return rc;
}
@@ -3622,69 +3605,82 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len,
memcpy(&tr_hdr->SessionId, &shdr->SessionId, 8);
}
-/* We can not use the normal sg_set_buf() as we will sometimes pass a
- * stack object as buf.
- */
-static inline void smb2_sg_set_buf(struct scatterlist *sg, const void *buf,
- unsigned int buflen)
+static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst,
+ int num_rqst, const u8 *sig, u8 **iv,
+ struct aead_request **req, struct scatterlist **sgl,
+ unsigned int *num_sgs)
{
- void *addr;
- /*
- * VMAP_STACK (at least) puts stack into the vmalloc address space
- */
- if (is_vmalloc_addr(buf))
- addr = vmalloc_to_page(buf);
- else
- addr = virt_to_page(buf);
- sg_set_page(sg, addr, buflen, offset_in_page(buf));
+ unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm);
+ unsigned int iv_size = crypto_aead_ivsize(tfm);
+ unsigned int len;
+ u8 *p;
+
+ *num_sgs = cifs_get_num_sgs(rqst, num_rqst, sig);
+
+ len = iv_size;
+ len += crypto_aead_alignmask(tfm) & ~(crypto_tfm_ctx_alignment() - 1);
+ len = ALIGN(len, crypto_tfm_ctx_alignment());
+ len += req_size;
+ len = ALIGN(len, __alignof__(struct scatterlist));
+ len += *num_sgs * sizeof(**sgl);
+
+ p = kmalloc(len, GFP_ATOMIC);
+ if (!p)
+ return NULL;
+
+ *iv = (u8 *)PTR_ALIGN(p, crypto_aead_alignmask(tfm) + 1);
+ *req = (struct aead_request *)PTR_ALIGN(*iv + iv_size,
+ crypto_tfm_ctx_alignment());
+ *sgl = (struct scatterlist *)PTR_ALIGN((u8 *)*req + req_size,
+ __alignof__(struct scatterlist));
+ return p;
}
-/* Assumes the first rqst has a transform header as the first iov.
- * I.e.
- * rqst[0].rq_iov[0] is transform header
- * rqst[0].rq_iov[1+] data to be encrypted/decrypted
- * rqst[1+].rq_iov[0+] data to be encrypted/decrypted
- */
-static struct scatterlist *
-init_sg(int num_rqst, struct smb_rqst *rqst, u8 *sign)
+static void *smb2_get_aead_req(struct crypto_aead *tfm, const struct smb_rqst *rqst,
+ int num_rqst, const u8 *sig, u8 **iv,
+ struct aead_request **req, struct scatterlist **sgl)
{
- unsigned int sg_len;
+ unsigned int off, len, skip;
struct scatterlist *sg;
- unsigned int i;
- unsigned int j;
- unsigned int idx = 0;
- int skip;
-
- sg_len = 1;
- for (i = 0; i < num_rqst; i++)
- sg_len += rqst[i].rq_nvec + rqst[i].rq_npages;
+ unsigned int num_sgs;
+ unsigned long addr;
+ int i, j;
+ void *p;
- sg = kmalloc_array(sg_len, sizeof(struct scatterlist), GFP_KERNEL);
- if (!sg)
+ p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, sgl, &num_sgs);
+ if (!p)
return NULL;
- sg_init_table(sg, sg_len);
+ sg_init_table(*sgl, num_sgs);
+ sg = *sgl;
+
+ /* Assumes the first rqst has a transform header as the first iov.
+ * I.e.
+ * rqst[0].rq_iov[0] is transform header
+ * rqst[0].rq_iov[1+] data to be encrypted/decrypted
+ * rqst[1+].rq_iov[0+] data to be encrypted/decrypted
+ */
for (i = 0; i < num_rqst; i++) {
+ /*
+ * The first rqst has a transform header where the
+ * first 20 bytes are not part of the encrypted blob.
+ */
for (j = 0; j < rqst[i].rq_nvec; j++) {
- /*
- * The first rqst has a transform header where the
- * first 20 bytes are not part of the encrypted blob
- */
- skip = (i == 0) && (j == 0) ? 20 : 0;
- smb2_sg_set_buf(&sg[idx++],
- rqst[i].rq_iov[j].iov_base + skip,
- rqst[i].rq_iov[j].iov_len - skip);
- }
+ struct kvec *iov = &rqst[i].rq_iov[j];
+ skip = (i == 0) && (j == 0) ? 20 : 0;
+ addr = (unsigned long)iov->iov_base + skip;
+ len = iov->iov_len - skip;
+ sg = cifs_sg_set_buf(sg, (void *)addr, len);
+ }
for (j = 0; j < rqst[i].rq_npages; j++) {
- unsigned int len, offset;
-
- rqst_page_get_length(&rqst[i], j, &len, &offset);
- sg_set_page(&sg[idx++], rqst[i].rq_pages[j], len, offset);
+ rqst_page_get_length(&rqst[i], j, &len, &off);
+ sg_set_page(sg++, rqst[i].rq_pages[j], len, off);
}
}
- smb2_sg_set_buf(&sg[idx], sign, SMB2_SIGNATURE_SIZE);
- return sg;
+ cifs_sg_set_buf(sg, sig, SMB2_SIGNATURE_SIZE);
+
+ return p;
}
static int
@@ -3726,11 +3722,11 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
u8 sign[SMB2_SIGNATURE_SIZE] = {};
u8 key[SMB3_SIGN_KEY_SIZE];
struct aead_request *req;
- char *iv;
- unsigned int iv_len;
+ u8 *iv;
DECLARE_CRYPTO_WAIT(wait);
struct crypto_aead *tfm;
unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize);
+ void *creq;
rc = smb2_get_enc_key(server, tr_hdr->SessionId, enc, key);
if (rc) {
@@ -3759,32 +3755,15 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
return rc;
}
- req = aead_request_alloc(tfm, GFP_KERNEL);
- if (!req) {
- cifs_server_dbg(VFS, "%s: Failed to alloc aead request\n", __func__);
+ creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg);
+ if (unlikely(!creq))
return -ENOMEM;
- }
if (!enc) {
memcpy(sign, &tr_hdr->Signature, SMB2_SIGNATURE_SIZE);
crypt_len += SMB2_SIGNATURE_SIZE;
}
- sg = init_sg(num_rqst, rqst, sign);
- if (!sg) {
- cifs_server_dbg(VFS, "%s: Failed to init sg\n", __func__);
- rc = -ENOMEM;
- goto free_req;
- }
-
- iv_len = crypto_aead_ivsize(tfm);
- iv = kzalloc(iv_len, GFP_KERNEL);
- if (!iv) {
- cifs_server_dbg(VFS, "%s: Failed to alloc iv\n", __func__);
- rc = -ENOMEM;
- goto free_sg;
- }
-
if (server->cipher_type == SMB2_ENCRYPTION_AES128_GCM)
memcpy(iv, (char *)tr_hdr->Nonce, SMB3_AES128GCM_NONCE);
else {
@@ -3792,6 +3771,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
memcpy(iv + 1, (char *)tr_hdr->Nonce, SMB3_AES128CCM_NONCE);
}
+ aead_request_set_tfm(req, tfm);
aead_request_set_crypt(req, sg, sg, crypt_len, iv);
aead_request_set_ad(req, assoc_data_len);
@@ -3804,11 +3784,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst,
if (!rc && enc)
memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE);
- kfree(iv);
-free_sg:
- kfree(sg);
-free_req:
- kfree(req);
+ kfree(creq);
return rc;
}
@@ -4485,7 +4461,6 @@ smb2_make_node(unsigned int xid, struct inode *inode,
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
int rc = -EPERM;
- int create_options = CREATE_NOT_DIR | CREATE_OPTION_SPECIAL;
FILE_ALL_INFO *buf = NULL;
struct cifs_io_parms io_parms;
__u32 oplock = 0;
@@ -4521,13 +4496,11 @@ smb2_make_node(unsigned int xid, struct inode *inode,
goto out;
}
- if (backup_cred(cifs_sb))
- create_options |= CREATE_OPEN_BACKUP_INTENT;
-
oparms.tcon = tcon;
oparms.cifs_sb = cifs_sb;
oparms.desired_access = GENERIC_WRITE;
- oparms.create_options = create_options;
+ oparms.create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR |
+ CREATE_OPTION_SPECIAL);
oparms.disposition = FILE_CREATE;
oparms.path = full_path;
oparms.fid = &fid;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 0857eb7a95e2..da1a1b531ca5 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1100,9 +1100,9 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
pneg_inbuf->Dialects[0] =
cpu_to_le16(server->vals->protocol_id);
pneg_inbuf->DialectCount = cpu_to_le16(1);
- /* structure is big enough for 3 dialects, sending only 1 */
+ /* structure is big enough for 4 dialects, sending only 1 */
inbuflen = sizeof(*pneg_inbuf) -
- sizeof(pneg_inbuf->Dialects[0]) * 2;
+ sizeof(pneg_inbuf->Dialects[0]) * 3;
}
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
@@ -3639,12 +3639,15 @@ smb2_readv_callback(struct mid_q_entry *mid)
(struct smb2_sync_hdr *)rdata->iov[0].iov_base;
struct cifs_credits credits = { .value = 0, .instance = 0 };
struct smb_rqst rqst = { .rq_iov = &rdata->iov[1],
- .rq_nvec = 1,
- .rq_pages = rdata->pages,
- .rq_offset = rdata->page_offset,
- .rq_npages = rdata->nr_pages,
- .rq_pagesz = rdata->pagesz,
- .rq_tailsz = rdata->tailsz };
+ .rq_nvec = 1, };
+
+ if (rdata->got_bytes) {
+ rqst.rq_pages = rdata->pages;
+ rqst.rq_offset = rdata->page_offset;
+ rqst.rq_npages = rdata->nr_pages;
+ rqst.rq_pagesz = rdata->pagesz;
+ rqst.rq_tailsz = rdata->tailsz;
+ }
cifs_dbg(FYI, "%s: mid=%llu state=%d result=%d bytes=%u\n",
__func__, mid->mid, mid->mid_state, rdata->result,
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 739556e385be..297f5e455a34 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -651,7 +651,7 @@ struct smb2_tree_disconnect_rsp {
#define SMB2_CREATE_SD_BUFFER "SecD" /* security descriptor */
#define SMB2_CREATE_DURABLE_HANDLE_REQUEST "DHnQ"
#define SMB2_CREATE_DURABLE_HANDLE_RECONNECT "DHnC"
-#define SMB2_CREATE_ALLOCATION_SIZE "AISi"
+#define SMB2_CREATE_ALLOCATION_SIZE "AlSi"
#define SMB2_CREATE_QUERY_MAXIMAL_ACCESS_REQUEST "MxAc"
#define SMB2_CREATE_TIMEWARP_REQUEST "TWrp"
#define SMB2_CREATE_QUERY_ON_DISK_ID "QFid"
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 57f7075a3587..4d4c0faa3d8a 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -67,7 +67,7 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server,
struct mid_q_entry *mid);
extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon,
- struct cifs_fid *pfid);
+ struct cifs_sb_info *cifs_sb, struct cifs_fid *pfid);
extern void close_shroot(struct cached_fid *cfid);
extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst,
struct smb2_file_all_info *src);
diff --git a/fs/cifs/smbdirect.c b/fs/cifs/smbdirect.c
index 5b1b97e9e0c9..0842a1af0b98 100644
--- a/fs/cifs/smbdirect.c
+++ b/fs/cifs/smbdirect.c
@@ -607,8 +607,13 @@ static struct rdma_cm_id *smbd_create_id(
log_rdma_event(ERR, "rdma_resolve_addr() failed %i\n", rc);
goto out;
}
- wait_for_completion_interruptible_timeout(
+ rc = wait_for_completion_interruptible_timeout(
&info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
+ /* e.g. if interrupted returns -ERESTARTSYS */
+ if (rc < 0) {
+ log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
+ goto out;
+ }
rc = info->ri_rc;
if (rc) {
log_rdma_event(ERR, "rdma_resolve_addr() completed %i\n", rc);
@@ -621,8 +626,13 @@ static struct rdma_cm_id *smbd_create_id(
log_rdma_event(ERR, "rdma_resolve_route() failed %i\n", rc);
goto out;
}
- wait_for_completion_interruptible_timeout(
+ rc = wait_for_completion_interruptible_timeout(
&info->ri_done, msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT));
+ /* e.g. if interrupted returns -ERESTARTSYS */
+ if (rc < 0) {
+ log_rdma_event(ERR, "rdma_resolve_addr timeout rc: %i\n", rc);
+ goto out;
+ }
rc = info->ri_rc;
if (rc) {
log_rdma_event(ERR, "rdma_resolve_route() completed %i\n", rc);
@@ -1478,6 +1488,7 @@ void smbd_destroy(struct TCP_Server_Info *server)
destroy_workqueue(info->workqueue);
log_rdma_event(INFO, "rdma session destroyed\n");
kfree(info);
+ server->smbd_conn = NULL;
}
/*
@@ -1783,6 +1794,7 @@ static struct smbd_connection *_smbd_get_connection(
allocate_mr_failed:
/* At this point, need to a full transport shutdown */
+ server->smbd_conn = info;
smbd_destroy(server);
return NULL;
@@ -2346,6 +2358,7 @@ static int allocate_mr_list(struct smbd_connection *info)
atomic_set(&info->mr_ready_count, 0);
atomic_set(&info->mr_used_count, 0);
init_waitqueue_head(&info->wait_for_mr_cleanup);
+ INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
/* Allocate more MRs (2x) than hardware responder_resources */
for (i = 0; i < info->responder_resources * 2; i++) {
smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
@@ -2374,13 +2387,13 @@ static int allocate_mr_list(struct smbd_connection *info)
list_add_tail(&smbdirect_mr->list, &info->mr_list);
atomic_inc(&info->mr_ready_count);
}
- INIT_WORK(&info->mr_recovery_work, smbd_mr_recovery_work);
return 0;
out:
kfree(smbdirect_mr);
list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
+ list_del(&smbdirect_mr->list);
ib_dereg_mr(smbdirect_mr->mr);
kfree(smbdirect_mr->sgl);
kfree(smbdirect_mr);
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 61e7df4d9cb1..60141a7468b0 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -209,10 +209,6 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg,
*sent = 0;
- smb_msg->msg_name = (struct sockaddr *) &server->dstaddr;
- smb_msg->msg_namelen = sizeof(struct sockaddr);
- smb_msg->msg_control = NULL;
- smb_msg->msg_controllen = 0;
if (server->noblocksnd)
smb_msg->msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
else
@@ -316,7 +312,7 @@ static int
__smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
struct smb_rqst *rqst)
{
- int rc = 0;
+ int rc;
struct kvec *iov;
int n_vec;
unsigned int send_length = 0;
@@ -324,10 +320,11 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
sigset_t mask, oldmask;
size_t total_len = 0, sent, size;
struct socket *ssocket = server->ssocket;
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
int val = 1;
__be32 rfc1002_marker;
+ cifs_in_send_inc(server);
if (cifs_rdma_enabled(server)) {
/* return -EAGAIN when connecting or reconnecting */
rc = -EAGAIN;
@@ -336,14 +333,17 @@ __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
goto smbd_done;
}
+ rc = -EAGAIN;
if (ssocket == NULL)
- return -EAGAIN;
+ goto out;
+ rc = -ERESTARTSYS;
if (fatal_signal_pending(current)) {
cifs_dbg(FYI, "signal pending before send request\n");
- return -ERESTARTSYS;
+ goto out;
}
+ rc = 0;
/* cork the socket */
kernel_setsockopt(ssocket, SOL_TCP, TCP_CORK,
(char *)&val, sizeof(val));
@@ -457,7 +457,8 @@ smbd_done:
rc);
else if (rc > 0)
rc = 0;
-
+out:
+ cifs_in_send_dec(server);
return rc;
}
@@ -834,9 +835,7 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst,
* I/O response may come back and free the mid entry on another thread.
*/
cifs_save_when_sent(mid);
- cifs_in_send_inc(server);
rc = smb_send_rqst(server, 1, rqst, flags);
- cifs_in_send_dec(server);
if (rc < 0) {
revert_current_mid(server, mid->credits);
@@ -1099,9 +1098,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
else
midQ[i]->callback = cifs_compound_last_callback;
}
- cifs_in_send_inc(server);
rc = smb_send_rqst(server, num_rqst, rqst, flags);
- cifs_in_send_dec(server);
for (i = 0; i < num_rqst; i++)
cifs_save_when_sent(midQ[i]);
@@ -1336,9 +1333,7 @@ SendReceive(const unsigned int xid, struct cifs_ses *ses,
midQ->mid_state = MID_REQUEST_SUBMITTED;
- cifs_in_send_inc(server);
rc = smb_send(server, in_buf, len);
- cifs_in_send_dec(server);
cifs_save_when_sent(midQ);
if (rc < 0)
@@ -1475,9 +1470,7 @@ SendReceiveBlockingLock(const unsigned int xid, struct cifs_tcon *tcon,
}
midQ->mid_state = MID_REQUEST_SUBMITTED;
- cifs_in_send_inc(server);
rc = smb_send(server, in_buf, len);
- cifs_in_send_dec(server);
cifs_save_when_sent(midQ);
if (rc < 0)
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index eb3b1898da46..610484c90260 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -790,7 +790,7 @@ static int coda_upcall(struct venus_comm *vcp,
sig_req = kmalloc(sizeof(struct upc_req), GFP_KERNEL);
if (!sig_req) goto exit;
- sig_inputArgs = kvzalloc(sizeof(struct coda_in_hdr), GFP_KERNEL);
+ sig_inputArgs = kvzalloc(sizeof(*sig_inputArgs), GFP_KERNEL);
if (!sig_inputArgs) {
kfree(sig_req);
goto exit;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 8fcc53d83af2..22f7dc6688de 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -994,8 +994,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
if (!f.file)
goto out;
- /* RED-PEN how should LSM module know it's handling 32bit? */
- error = security_file_ioctl(f.file, cmd, arg);
+ error = security_file_ioctl_compat(f.file, cmd, arg);
if (error)
goto out_fput;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index d73d88d9c259..bc27e3ad97ff 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -317,6 +317,7 @@ static int configfs_create_dir(struct config_item *item, struct dentry *dentry,
return 0;
out_remove:
+ configfs_put(dentry->d_fsdata);
configfs_remove_dirent(dentry);
return PTR_ERR(inode);
}
@@ -383,6 +384,7 @@ int configfs_create_link(struct configfs_dirent *target, struct dentry *parent,
return 0;
out_remove:
+ configfs_put(dentry->d_fsdata);
configfs_remove_dirent(dentry);
return PTR_ERR(inode);
}
diff --git a/fs/dcache.c b/fs/dcache.c
index c08d8b922770..76517330103b 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -740,12 +740,12 @@ static inline bool fast_dput(struct dentry *dentry)
*/
if (unlikely(ret < 0)) {
spin_lock(&dentry->d_lock);
- if (dentry->d_lockref.count > 1) {
- dentry->d_lockref.count--;
+ if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
spin_unlock(&dentry->d_lock);
return true;
}
- return false;
+ dentry->d_lockref.count--;
+ goto locked;
}
/*
@@ -796,6 +796,7 @@ static inline bool fast_dput(struct dentry *dentry)
* else could have killed it and marked it dead. Either way, we
* don't need to do anything else.
*/
+locked:
if (dentry->d_lockref.count) {
spin_unlock(&dentry->d_lock);
return true;
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index da87615ad69a..3547388ad60b 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -18,6 +18,7 @@
#include <linux/slab.h>
#include <linux/atomic.h>
#include <linux/device.h>
+#include <linux/pm_runtime.h>
#include <linux/poll.h>
#include <linux/security.h>
@@ -377,8 +378,8 @@ ssize_t debugfs_attr_read(struct file *file, char __user *buf,
}
EXPORT_SYMBOL_GPL(debugfs_attr_read);
-ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
+static ssize_t debugfs_attr_write_xsigned(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos, bool is_signed)
{
struct dentry *dentry = F_DENTRY(file);
ssize_t ret;
@@ -386,12 +387,28 @@ ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
ret = debugfs_file_get(dentry);
if (unlikely(ret))
return ret;
- ret = simple_attr_write(file, buf, len, ppos);
+ if (is_signed)
+ ret = simple_attr_write_signed(file, buf, len, ppos);
+ else
+ ret = simple_attr_write(file, buf, len, ppos);
debugfs_file_put(dentry);
return ret;
}
+
+ssize_t debugfs_attr_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return debugfs_attr_write_xsigned(file, buf, len, ppos, false);
+}
EXPORT_SYMBOL_GPL(debugfs_attr_write);
+ssize_t debugfs_attr_write_signed(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return debugfs_attr_write_xsigned(file, buf, len, ppos, true);
+}
+EXPORT_SYMBOL_GPL(debugfs_attr_write_signed);
+
static struct dentry *debugfs_create_mode_unsafe(const char *name, umode_t mode,
struct dentry *parent, void *value,
const struct file_operations *fops,
@@ -784,11 +801,11 @@ static int debugfs_atomic_t_get(void *data, u64 *val)
*val = atomic_read((atomic_t *)data);
return 0;
}
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t, debugfs_atomic_t_get,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t, debugfs_atomic_t_get,
debugfs_atomic_t_set, "%lld\n");
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_ro, debugfs_atomic_t_get, NULL,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_ro, debugfs_atomic_t_get, NULL,
"%lld\n");
-DEFINE_DEBUGFS_ATTRIBUTE(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
+DEFINE_DEBUGFS_ATTRIBUTE_SIGNED(fops_atomic_t_wo, NULL, debugfs_atomic_t_set,
"%lld\n");
/**
@@ -1103,7 +1120,14 @@ static int debugfs_show_regset32(struct seq_file *s, void *data)
{
struct debugfs_regset32 *regset = s->private;
+ if (regset->dev)
+ pm_runtime_get_sync(regset->dev);
+
debugfs_print_regs32(s, regset->regs, regset->nregs, regset->base, "");
+
+ if (regset->dev)
+ pm_runtime_put(regset->dev);
+
return 0;
}
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index e595a29bf46e..258230f4e485 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -299,13 +299,9 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent)
if (!parent)
parent = debugfs_mount->mnt_root;
- dentry = lookup_one_len_unlocked(name, parent, strlen(name));
+ dentry = lookup_positive_unlocked(name, parent, strlen(name));
if (IS_ERR(dentry))
return NULL;
- if (!d_really_is_positive(dentry)) {
- dput(dentry);
- return NULL;
- }
return dentry;
}
EXPORT_SYMBOL_GPL(debugfs_lookup);
@@ -743,6 +739,28 @@ void debugfs_remove(struct dentry *dentry)
EXPORT_SYMBOL_GPL(debugfs_remove);
/**
+ * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it
+ * @name: a pointer to a string containing the name of the item to look up.
+ * @parent: a pointer to the parent dentry of the item.
+ *
+ * This is the equlivant of doing something like
+ * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting
+ * handled for the directory being looked up.
+ */
+void debugfs_lookup_and_remove(const char *name, struct dentry *parent)
+{
+ struct dentry *dentry;
+
+ dentry = debugfs_lookup(name, parent);
+ if (!dentry)
+ return;
+
+ debugfs_remove(dentry);
+ dput(dentry);
+}
+EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove);
+
+/**
* debugfs_remove_recursive - recursively removes a directory
* @dentry: a pointer to a the dentry of the directory to be removed. If this
* parameter is NULL or an error value, nothing will be done.
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 283c7b94edda..ca06069e95c8 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -198,13 +198,13 @@ void dlm_add_cb(struct dlm_lkb *lkb, uint32_t flags, int mode, int status,
if (!prev_seq) {
kref_get(&lkb->lkb_ref);
+ mutex_lock(&ls->ls_cb_mutex);
if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) {
- mutex_lock(&ls->ls_cb_mutex);
list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay);
- mutex_unlock(&ls->ls_cb_mutex);
} else {
queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
}
+ mutex_unlock(&ls->ls_cb_mutex);
}
out:
mutex_unlock(&lkb->lkb_cb_mutex);
@@ -284,7 +284,9 @@ void dlm_callback_stop(struct dlm_ls *ls)
void dlm_callback_suspend(struct dlm_ls *ls)
{
+ mutex_lock(&ls->ls_cb_mutex);
set_bit(LSFL_CB_DELAY, &ls->ls_flags);
+ mutex_unlock(&ls->ls_cb_mutex);
if (ls->ls_callback_wq)
flush_workqueue(ls->ls_callback_wq);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 9165bf56c6e8..86d645d02d55 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1856,7 +1856,7 @@ static void del_timeout(struct dlm_lkb *lkb)
void dlm_scan_timeout(struct dlm_ls *ls)
{
struct dlm_rsb *r;
- struct dlm_lkb *lkb;
+ struct dlm_lkb *lkb = NULL, *iter;
int do_cancel, do_warn;
s64 wait_us;
@@ -1867,27 +1867,28 @@ void dlm_scan_timeout(struct dlm_ls *ls)
do_cancel = 0;
do_warn = 0;
mutex_lock(&ls->ls_timeout_mutex);
- list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) {
+ list_for_each_entry(iter, &ls->ls_timeout, lkb_time_list) {
wait_us = ktime_to_us(ktime_sub(ktime_get(),
- lkb->lkb_timestamp));
+ iter->lkb_timestamp));
- if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) &&
- wait_us >= (lkb->lkb_timeout_cs * 10000))
+ if ((iter->lkb_exflags & DLM_LKF_TIMEOUT) &&
+ wait_us >= (iter->lkb_timeout_cs * 10000))
do_cancel = 1;
- if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
+ if ((iter->lkb_flags & DLM_IFL_WATCH_TIMEWARN) &&
wait_us >= dlm_config.ci_timewarn_cs * 10000)
do_warn = 1;
if (!do_cancel && !do_warn)
continue;
- hold_lkb(lkb);
+ hold_lkb(iter);
+ lkb = iter;
break;
}
mutex_unlock(&ls->ls_timeout_mutex);
- if (!do_cancel && !do_warn)
+ if (!lkb)
break;
r = lkb->lkb_resource;
@@ -2888,24 +2889,24 @@ static int set_unlock_args(uint32_t flags, void *astarg, struct dlm_args *args)
static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_args *args)
{
- int rv = -EINVAL;
+ int rv = -EBUSY;
if (args->flags & DLM_LKF_CONVERT) {
- if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+ if (lkb->lkb_status != DLM_LKSTS_GRANTED)
goto out;
- if (args->flags & DLM_LKF_QUECVT &&
- !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
+ if (lkb->lkb_wait_type)
goto out;
- rv = -EBUSY;
- if (lkb->lkb_status != DLM_LKSTS_GRANTED)
+ if (is_overlap(lkb))
goto out;
- if (lkb->lkb_wait_type)
+ rv = -EINVAL;
+ if (lkb->lkb_flags & DLM_IFL_MSTCPY)
goto out;
- if (is_overlap(lkb))
+ if (args->flags & DLM_LKF_QUECVT &&
+ !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
goto out;
}
@@ -5241,21 +5242,18 @@ void dlm_recover_waiters_pre(struct dlm_ls *ls)
static struct dlm_lkb *find_resend_waiter(struct dlm_ls *ls)
{
- struct dlm_lkb *lkb;
- int found = 0;
+ struct dlm_lkb *lkb = NULL, *iter;
mutex_lock(&ls->ls_waiters_mutex);
- list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) {
- if (lkb->lkb_flags & DLM_IFL_RESEND) {
- hold_lkb(lkb);
- found = 1;
+ list_for_each_entry(iter, &ls->ls_waiters, lkb_wait_reply) {
+ if (iter->lkb_flags & DLM_IFL_RESEND) {
+ hold_lkb(iter);
+ lkb = iter;
break;
}
}
mutex_unlock(&ls->ls_waiters_mutex);
- if (!found)
- lkb = NULL;
return lkb;
}
@@ -5914,37 +5912,36 @@ int dlm_user_adopt_orphan(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
int mode, uint32_t flags, void *name, unsigned int namelen,
unsigned long timeout_cs, uint32_t *lkid)
{
- struct dlm_lkb *lkb;
+ struct dlm_lkb *lkb = NULL, *iter;
struct dlm_user_args *ua;
int found_other_mode = 0;
- int found = 0;
int rv = 0;
mutex_lock(&ls->ls_orphans_mutex);
- list_for_each_entry(lkb, &ls->ls_orphans, lkb_ownqueue) {
- if (lkb->lkb_resource->res_length != namelen)
+ list_for_each_entry(iter, &ls->ls_orphans, lkb_ownqueue) {
+ if (iter->lkb_resource->res_length != namelen)
continue;
- if (memcmp(lkb->lkb_resource->res_name, name, namelen))
+ if (memcmp(iter->lkb_resource->res_name, name, namelen))
continue;
- if (lkb->lkb_grmode != mode) {
+ if (iter->lkb_grmode != mode) {
found_other_mode = 1;
continue;
}
- found = 1;
- list_del_init(&lkb->lkb_ownqueue);
- lkb->lkb_flags &= ~DLM_IFL_ORPHAN;
- *lkid = lkb->lkb_id;
+ lkb = iter;
+ list_del_init(&iter->lkb_ownqueue);
+ iter->lkb_flags &= ~DLM_IFL_ORPHAN;
+ *lkid = iter->lkb_id;
break;
}
mutex_unlock(&ls->ls_orphans_mutex);
- if (!found && found_other_mode) {
+ if (!lkb && found_other_mode) {
rv = -EAGAIN;
goto out;
}
- if (!found) {
+ if (!lkb) {
rv = -ENOENT;
goto out;
}
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index e7f550327d5d..e338c407cb75 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -113,7 +113,7 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb)
void dlm_timeout_warn(struct dlm_lkb *lkb)
{
- struct sk_buff *uninitialized_var(send_skb);
+ struct sk_buff *send_skb;
struct dlm_lock_data *data;
size_t size;
int rv;
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index a10d2bcfe75a..5f2e2fa2ba09 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -19,20 +19,20 @@ static struct list_head recv_list;
static wait_queue_head_t send_wq;
static wait_queue_head_t recv_wq;
-struct plock_op {
- struct list_head list;
- int done;
- struct dlm_plock_info info;
- int (*callback)(struct file_lock *fl, int result);
-};
-
-struct plock_xop {
- struct plock_op xop;
+struct plock_async_data {
void *fl;
void *file;
struct file_lock flc;
+ int (*callback)(struct file_lock *fl, int result);
};
+struct plock_op {
+ struct list_head list;
+ int done;
+ struct dlm_plock_info info;
+ /* if set indicates async handling */
+ struct plock_async_data *data;
+};
static inline void set_version(struct dlm_plock_info *info)
{
@@ -58,6 +58,12 @@ static int check_version(struct dlm_plock_info *info)
return 0;
}
+static void dlm_release_plock_op(struct plock_op *op)
+{
+ kfree(op->data);
+ kfree(op);
+}
+
static void send_op(struct plock_op *op)
{
set_version(&op->info);
@@ -74,8 +80,7 @@ static void send_op(struct plock_op *op)
abandoned waiter. So, we have to insert the unlock-close when the
lock call is interrupted. */
-static void do_unlock_close(struct dlm_ls *ls, u64 number,
- struct file *file, struct file_lock *fl)
+static void do_unlock_close(const struct dlm_plock_info *info)
{
struct plock_op *op;
@@ -84,15 +89,12 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number,
return;
op->info.optype = DLM_PLOCK_OP_UNLOCK;
- op->info.pid = fl->fl_pid;
- op->info.fsid = ls->ls_global_id;
- op->info.number = number;
+ op->info.pid = info->pid;
+ op->info.fsid = info->fsid;
+ op->info.number = info->number;
op->info.start = 0;
op->info.end = OFFSET_MAX;
- if (fl->fl_lmops && fl->fl_lmops->lm_grant)
- op->info.owner = (__u64) fl->fl_pid;
- else
- op->info.owner = (__u64)(long) fl->fl_owner;
+ op->info.owner = info->owner;
op->info.flags |= DLM_PLOCK_FL_CLOSE;
send_op(op);
@@ -101,22 +103,21 @@ static void do_unlock_close(struct dlm_ls *ls, u64 number,
int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
int cmd, struct file_lock *fl)
{
+ struct plock_async_data *op_data;
struct dlm_ls *ls;
struct plock_op *op;
- struct plock_xop *xop;
int rv;
ls = dlm_find_lockspace_local(lockspace);
if (!ls)
return -EINVAL;
- xop = kzalloc(sizeof(*xop), GFP_NOFS);
- if (!xop) {
+ op = kzalloc(sizeof(*op), GFP_NOFS);
+ if (!op) {
rv = -ENOMEM;
goto out;
}
- op = &xop->xop;
op->info.optype = DLM_PLOCK_OP_LOCK;
op->info.pid = fl->fl_pid;
op->info.ex = (fl->fl_type == F_WRLCK);
@@ -125,35 +126,45 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
+ /* async handling */
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
+ op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
+ if (!op_data) {
+ dlm_release_plock_op(op);
+ rv = -ENOMEM;
+ goto out;
+ }
+
/* fl_owner is lockd which doesn't distinguish
processes on the nfs client */
op->info.owner = (__u64) fl->fl_pid;
- op->callback = fl->fl_lmops->lm_grant;
- locks_init_lock(&xop->flc);
- locks_copy_lock(&xop->flc, fl);
- xop->fl = fl;
- xop->file = file;
+ op_data->callback = fl->fl_lmops->lm_grant;
+ locks_init_lock(&op_data->flc);
+ locks_copy_lock(&op_data->flc, fl);
+ op_data->fl = fl;
+ op_data->file = file;
+
+ op->data = op_data;
+
+ send_op(op);
+ rv = FILE_LOCK_DEFERRED;
+ goto out;
} else {
op->info.owner = (__u64)(long) fl->fl_owner;
}
send_op(op);
- if (!op->callback) {
- rv = wait_event_interruptible(recv_wq, (op->done != 0));
- if (rv == -ERESTARTSYS) {
- log_debug(ls, "dlm_posix_lock: wait killed %llx",
- (unsigned long long)number);
- spin_lock(&ops_lock);
- list_del(&op->list);
- spin_unlock(&ops_lock);
- kfree(xop);
- do_unlock_close(ls, number, file, fl);
- goto out;
- }
- } else {
- rv = FILE_LOCK_DEFERRED;
+ rv = wait_event_killable(recv_wq, (op->done != 0));
+ if (rv == -ERESTARTSYS) {
+ spin_lock(&ops_lock);
+ list_del(&op->list);
+ spin_unlock(&ops_lock);
+ log_debug(ls, "%s: wait interrupted %x %llx pid %d",
+ __func__, ls->ls_global_id,
+ (unsigned long long)number, op->info.pid);
+ dlm_release_plock_op(op);
+ do_unlock_close(&op->info);
goto out;
}
@@ -173,7 +184,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
(unsigned long long)number);
}
- kfree(xop);
+ dlm_release_plock_op(op);
out:
dlm_put_lockspace(ls);
return rv;
@@ -183,11 +194,11 @@ EXPORT_SYMBOL_GPL(dlm_posix_lock);
/* Returns failure iff a successful lock operation should be canceled */
static int dlm_plock_callback(struct plock_op *op)
{
+ struct plock_async_data *op_data = op->data;
struct file *file;
struct file_lock *fl;
struct file_lock *flc;
int (*notify)(struct file_lock *fl, int result) = NULL;
- struct plock_xop *xop = (struct plock_xop *)op;
int rv = 0;
spin_lock(&ops_lock);
@@ -199,10 +210,10 @@ static int dlm_plock_callback(struct plock_op *op)
spin_unlock(&ops_lock);
/* check if the following 2 are still valid or make a copy */
- file = xop->file;
- flc = &xop->flc;
- fl = xop->fl;
- notify = op->callback;
+ file = op_data->file;
+ flc = &op_data->flc;
+ fl = op_data->fl;
+ notify = op_data->callback;
if (op->info.rv) {
notify(fl, op->info.rv);
@@ -233,7 +244,7 @@ static int dlm_plock_callback(struct plock_op *op)
}
out:
- kfree(xop);
+ dlm_release_plock_op(op);
return rv;
}
@@ -303,7 +314,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
rv = 0;
out_free:
- kfree(op);
+ dlm_release_plock_op(op);
out:
dlm_put_lockspace(ls);
fl->fl_flags = fl_flags;
@@ -363,13 +374,15 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
locks_init_lock(fl);
fl->fl_type = (op->info.ex) ? F_WRLCK : F_RDLCK;
fl->fl_flags = FL_POSIX;
- fl->fl_pid = -op->info.pid;
+ fl->fl_pid = op->info.pid;
+ if (op->info.nodeid != dlm_our_nodeid())
+ fl->fl_pid = -fl->fl_pid;
fl->fl_start = op->info.start;
fl->fl_end = op->info.end;
rv = 0;
}
- kfree(op);
+ dlm_release_plock_op(op);
out:
dlm_put_lockspace(ls);
return rv;
@@ -392,7 +405,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
if (op->info.flags & DLM_PLOCK_FL_CLOSE)
list_del(&op->list);
else
- list_move(&op->list, &recv_list);
+ list_move_tail(&op->list, &recv_list);
memcpy(&info, &op->info, sizeof(info));
}
spin_unlock(&ops_lock);
@@ -405,7 +418,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
(the process did not make an unlock call). */
if (op->info.flags & DLM_PLOCK_FL_CLOSE)
- kfree(op);
+ dlm_release_plock_op(op);
if (copy_to_user(u, &info, sizeof(info)))
return -EFAULT;
@@ -417,9 +430,9 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
loff_t *ppos)
{
+ struct plock_op *op = NULL, *iter;
struct dlm_plock_info info;
- struct plock_op *op;
- int found = 0, do_callback = 0;
+ int do_callback = 0;
if (count != sizeof(info))
return -EINVAL;
@@ -430,31 +443,63 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
if (check_version(&info))
return -EINVAL;
+ /*
+ * The results for waiting ops (SETLKW) can be returned in any
+ * order, so match all fields to find the op. The results for
+ * non-waiting ops are returned in the order that they were sent
+ * to userspace, so match the result with the first non-waiting op.
+ */
spin_lock(&ops_lock);
- list_for_each_entry(op, &recv_list, list) {
- if (op->info.fsid == info.fsid &&
- op->info.number == info.number &&
- op->info.owner == info.owner) {
- list_del_init(&op->list);
- memcpy(&op->info, &info, sizeof(info));
- if (op->callback)
- do_callback = 1;
- else
- op->done = 1;
- found = 1;
- break;
+ if (info.wait) {
+ list_for_each_entry(iter, &recv_list, list) {
+ if (iter->info.fsid == info.fsid &&
+ iter->info.number == info.number &&
+ iter->info.owner == info.owner &&
+ iter->info.pid == info.pid &&
+ iter->info.start == info.start &&
+ iter->info.end == info.end &&
+ iter->info.ex == info.ex &&
+ iter->info.wait) {
+ op = iter;
+ break;
+ }
}
+ } else {
+ list_for_each_entry(iter, &recv_list, list) {
+ if (!iter->info.wait &&
+ iter->info.fsid == info.fsid) {
+ op = iter;
+ break;
+ }
+ }
+ }
+
+ if (op) {
+ /* Sanity check that op and info match. */
+ if (info.wait)
+ WARN_ON(op->info.optype != DLM_PLOCK_OP_LOCK);
+ else
+ WARN_ON(op->info.number != info.number ||
+ op->info.owner != info.owner ||
+ op->info.optype != info.optype);
+
+ list_del_init(&op->list);
+ memcpy(&op->info, &info, sizeof(info));
+ if (op->data)
+ do_callback = 1;
+ else
+ op->done = 1;
}
spin_unlock(&ops_lock);
- if (found) {
+ if (op) {
if (do_callback)
dlm_plock_callback(op);
else
wake_up(&recv_wq);
} else
- log_print("dev_write no op %x %llx", info.fsid,
- (unsigned long long)info.number);
+ log_print("%s: no op %x %llx", __func__,
+ info.fsid, (unsigned long long)info.number);
return count;
}
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 8928e99dfd47..df18f38a0273 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -732,10 +732,9 @@ void dlm_recovered_lock(struct dlm_rsb *r)
static void recover_lvb(struct dlm_rsb *r)
{
- struct dlm_lkb *lkb, *high_lkb = NULL;
+ struct dlm_lkb *big_lkb = NULL, *iter, *high_lkb = NULL;
uint32_t high_seq = 0;
int lock_lvb_exists = 0;
- int big_lock_exists = 0;
int lvblen = r->res_ls->ls_lvblen;
if (!rsb_flag(r, RSB_NEW_MASTER2) &&
@@ -751,37 +750,37 @@ static void recover_lvb(struct dlm_rsb *r)
/* we are the new master, so figure out if VALNOTVALID should
be set, and set the rsb lvb from the best lkb available. */
- list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
- if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ list_for_each_entry(iter, &r->res_grantqueue, lkb_statequeue) {
+ if (!(iter->lkb_exflags & DLM_LKF_VALBLK))
continue;
lock_lvb_exists = 1;
- if (lkb->lkb_grmode > DLM_LOCK_CR) {
- big_lock_exists = 1;
+ if (iter->lkb_grmode > DLM_LOCK_CR) {
+ big_lkb = iter;
goto setflag;
}
- if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
- high_lkb = lkb;
- high_seq = lkb->lkb_lvbseq;
+ if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) {
+ high_lkb = iter;
+ high_seq = iter->lkb_lvbseq;
}
}
- list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) {
- if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
+ list_for_each_entry(iter, &r->res_convertqueue, lkb_statequeue) {
+ if (!(iter->lkb_exflags & DLM_LKF_VALBLK))
continue;
lock_lvb_exists = 1;
- if (lkb->lkb_grmode > DLM_LOCK_CR) {
- big_lock_exists = 1;
+ if (iter->lkb_grmode > DLM_LOCK_CR) {
+ big_lkb = iter;
goto setflag;
}
- if (((int)lkb->lkb_lvbseq - (int)high_seq) >= 0) {
- high_lkb = lkb;
- high_seq = lkb->lkb_lvbseq;
+ if (((int)iter->lkb_lvbseq - (int)high_seq) >= 0) {
+ high_lkb = iter;
+ high_seq = iter->lkb_lvbseq;
}
}
@@ -790,7 +789,7 @@ static void recover_lvb(struct dlm_rsb *r)
goto out;
/* lvb is invalidated if only NL/CR locks remain */
- if (!big_lock_exists)
+ if (!big_lkb)
rsb_set_flag(r, RSB_VALNOTVALID);
if (!r->res_lvbptr) {
@@ -799,9 +798,9 @@ static void recover_lvb(struct dlm_rsb *r)
goto out;
}
- if (big_lock_exists) {
- r->res_lvbseq = lkb->lkb_lvbseq;
- memcpy(r->res_lvbptr, lkb->lkb_lvbptr, lvblen);
+ if (big_lkb) {
+ r->res_lvbseq = big_lkb->lkb_lvbseq;
+ memcpy(r->res_lvbptr, big_lkb->lkb_lvbptr, lvblen);
} else if (high_lkb) {
r->res_lvbseq = high_lkb->lkb_lvbseq;
memcpy(r->res_lvbptr, high_lkb->lkb_lvbptr, lvblen);
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e23752d9a79f..c867a0d62f36 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -76,6 +76,14 @@ static struct inode *__ecryptfs_get_inode(struct inode *lower_inode,
if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb))
return ERR_PTR(-EXDEV);
+
+ /* Reject dealing with casefold directories. */
+ if (IS_CASEFOLDED(lower_inode)) {
+ pr_err_ratelimited("%s: Can't handle casefolded directory.\n",
+ __func__);
+ return ERR_PTR(-EREMOTE);
+ }
+
if (!igrab(lower_inode))
return ERR_PTR(-ESTALE);
inode = iget5_locked(sb, (unsigned long)lower_inode,
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 23b74b8e8f96..38eeec5e3032 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -56,14 +56,18 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
if (page) {
__clear_bit(j, bounced);
- if (kaddr) {
- if (kaddr + PAGE_SIZE == page_address(page))
+ if (!PageHighMem(page)) {
+ if (!i) {
+ kaddr = page_address(page);
+ continue;
+ }
+ if (kaddr &&
+ kaddr + PAGE_SIZE == page_address(page)) {
kaddr += PAGE_SIZE;
- else
- kaddr = NULL;
- } else if (!i) {
- kaddr = page_address(page);
+ continue;
+ }
}
+ kaddr = NULL;
continue;
}
kaddr = NULL;
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 544a453f3076..cc7a42682814 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -226,7 +226,7 @@ struct erofs_inode {
unsigned char datalayout;
unsigned char inode_isize;
- unsigned short xattr_isize;
+ unsigned int xattr_isize;
unsigned int xattr_shared_count;
unsigned int *xattr_shared_xattrs;
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index fdd18c250811..fb718c3e3ebd 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -636,9 +636,11 @@ hitted:
tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED &&
clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
- cur = end - min_t(unsigned int, offset + end - map->m_la, end);
+ cur = end - min_t(erofs_off_t, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
zero_user_segment(page, cur, end);
+ ++spiltted;
+ tight = false;
goto next_part;
}
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index fff574100721..6553f58fb289 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -179,6 +179,10 @@ static int vle_legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
m->clusterofs = le16_to_cpu(di->di_clusterofs);
+ if (m->clusterofs >= 1 << vi->z_logical_clusterbits) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
m->pblk = le32_to_cpu(di->di_u.blkaddr);
break;
default:
@@ -211,7 +215,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
int i;
u8 *in, type;
- if (1 << amortizedshift == 4)
+ if (1 << amortizedshift == 4 && lclusterbits <= 14)
vcnt = 2;
else if (1 << amortizedshift == 2 && lclusterbits == 12)
vcnt = 16;
@@ -269,7 +273,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
- const unsigned int lclusterbits = vi->z_logical_clusterbits;
const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) +
vi->inode_isize + vi->xattr_isize, 8) +
sizeof(struct z_erofs_map_header);
@@ -279,9 +282,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
erofs_off_t pos;
int err;
- if (lclusterbits != 12)
- return -EOPNOTSUPP;
-
if (lcn >= totalidx)
return -EINVAL;
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8b9bd6fb08cd..1d45cedeeb43 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -182,11 +182,14 @@ static __poll_t eventfd_poll(struct file *file, poll_table *wait)
return events;
}
-static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
+void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
- *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+ lockdep_assert_held(&ctx->wqh.lock);
+
+ *cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count;
ctx->count -= *cnt;
}
+EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);
/**
* eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue.
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 339453ac834c..8c0e94183186 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1803,6 +1803,25 @@ static inline struct timespec64 ep_set_mstimeout(long ms)
return timespec64_add_safe(now, ts);
}
+/*
+ * autoremove_wake_function, but remove even on failure to wake up, because we
+ * know that default_wake_function/ttwu will only fail if the thread is already
+ * woken, and in that case the ep_poll loop will remove the entry anyways, not
+ * try to reuse it.
+ */
+static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
+ unsigned int mode, int sync, void *key)
+{
+ int ret = default_wake_function(wq_entry, mode, sync, key);
+
+ /*
+ * Pairs with list_empty_careful in ep_poll, and ensures future loop
+ * iterations see the cause of this wakeup.
+ */
+ list_del_init_careful(&wq_entry->entry);
+ return ret;
+}
+
/**
* ep_poll - Retrieves ready events, and delivers them to the caller supplied
* event buffer.
@@ -1880,56 +1899,77 @@ fetch_events:
* normal wakeup path no need to call __remove_wait_queue()
* explicitly, thus ep->lock is not taken, which halts the
* event delivery.
+ *
+ * In fact, we now use an even more aggressive function that
+ * unconditionally removes, because we don't reuse the wait
+ * entry between loop iterations. This lets us also avoid the
+ * performance issue if a process is killed, causing all of its
+ * threads to wake up without being removed normally.
*/
init_wait(&wait);
+ wait.func = ep_autoremove_wake_function;
write_lock_irq(&ep->lock);
- __add_wait_queue_exclusive(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
-
/*
- * We don't want to sleep if the ep_poll_callback() sends us
- * a wakeup in between. That's why we set the task state
- * to TASK_INTERRUPTIBLE before doing the checks.
+ * Barrierless variant, waitqueue_active() is called under
+ * the same lock on wakeup ep_poll_callback() side, so it
+ * is safe to avoid an explicit barrier.
*/
- set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_INTERRUPTIBLE);
+
/*
- * Always short-circuit for fatal signals to allow
- * threads to make a timely exit without the chance of
- * finding more events available and fetching
- * repeatedly.
+ * Do the final check under the lock. ep_scan_ready_list()
+ * plays with two lists (->rdllist and ->ovflist) and there
+ * is always a race when both lists are empty for short
+ * period of time although events are pending, so lock is
+ * important.
*/
- if (fatal_signal_pending(current)) {
- res = -EINTR;
- break;
- }
-
eavail = ep_events_available(ep);
- if (eavail)
- break;
- if (signal_pending(current)) {
- res = -EINTR;
- break;
+ if (!eavail) {
+ if (signal_pending(current))
+ res = -EINTR;
+ else
+ __add_wait_queue_exclusive(&ep->wq, &wait);
}
+ write_unlock_irq(&ep->lock);
- if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) {
- timed_out = 1;
- break;
- }
+ if (!eavail && !res)
+ timed_out = !schedule_hrtimeout_range(to, slack,
+ HRTIMER_MODE_ABS);
- /* We were woken up, thus go and try to harvest some events */
+ /*
+ * We were woken up, thus go and try to harvest some events.
+ * If timed out and still on the wait queue, recheck eavail
+ * carefully under lock, below.
+ */
eavail = 1;
-
} while (0);
__set_current_state(TASK_RUNNING);
if (!list_empty_careful(&wait.entry)) {
write_lock_irq(&ep->lock);
+ /*
+ * If the thread timed out and is not on the wait queue, it
+ * means that the thread was woken up after its timeout expired
+ * before it could reacquire the lock. Thus, when wait.entry is
+ * empty, it needs to harvest events.
+ */
+ if (timed_out)
+ eavail = list_empty(&wait.entry);
__remove_wait_queue(&ep->wq, &wait);
write_unlock_irq(&ep->lock);
}
send_events:
+ if (fatal_signal_pending(current)) {
+ /*
+ * Always short-circuit for fatal signals to allow
+ * threads to make a timely exit without the chance of
+ * finding more events available and fetching
+ * repeatedly.
+ */
+ res = -EINTR;
+ }
/*
* Try to transfer events to user space. In case we get 0 events and
* there's still timeout left over, we go trying again in search of
diff --git a/fs/exec.c b/fs/exec.c
index 88da5e7fe41d..882d31bb4fd3 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -837,6 +837,7 @@ int transfer_args_to_stack(struct linux_binprm *bprm,
goto out;
}
+ bprm->exec += *sp_location - MAX_ARG_PAGES * PAGE_SIZE;
*sp_location = sp;
out:
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 10ab238de9a6..94e39a28bee2 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -68,10 +68,7 @@ struct mb_cache;
* second extended-fs super-block data in memory
*/
struct ext2_sb_info {
- unsigned long s_frag_size; /* Size of a fragment in bytes */
- unsigned long s_frags_per_block;/* Number of fragments per block */
unsigned long s_inodes_per_block;/* Number of inodes per block */
- unsigned long s_frags_per_group;/* Number of fragments in a group */
unsigned long s_blocks_per_group;/* Number of blocks in a group */
unsigned long s_inodes_per_group;/* Number of inodes in a group */
unsigned long s_itb_per_group; /* Number of inode table blocks per group */
@@ -177,6 +174,7 @@ static inline struct ext2_sb_info *EXT2_SB(struct super_block *sb)
#define EXT2_MIN_BLOCK_SIZE 1024
#define EXT2_MAX_BLOCK_SIZE 4096
#define EXT2_MIN_BLOCK_LOG_SIZE 10
+#define EXT2_MAX_BLOCK_LOG_SIZE 16
#define EXT2_BLOCK_SIZE(s) ((s)->s_blocksize)
#define EXT2_ADDR_PER_BLOCK(s) (EXT2_BLOCK_SIZE(s) / sizeof (__u32))
#define EXT2_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits)
@@ -185,15 +183,6 @@ static inline struct ext2_sb_info *EXT2_SB(struct super_block *sb)
#define EXT2_FIRST_INO(s) (EXT2_SB(s)->s_first_ino)
/*
- * Macro-instructions used to manage fragments
- */
-#define EXT2_MIN_FRAG_SIZE 1024
-#define EXT2_MAX_FRAG_SIZE 4096
-#define EXT2_MIN_FRAG_LOG_SIZE 10
-#define EXT2_FRAG_SIZE(s) (EXT2_SB(s)->s_frag_size)
-#define EXT2_FRAGS_PER_BLOCK(s) (EXT2_SB(s)->s_frags_per_block)
-
-/*
* Structure of a blocks group descriptor
*/
struct ext2_group_desc
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index db403c01d4d5..a787f50732b8 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -681,10 +681,9 @@ static int ext2_setup_super (struct super_block * sb,
es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT);
le16_add_cpu(&es->s_mnt_count, 1);
if (test_opt (sb, DEBUG))
- ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, fs=%lu, gc=%lu, "
+ ext2_msg(sb, KERN_INFO, "%s, %s, bs=%lu, gc=%lu, "
"bpg=%lu, ipg=%lu, mo=%04lx]",
EXT2FS_VERSION, EXT2FS_DATE, sb->s_blocksize,
- sbi->s_frag_size,
sbi->s_groups_count,
EXT2_BLOCKS_PER_GROUP(sb),
EXT2_INODES_PER_GROUP(sb),
@@ -967,6 +966,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
+ if (le32_to_cpu(es->s_log_block_size) >
+ (EXT2_MAX_BLOCK_LOG_SIZE - BLOCK_SIZE_BITS)) {
+ ext2_msg(sb, KERN_ERR,
+ "Invalid log block size: %u",
+ le32_to_cpu(es->s_log_block_size));
+ goto failed_mount;
+ }
blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
if (test_opt(sb, DAX)) {
@@ -1024,14 +1030,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
}
}
- sbi->s_frag_size = EXT2_MIN_FRAG_SIZE <<
- le32_to_cpu(es->s_log_frag_size);
- if (sbi->s_frag_size == 0)
- goto cantfind_ext2;
- sbi->s_frags_per_block = sb->s_blocksize / sbi->s_frag_size;
-
sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
- sbi->s_frags_per_group = le32_to_cpu(es->s_frags_per_group);
sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb);
@@ -1057,11 +1056,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
goto failed_mount;
}
- if (sb->s_blocksize != sbi->s_frag_size) {
+ if (es->s_log_frag_size != es->s_log_block_size) {
ext2_msg(sb, KERN_ERR,
- "error: fragsize %lu != blocksize %lu"
- "(not supported yet)",
- sbi->s_frag_size, sb->s_blocksize);
+ "error: fragsize log %u != blocksize log %u",
+ le32_to_cpu(es->s_log_frag_size), sb->s_blocksize_bits);
goto failed_mount;
}
@@ -1071,15 +1069,10 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_blocks_per_group);
goto failed_mount;
}
- if (sbi->s_frags_per_group > sb->s_blocksize * 8) {
+ if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
+ sbi->s_inodes_per_group > sb->s_blocksize * 8) {
ext2_msg(sb, KERN_ERR,
- "error: #fragments per group too big: %lu",
- sbi->s_frags_per_group);
- goto failed_mount;
- }
- if (sbi->s_inodes_per_group > sb->s_blocksize * 8) {
- ext2_msg(sb, KERN_ERR,
- "error: #inodes per group too big: %lu",
+ "error: invalid #inodes per group: %lu",
sbi->s_inodes_per_group);
goto failed_mount;
}
@@ -1089,6 +1082,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
le32_to_cpu(es->s_first_data_block) - 1)
/ EXT2_BLOCKS_PER_GROUP(sb)) + 1;
+ if ((u64)sbi->s_groups_count * sbi->s_inodes_per_group !=
+ le32_to_cpu(es->s_inodes_count)) {
+ ext2_msg(sb, KERN_ERR, "error: invalid #inodes: %u vs computed %llu",
+ le32_to_cpu(es->s_inodes_count),
+ (u64)sbi->s_groups_count * sbi->s_inodes_per_group);
+ goto failed_mount;
+ }
db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
EXT2_DESC_PER_BLOCK(sb);
sbi->s_group_desc = kmalloc_array (db_count,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index 62acbe27d8bf..ca7ff31a1f19 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -690,10 +690,10 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh,
/* We need to allocate a new block */
ext2_fsblk_t goal = ext2_group_first_block_no(sb,
EXT2_I(inode)->i_block_group);
- int block = ext2_new_block(inode, goal, &error);
+ ext2_fsblk_t block = ext2_new_block(inode, goal, &error);
if (error)
goto cleanup;
- ea_idebug(inode, "creating block %d", block);
+ ea_idebug(inode, "creating block %lu", block);
new_bh = sb_getblk(sb, block);
if (unlikely(!new_bh)) {
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h
index 9b63f5416a2f..7f3b25b3fa6d 100644
--- a/fs/ext4/acl.h
+++ b/fs/ext4/acl.h
@@ -67,6 +67,11 @@ extern int ext4_init_acl(handle_t *, struct inode *, struct inode *);
static inline int
ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir)
{
+ /* usually, the umask is applied by posix_acl_create(), but if
+ ext4 ACL support is disabled at compile time, we need to do
+ it here, because posix_acl_create() will never be called */
+ inode->i_mode &= ~current_umask();
+
return 0;
}
#endif /* CONFIG_EXT4_FS_POSIX_ACL */
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 031ff3f19018..b68cee75f5c5 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -303,6 +303,22 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
return desc;
}
+static ext4_fsblk_t ext4_valid_block_bitmap_padding(struct super_block *sb,
+ ext4_group_t block_group,
+ struct buffer_head *bh)
+{
+ ext4_grpblk_t next_zero_bit;
+ unsigned long bitmap_size = sb->s_blocksize * 8;
+ unsigned int offset = num_clusters_in_group(sb, block_group);
+
+ if (bitmap_size <= offset)
+ return 0;
+
+ next_zero_bit = ext4_find_next_zero_bit(bh->b_data, bitmap_size, offset);
+
+ return (next_zero_bit < bitmap_size ? next_zero_bit : 0);
+}
+
/*
* Return the block number which was discovered to be invalid, or 0 if
* the block bitmap is valid.
@@ -395,6 +411,15 @@ static int ext4_validate_block_bitmap(struct super_block *sb,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
return -EFSCORRUPTED;
}
+ blk = ext4_valid_block_bitmap_padding(sb, block_group, bh);
+ if (unlikely(blk != 0)) {
+ ext4_unlock_group(sb, block_group);
+ ext4_error(sb, "bg %u: block %llu: padding at end of block bitmap is not set",
+ block_group, blk);
+ ext4_mark_group_bitmap_corrupted(sb, block_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
+ return -EFSCORRUPTED;
+ }
set_buffer_verified(bh);
verified:
ext4_unlock_group(sb, block_group);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index e932e8482371..4d02116193de 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -501,7 +501,7 @@ enum {
*
* It's not paranoia if the Murphy's Law really *is* out to get you. :-)
*/
-#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1U << EXT4_INODE_##FLAG))
#define CHECK_FLAG_VALUE(FLAG) BUILD_BUG_ON(!TEST_FLAG_VALUE(FLAG))
static inline void ext4_check_flag_values(void)
@@ -938,11 +938,13 @@ do { \
* where the second inode has larger inode number
* than the first
* I_DATA_SEM_QUOTA - Used for quota inodes only
+ * I_DATA_SEM_EA - Used for ea_inodes only
*/
enum {
I_DATA_SEM_NORMAL = 0,
I_DATA_SEM_OTHER,
I_DATA_SEM_QUOTA,
+ I_DATA_SEM_EA
};
@@ -1439,7 +1441,7 @@ struct ext4_sb_info {
unsigned long s_commit_interval;
u32 s_max_batch_time;
u32 s_min_batch_time;
- struct block_device *journal_bdev;
+ struct block_device *s_journal_bdev;
#ifdef CONFIG_QUOTA
/* Names of quota files with journalled quota */
char __rcu *s_qf_names[EXT4_MAXQUOTAS];
@@ -1527,7 +1529,7 @@ struct ext4_sb_info {
struct task_struct *s_mmp_tsk;
/* record the last minlen when FITRIM is called. */
- atomic_t s_last_trim_minblks;
+ unsigned long s_last_trim_minblks;
/* Reference to checksum algorithm driver via cryptoapi */
struct crypto_shash *s_chksum_driver;
@@ -2608,7 +2610,9 @@ int do_journal_get_write_access(handle_t *handle,
typedef enum {
EXT4_IGET_NORMAL = 0,
EXT4_IGET_SPECIAL = 0x0001, /* OK to iget a system inode */
- EXT4_IGET_HANDLE = 0x0002 /* Inode # is from a handle */
+ EXT4_IGET_HANDLE = 0x0002, /* Inode # is from a handle */
+ EXT4_IGET_BAD = 0x0004, /* Allow to iget a bad inode */
+ EXT4_IGET_EA_INODE = 0x0008 /* Inode should contain an EA value */
} ext4_iget_flags;
extern struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d5e649e578cb..98e1b1ddb4ec 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -500,6 +500,10 @@ static int __ext4_ext_check(const char *function, unsigned int line,
error_msg = "invalid eh_entries";
goto corrupted;
}
+ if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
+ error_msg = "eh_entries is 0 but eh_depth is > 0";
+ goto corrupted;
+ }
if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
error_msg = "invalid extent entries";
goto corrupted;
@@ -1032,6 +1036,11 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
ix = curp->p_idx;
}
+ if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
+ EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
+ return -EFSCORRUPTED;
+ }
+
len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1;
BUG_ON(len < 0);
if (len > 0) {
@@ -1041,11 +1050,6 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode,
memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx));
}
- if (unlikely(ix > EXT_MAX_INDEX(curp->p_hdr))) {
- EXT4_ERROR_INODE(inode, "ix > EXT_MAX_INDEX!");
- return -EFSCORRUPTED;
- }
-
ix->ei_block = cpu_to_le32(logical);
ext4_idx_store_pblock(ix, ptr);
le16_add_cpu(&curp->p_hdr->eh_entries, 1);
@@ -6018,6 +6022,15 @@ int ext4_clu_mapped(struct inode *inode, ext4_lblk_t lclu)
struct ext4_extent *extent;
ext4_lblk_t first_lblk, first_lclu, last_lclu;
+ /*
+ * if data can be stored inline, the logical cluster isn't
+ * mapped - no physical clusters have been allocated, and the
+ * file has no extents
+ */
+ if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) ||
+ ext4_has_inline_data(inode))
+ return 0;
+
/* search for the extent closest to the first block in the cluster */
path = ext4_find_extent(inode, EXT4_C2B(sbi, lclu), NULL, 0);
if (IS_ERR(path)) {
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 43fba01da6c3..3346a9252063 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -144,14 +144,17 @@
static struct kmem_cache *ext4_es_cachep;
static struct kmem_cache *ext4_pending_cachep;
-static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
+ struct extent_status *prealloc);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved);
+ ext4_lblk_t end, int *reserved,
+ struct extent_status *prealloc);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei);
-static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
+static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len,
+ struct pending_reservation **prealloc);
int __init ext4_init_es(void)
{
@@ -269,14 +272,12 @@ static void __es_find_extent_range(struct inode *inode,
/* see if the extent has been cached */
es->es_lblk = es->es_len = es->es_pblk = 0;
- if (tree->cache_es) {
- es1 = tree->cache_es;
- if (in_range(lblk, es1->es_lblk, es1->es_len)) {
- es_debug("%u cached by [%u/%u) %llu %x\n",
- lblk, es1->es_lblk, es1->es_len,
- ext4_es_pblock(es1), ext4_es_status(es1));
- goto out;
- }
+ es1 = READ_ONCE(tree->cache_es);
+ if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u) %llu %x\n",
+ lblk, es1->es_lblk, es1->es_len,
+ ext4_es_pblock(es1), ext4_es_status(es1));
+ goto out;
}
es1 = __es_tree_search(&tree->root, lblk);
@@ -295,7 +296,7 @@ out:
}
if (es1 && matching_fn(es1)) {
- tree->cache_es = es1;
+ WRITE_ONCE(tree->cache_es, es1);
es->es_lblk = es1->es_lblk;
es->es_len = es1->es_len;
es->es_pblk = es1->es_pblk;
@@ -441,22 +442,49 @@ static void ext4_es_list_del(struct inode *inode)
spin_unlock(&sbi->s_es_lock);
}
-static struct extent_status *
-ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
- ext4_fsblk_t pblk)
+static inline struct pending_reservation *__alloc_pending(bool nofail)
+{
+ if (!nofail)
+ return kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
+
+ return kmem_cache_zalloc(ext4_pending_cachep, GFP_KERNEL | __GFP_NOFAIL);
+}
+
+static inline void __free_pending(struct pending_reservation *pr)
+{
+ kmem_cache_free(ext4_pending_cachep, pr);
+}
+
+/*
+ * Returns true if we cannot fail to allocate memory for this extent_status
+ * entry and cannot reclaim it until its status changes.
+ */
+static inline bool ext4_es_must_keep(struct extent_status *es)
+{
+ /* fiemap, bigalloc, and seek_data/hole need to use it. */
+ if (ext4_es_is_delayed(es))
+ return true;
+
+ return false;
+}
+
+static inline struct extent_status *__es_alloc_extent(bool nofail)
+{
+ if (!nofail)
+ return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
+
+ return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL);
+}
+
+static void ext4_es_init_extent(struct inode *inode, struct extent_status *es,
+ ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk)
{
- struct extent_status *es;
- es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
- if (es == NULL)
- return NULL;
es->es_lblk = lblk;
es->es_len = len;
es->es_pblk = pblk;
- /*
- * We don't count delayed extent because we never try to reclaim them
- */
- if (!ext4_es_is_delayed(es)) {
+ /* We never try to reclaim a must kept extent, so we don't count it. */
+ if (!ext4_es_must_keep(es)) {
if (!EXT4_I(inode)->i_es_shk_nr++)
ext4_es_list_add(inode);
percpu_counter_inc(&EXT4_SB(inode->i_sb)->
@@ -465,8 +493,11 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
EXT4_I(inode)->i_es_all_nr++;
percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+}
- return es;
+static inline void __es_free_extent(struct extent_status *es)
+{
+ kmem_cache_free(ext4_es_cachep, es);
}
static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
@@ -474,8 +505,8 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
EXT4_I(inode)->i_es_all_nr--;
percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
- /* Decrease the shrink counter when this es is not delayed */
- if (!ext4_es_is_delayed(es)) {
+ /* Decrease the shrink counter when we can reclaim the extent. */
+ if (!ext4_es_must_keep(es)) {
BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
if (!--EXT4_I(inode)->i_es_shk_nr)
ext4_es_list_del(inode);
@@ -483,7 +514,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
s_es_stats.es_stats_shk_cnt);
}
- kmem_cache_free(ext4_es_cachep, es);
+ __es_free_extent(es);
}
/*
@@ -745,7 +776,8 @@ static inline void ext4_es_insert_extent_check(struct inode *inode,
}
#endif
-static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
+ struct extent_status *prealloc)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node **p = &tree->root.rb_node;
@@ -785,10 +817,15 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
}
}
- es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
- newes->es_pblk);
+ if (prealloc)
+ es = prealloc;
+ else
+ es = __es_alloc_extent(false);
if (!es)
return -ENOMEM;
+ ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len,
+ newes->es_pblk);
+
rb_link_node(&es->rb_node, parent, p);
rb_insert_color(&es->rb_node, &tree->root);
@@ -809,8 +846,12 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
- int err = 0;
+ int err1 = 0, err2 = 0, err3 = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct extent_status *es1 = NULL;
+ struct extent_status *es2 = NULL;
+ struct pending_reservation *pr = NULL;
+ bool revise_pending = false;
es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino);
@@ -835,29 +876,57 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_es_insert_extent_check(inode, &newes);
+ revise_pending = sbi->s_cluster_ratio > 1 &&
+ test_opt(inode->i_sb, DELALLOC) &&
+ (status & (EXTENT_STATUS_WRITTEN |
+ EXTENT_STATUS_UNWRITTEN));
+retry:
+ if (err1 && !es1)
+ es1 = __es_alloc_extent(true);
+ if ((err1 || err2) && !es2)
+ es2 = __es_alloc_extent(true);
+ if ((err1 || err2 || err3) && revise_pending && !pr)
+ pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end, NULL);
- if (err != 0)
+
+ err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
+ if (err1 != 0)
goto error;
-retry:
- err = __es_insert_extent(inode, &newes);
- if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
- 128, EXT4_I(inode)))
- goto retry;
- if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
- err = 0;
+ /* Free preallocated extent if it didn't get used. */
+ if (es1) {
+ if (!es1->es_len)
+ __es_free_extent(es1);
+ es1 = NULL;
+ }
- if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
- (status & EXTENT_STATUS_WRITTEN ||
- status & EXTENT_STATUS_UNWRITTEN))
- __revise_pending(inode, lblk, len);
+ err2 = __es_insert_extent(inode, &newes, es2);
+ if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
+ err2 = 0;
+ if (err2 != 0)
+ goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es2) {
+ if (!es2->es_len)
+ __es_free_extent(es2);
+ es2 = NULL;
+ }
+ if (revise_pending) {
+ err3 = __revise_pending(inode, lblk, len, &pr);
+ if (err3 != 0)
+ goto error;
+ if (pr) {
+ __free_pending(pr);
+ pr = NULL;
+ }
+ }
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ if (err1 || err2 || err3)
+ goto retry;
ext4_es_print_tree(inode);
-
- return err;
+ return 0;
}
/*
@@ -887,7 +956,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
if (!es || es->es_lblk > end)
- __es_insert_extent(inode, &newes);
+ __es_insert_extent(inode, &newes, NULL);
write_unlock(&EXT4_I(inode)->i_es_lock);
}
@@ -916,14 +985,12 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
/* find extent in cache firstly */
es->es_lblk = es->es_len = es->es_pblk = 0;
- if (tree->cache_es) {
- es1 = tree->cache_es;
- if (in_range(lblk, es1->es_lblk, es1->es_len)) {
- es_debug("%u cached by [%u/%u)\n",
- lblk, es1->es_lblk, es1->es_len);
- found = 1;
- goto out;
- }
+ es1 = READ_ONCE(tree->cache_es);
+ if (es1 && in_range(lblk, es1->es_lblk, es1->es_len)) {
+ es_debug("%u cached by [%u/%u)\n",
+ lblk, es1->es_lblk, es1->es_len);
+ found = 1;
+ goto out;
}
node = tree->root.rb_node;
@@ -1257,7 +1324,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
rc->ndelonly--;
node = rb_next(&pr->rb_node);
rb_erase(&pr->rb_node, &tree->root);
- kmem_cache_free(ext4_pending_cachep, pr);
+ __free_pending(pr);
if (!node)
break;
pr = rb_entry(node, struct pending_reservation,
@@ -1276,6 +1343,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
* @lblk - first block in range
* @end - last block in range
* @reserved - number of cluster reservations released
+ * @prealloc - pre-allocated es to avoid memory allocation failures
*
* If @reserved is not NULL and delayed allocation is enabled, counts
* block/cluster reservations freed by removing range and if bigalloc
@@ -1283,7 +1351,8 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
* error code on failure.
*/
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved)
+ ext4_lblk_t end, int *reserved,
+ struct extent_status *prealloc)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node *node;
@@ -1291,14 +1360,12 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status orig_es;
ext4_lblk_t len1, len2;
ext4_fsblk_t block;
- int err;
+ int err = 0;
bool count_reserved = true;
struct rsvd_count rc;
if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
count_reserved = false;
-retry:
- err = 0;
es = __es_tree_search(&tree->root, lblk);
if (!es)
@@ -1332,14 +1399,13 @@ retry:
orig_es.es_len - len2;
ext4_es_store_pblock_status(&newes, block,
ext4_es_status(&orig_es));
- err = __es_insert_extent(inode, &newes);
+ err = __es_insert_extent(inode, &newes, prealloc);
if (err) {
+ if (!ext4_es_must_keep(&newes))
+ return 0;
+
es->es_lblk = orig_es.es_lblk;
es->es_len = orig_es.es_len;
- if ((err == -ENOMEM) &&
- __es_shrink(EXT4_SB(inode->i_sb),
- 128, EXT4_I(inode)))
- goto retry;
goto out;
}
} else {
@@ -1352,9 +1418,9 @@ retry:
}
}
if (count_reserved)
- count_rsvd(inode, lblk, orig_es.es_len - len1 - len2,
- &orig_es, &rc);
- goto out;
+ count_rsvd(inode, orig_es.es_lblk + len1,
+ orig_es.es_len - len1 - len2, &orig_es, &rc);
+ goto out_get_reserved;
}
if (len1 > 0) {
@@ -1396,6 +1462,7 @@ retry:
}
}
+out_get_reserved:
if (count_reserved)
*reserved = get_rsvd(inode, end, es, &rc);
out:
@@ -1418,6 +1485,7 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t end;
int err = 0;
int reserved = 0;
+ struct extent_status *es = NULL;
trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
@@ -1429,17 +1497,29 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
end = lblk + len - 1;
BUG_ON(end < lblk);
+retry:
+ if (err && !es)
+ es = __es_alloc_extent(true);
/*
* ext4_clear_inode() depends on us taking i_es_lock unconditionally
* so that we are sure __es_shrink() is done with the inode before it
* is reclaimed.
*/
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end, &reserved);
+ err = __es_remove_extent(inode, lblk, end, &reserved, es);
+ /* Free preallocated extent if it didn't get used. */
+ if (es) {
+ if (!es->es_len)
+ __es_free_extent(es);
+ es = NULL;
+ }
write_unlock(&EXT4_I(inode)->i_es_lock);
+ if (err)
+ goto retry;
+
ext4_es_print_tree(inode);
ext4_da_release_space(inode, reserved);
- return err;
+ return 0;
}
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
@@ -1686,11 +1766,8 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
(*nr_to_scan)--;
node = rb_next(&es->rb_node);
- /*
- * We can't reclaim delayed extent from status tree because
- * fiemap, bigallic, and seek_data/hole need to use it.
- */
- if (ext4_es_is_delayed(es))
+
+ if (ext4_es_must_keep(es))
goto next;
if (ext4_es_is_referenced(es)) {
ext4_es_clear_referenced(es);
@@ -1754,7 +1831,7 @@ void ext4_clear_inode_es(struct inode *inode)
while (node) {
es = rb_entry(node, struct extent_status, rb_node);
node = rb_next(node);
- if (!ext4_es_is_delayed(es)) {
+ if (!ext4_es_must_keep(es)) {
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(inode, es);
}
@@ -1841,11 +1918,13 @@ static struct pending_reservation *__get_pending(struct inode *inode,
*
* @inode - file containing the cluster
* @lblk - logical block in the cluster to be added
+ * @prealloc - preallocated pending entry
*
* Returns 0 on successful insertion and -ENOMEM on failure. If the
* pending reservation is already in the set, returns successfully.
*/
-static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
+static int __insert_pending(struct inode *inode, ext4_lblk_t lblk,
+ struct pending_reservation **prealloc)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_pending_tree *tree = &EXT4_I(inode)->i_pending_tree;
@@ -1871,10 +1950,15 @@ static int __insert_pending(struct inode *inode, ext4_lblk_t lblk)
}
}
- pr = kmem_cache_alloc(ext4_pending_cachep, GFP_ATOMIC);
- if (pr == NULL) {
- ret = -ENOMEM;
- goto out;
+ if (likely(*prealloc == NULL)) {
+ pr = __alloc_pending(false);
+ if (!pr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ } else {
+ pr = *prealloc;
+ *prealloc = NULL;
}
pr->lclu = lclu;
@@ -1904,7 +1988,7 @@ static void __remove_pending(struct inode *inode, ext4_lblk_t lblk)
if (pr != NULL) {
tree = &EXT4_I(inode)->i_pending_tree;
rb_erase(&pr->rb_node, &tree->root);
- kmem_cache_free(ext4_pending_cachep, pr);
+ __free_pending(pr);
}
}
@@ -1965,7 +2049,10 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
bool allocated)
{
struct extent_status newes;
- int err = 0;
+ int err1 = 0, err2 = 0, err3 = 0;
+ struct extent_status *es1 = NULL;
+ struct extent_status *es2 = NULL;
+ struct pending_reservation *pr = NULL;
es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
lblk, inode->i_ino);
@@ -1977,29 +2064,52 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
ext4_es_insert_extent_check(inode, &newes);
+retry:
+ if (err1 && !es1)
+ es1 = __es_alloc_extent(true);
+ if ((err1 || err2) && !es2)
+ es2 = __es_alloc_extent(true);
+ if ((err1 || err2 || err3) && allocated && !pr)
+ pr = __alloc_pending(true);
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, lblk, NULL);
- if (err != 0)
- goto error;
-retry:
- err = __es_insert_extent(inode, &newes);
- if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
- 128, EXT4_I(inode)))
- goto retry;
- if (err != 0)
+ err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
+ if (err1 != 0)
goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es1) {
+ if (!es1->es_len)
+ __es_free_extent(es1);
+ es1 = NULL;
+ }
- if (allocated)
- __insert_pending(inode, lblk);
+ err2 = __es_insert_extent(inode, &newes, es2);
+ if (err2 != 0)
+ goto error;
+ /* Free preallocated extent if it didn't get used. */
+ if (es2) {
+ if (!es2->es_len)
+ __es_free_extent(es2);
+ es2 = NULL;
+ }
+ if (allocated) {
+ err3 = __insert_pending(inode, lblk, &pr);
+ if (err3 != 0)
+ goto error;
+ if (pr) {
+ __free_pending(pr);
+ pr = NULL;
+ }
+ }
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ if (err1 || err2 || err3)
+ goto retry;
ext4_es_print_tree(inode);
ext4_print_pending_tree(inode);
-
- return err;
+ return 0;
}
/*
@@ -2100,21 +2210,24 @@ unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
* @inode - file containing the range
* @lblk - logical block defining the start of range
* @len - length of range in blocks
+ * @prealloc - preallocated pending entry
*
* Used after a newly allocated extent is added to the extents status tree.
* Requires that the extents in the range have either written or unwritten
* status. Must be called while holding i_es_lock.
*/
-static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
+static int __revise_pending(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len,
+ struct pending_reservation **prealloc)
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
ext4_lblk_t end = lblk + len - 1;
ext4_lblk_t first, last;
bool f_del = false, l_del = false;
+ int ret = 0;
if (len == 0)
- return;
+ return 0;
/*
* Two cases - block range within single cluster and block range
@@ -2135,7 +2248,9 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
f_del = __es_scan_range(inode, &ext4_es_is_delonly,
first, lblk - 1);
if (f_del) {
- __insert_pending(inode, first);
+ ret = __insert_pending(inode, first, prealloc);
+ if (ret < 0)
+ goto out;
} else {
last = EXT4_LBLK_CMASK(sbi, end) +
sbi->s_cluster_ratio - 1;
@@ -2143,9 +2258,11 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
l_del = __es_scan_range(inode,
&ext4_es_is_delonly,
end + 1, last);
- if (l_del)
- __insert_pending(inode, last);
- else
+ if (l_del) {
+ ret = __insert_pending(inode, last, prealloc);
+ if (ret < 0)
+ goto out;
+ } else
__remove_pending(inode, last);
}
} else {
@@ -2153,18 +2270,24 @@ static void __revise_pending(struct inode *inode, ext4_lblk_t lblk,
if (first != lblk)
f_del = __es_scan_range(inode, &ext4_es_is_delonly,
first, lblk - 1);
- if (f_del)
- __insert_pending(inode, first);
- else
+ if (f_del) {
+ ret = __insert_pending(inode, first, prealloc);
+ if (ret < 0)
+ goto out;
+ } else
__remove_pending(inode, first);
last = EXT4_LBLK_CMASK(sbi, end) + sbi->s_cluster_ratio - 1;
if (last != end)
l_del = __es_scan_range(inode, &ext4_es_is_delonly,
end + 1, last);
- if (l_del)
- __insert_pending(inode, last);
- else
+ if (l_del) {
+ ret = __insert_pending(inode, last, prealloc);
+ if (ret < 0)
+ goto out;
+ } else
__remove_pending(inode, last);
}
+out:
+ return ret;
}
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 1513e90fb6d2..7ac55d071eb2 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -505,6 +505,12 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
inode_unlock_shared(inode);
break;
}
+ /*
+ * Make sure inline data cannot be created anymore since we are going
+ * to allocate blocks for DIO. We know the inode does not have any
+ * inline data now because ext4_dio_supported() checked for that.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
if (offset < 0)
return offset;
diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c
index 37347ba868b7..d18c4cd4c63f 100644
--- a/fs/ext4/fsmap.c
+++ b/fs/ext4/fsmap.c
@@ -486,6 +486,8 @@ static int ext4_getfsmap_datadev(struct super_block *sb,
keys[0].fmr_physical = bofs;
if (keys[1].fmr_physical >= eofs)
keys[1].fmr_physical = eofs - 1;
+ if (keys[1].fmr_physical < keys[0].fmr_physical)
+ return 0;
start_fsb = keys[0].fmr_physical;
end_fsb = keys[1].fmr_physical;
@@ -574,8 +576,8 @@ static bool ext4_getfsmap_is_valid_device(struct super_block *sb,
if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX ||
fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev))
return true;
- if (EXT4_SB(sb)->journal_bdev &&
- fm->fmr_device == new_encode_dev(EXT4_SB(sb)->journal_bdev->bd_dev))
+ if (EXT4_SB(sb)->s_journal_bdev &&
+ fm->fmr_device == new_encode_dev(EXT4_SB(sb)->s_journal_bdev->bd_dev))
return true;
return false;
}
@@ -645,9 +647,9 @@ int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head,
memset(handlers, 0, sizeof(handlers));
handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev);
handlers[0].gfd_fn = ext4_getfsmap_datadev;
- if (EXT4_SB(sb)->journal_bdev) {
+ if (EXT4_SB(sb)->s_journal_bdev) {
handlers[1].gfd_dev = new_encode_dev(
- EXT4_SB(sb)->journal_bdev->bd_dev);
+ EXT4_SB(sb)->s_journal_bdev->bd_dev);
handlers[1].gfd_fn = ext4_getfsmap_logdev;
}
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 83846cc81485..cbde5a096c7b 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -500,7 +500,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
goto fallback;
}
- max_dirs = ndirs / ngroups + inodes_per_group / 16;
+ max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16;
min_inodes = avefreei - inodes_per_group*flex_size / 4;
if (min_inodes < 1)
min_inodes = 1;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 36699a131168..25532bac7777 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -148,6 +148,7 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
struct super_block *sb = inode->i_sb;
Indirect *p = chain;
struct buffer_head *bh;
+ unsigned int key;
int ret = -EIO;
*err = 0;
@@ -156,7 +157,13 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
if (!p->key)
goto no_block;
while (--depth) {
- bh = sb_getblk(sb, le32_to_cpu(p->key));
+ key = le32_to_cpu(p->key);
+ if (key > ext4_blocks_count(EXT4_SB(sb)->s_es)) {
+ /* the block was out of range */
+ ret = -EFSCORRUPTED;
+ goto failure;
+ }
+ bh = sb_getblk(sb, key);
if (unlikely(!bh)) {
ret = -ENOMEM;
goto failure;
@@ -629,6 +636,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
ext4_update_inode_fsync_trans(handle, inode, 1);
count = ar.len;
+
+ /*
+ * Update reserved blocks/metadata blocks after successful block
+ * allocation which had been deferred till now.
+ */
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_da_update_reserve_space(inode, count, 1);
+
got_it:
map->m_flags |= EXT4_MAP_MAPPED;
map->m_pblk = le32_to_cpu(chain[depth-1].key);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 62384ae77a78..549ceac9099c 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -32,8 +32,12 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
struct ext4_xattr_ibody_header *header;
struct ext4_xattr_entry *entry;
struct ext4_inode *raw_inode;
+ void *end;
int free, min_offs;
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
+ return 0;
+
min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
EXT4_GOOD_OLD_INODE_SIZE -
EXT4_I(inode)->i_extra_isize -
@@ -52,14 +56,23 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
raw_inode = ext4_raw_inode(iloc);
header = IHDR(inode, raw_inode);
entry = IFIRST(header);
+ end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size;
/* Compute min_offs. */
- for (; !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) {
+ while (!IS_LAST_ENTRY(entry)) {
+ void *next = EXT4_XATTR_NEXT(entry);
+
+ if (next >= end) {
+ EXT4_ERROR_INODE(inode,
+ "corrupt xattr in inline inode");
+ return 0;
+ }
if (!entry->e_value_inum && entry->e_value_size) {
size_t offs = le16_to_cpu(entry->e_value_offs);
if (offs < min_offs)
min_offs = offs;
}
+ entry = next;
}
free = min_offs -
((void *)entry - (void *)IFIRST(header)) - sizeof(__u32);
@@ -154,7 +167,6 @@ int ext4_find_inline_data_nolock(struct inode *inode)
(void *)ext4_raw_inode(&is.iloc));
EXT4_I(inode)->i_inline_size = EXT4_MIN_INLINE_DATA_SIZE +
le32_to_cpu(is.s.here->e_value_size);
- ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
}
out:
brelse(is.iloc.bh);
@@ -204,7 +216,7 @@ out:
/*
* write the buffer to the inline inode.
* If 'create' is set, we don't need to do the extra copy in the xattr
- * value since it is already handled by ext4_xattr_ibody_inline_set.
+ * value since it is already handled by ext4_xattr_ibody_set.
* That saves us one memcpy.
*/
static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc,
@@ -286,7 +298,7 @@ static int ext4_create_inline_data(handle_t *handle,
BUG_ON(!is.s.not_found);
- error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error) {
if (error == -ENOSPC)
ext4_clear_inode_state(inode,
@@ -346,7 +358,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
error = ext4_xattr_ibody_get(inode, i.name_index, i.name,
value, len);
- if (error == -ENODATA)
+ if (error < 0)
goto out;
BUFFER_TRACE(is.iloc.bh, "get_write_access");
@@ -358,7 +370,7 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode,
i.value = value;
i.value_len = len;
- error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error)
goto out;
@@ -431,7 +443,7 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle,
if (error)
goto out;
- error = ext4_xattr_ibody_inline_set(handle, inode, &i, &is);
+ error = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (error)
goto out;
@@ -1170,6 +1182,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle,
ext4_initialize_dirent_tail(dir_block,
inode->i_sb->s_blocksize);
set_buffer_uptodate(dir_block);
+ unlock_buffer(dir_block);
err = ext4_handle_dirty_dirblock(handle, inode, dir_block);
if (err)
return err;
@@ -1243,6 +1256,7 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
if (!S_ISDIR(inode->i_mode)) {
memcpy(data_bh->b_data, buf, inline_size);
set_buffer_uptodate(data_bh);
+ unlock_buffer(data_bh);
error = ext4_handle_dirty_metadata(handle,
inode, data_bh);
} else {
@@ -1250,7 +1264,6 @@ static int ext4_convert_inline_data_nolock(handle_t *handle,
buf, inline_size);
}
- unlock_buffer(data_bh);
out_restore:
if (error)
ext4_restore_inline_data(handle, inode, iloc, buf, inline_size);
@@ -1967,8 +1980,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
i.value = value;
i.value_len = i_size > EXT4_MIN_INLINE_DATA_SIZE ?
i_size - EXT4_MIN_INLINE_DATA_SIZE : 0;
- err = ext4_xattr_ibody_inline_set(handle, inode,
- &i, &is);
+ err = ext4_xattr_ibody_set(handle, inode, &i, &is);
if (err)
goto out_error;
}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d8fee911d4f4..8a0bca3b653b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -207,6 +207,8 @@ void ext4_evict_inode(struct inode *inode)
trace_ext4_evict_inode(inode);
+ if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)
+ ext4_evict_ea_inode(inode);
if (inode->i_nlink) {
/*
* When journalling data dirty buffers are tracked only in the
@@ -667,16 +669,6 @@ found:
*/
ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
}
-
- /*
- * Update reserved blocks/metadata blocks after successful
- * block allocation which had been deferred till now. We don't
- * support fallocate for non extent files. So we can update
- * reserve space here.
- */
- if ((retval > 0) &&
- (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
- ext4_da_update_reserve_space(inode, retval, 1);
}
if (retval > 0) {
@@ -1317,6 +1309,13 @@ retry_grab:
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
+ /*
+ * The same as page allocation, we prealloc buffer heads before
+ * starting the handle.
+ */
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, inode->i_sb->s_blocksize, 0);
+
unlock_page(page);
retry_journal:
@@ -1430,7 +1429,8 @@ static int ext4_write_end(struct file *file,
bool verity = ext4_verity_in_progress(inode);
trace_ext4_write_end(inode, pos, len, copied);
- if (inline_data) {
+ if (inline_data &&
+ ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_write_inline_data_end(inode, pos, len,
copied, page);
if (ret < 0) {
@@ -1717,7 +1717,14 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd,
ext4_lblk_t start, last;
start = index << (PAGE_SHIFT - inode->i_blkbits);
last = end << (PAGE_SHIFT - inode->i_blkbits);
+
+ /*
+ * avoid racing with extent status tree scans made by
+ * ext4_insert_delayed_block()
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
ext4_es_remove_extent(inode, start, last - start + 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
}
pagevec_init(&pvec);
@@ -4523,7 +4530,7 @@ int ext4_truncate(struct inode *inode)
trace_ext4_truncate_enter(inode);
if (!ext4_can_truncate(inode))
- return 0;
+ goto out_trace;
ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
@@ -4534,16 +4541,15 @@ int ext4_truncate(struct inode *inode)
int has_inline = 1;
err = ext4_inline_data_truncate(inode, &has_inline);
- if (err)
- return err;
- if (has_inline)
- return 0;
+ if (err || has_inline)
+ goto out_trace;
}
/* If we zero-out tail of the page, we have to create jinode for jbd2 */
if (inode->i_size & (inode->i_sb->s_blocksize - 1)) {
- if (ext4_inode_attach_jinode(inode) < 0)
- return 0;
+ err = ext4_inode_attach_jinode(inode);
+ if (err)
+ goto out_trace;
}
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4552,8 +4558,10 @@ int ext4_truncate(struct inode *inode)
credits = ext4_blocks_for_truncate(inode);
handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
- if (IS_ERR(handle))
- return PTR_ERR(handle);
+ if (IS_ERR(handle)) {
+ err = PTR_ERR(handle);
+ goto out_trace;
+ }
if (inode->i_size & (inode->i_sb->s_blocksize - 1))
ext4_block_truncate_page(handle, mapping, inode->i_size);
@@ -4602,6 +4610,7 @@ out_stop:
ext4_mark_inode_dirty(handle, inode);
ext4_journal_stop(handle);
+out_trace:
trace_ext4_truncate_exit(inode);
return err;
}
@@ -4638,9 +4647,17 @@ static int __ext4_get_inode_loc(struct inode *inode,
inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
inode_offset = ((inode->i_ino - 1) %
EXT4_INODES_PER_GROUP(sb));
- block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block);
iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb);
+ block = ext4_inode_table(sb, gdp);
+ if ((block <= le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) ||
+ (block >= ext4_blocks_count(EXT4_SB(sb)->s_es))) {
+ ext4_error(sb, "Invalid inode table block %llu in "
+ "block_group %u", block, iloc->block_group);
+ return -EFSCORRUPTED;
+ }
+ block += (inode_offset / inodes_per_block);
+
bh = sb_getblk(sb, block);
if (unlikely(!bh))
return -ENOMEM;
@@ -4834,11 +4851,15 @@ static inline int ext4_iget_extra_inode(struct inode *inode,
__le32 *magic = (void *)raw_inode +
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
- if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
- EXT4_INODE_SIZE(inode->i_sb) &&
+ if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
+ int err;
+
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
- return ext4_find_inline_data_nolock(inode);
+ err = ext4_find_inline_data_nolock(inode);
+ if (!err && ext4_has_inline_data(inode))
+ ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
+ return err;
} else
EXT4_I(inode)->i_inline_off = 0;
return 0;
@@ -4872,6 +4893,24 @@ static inline u64 ext4_inode_peek_iversion(const struct inode *inode)
return inode_peek_iversion(inode);
}
+static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags)
+
+{
+ if (flags & EXT4_IGET_EA_INODE) {
+ if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+ return "missing EA_INODE flag";
+ if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) ||
+ EXT4_I(inode)->i_file_acl)
+ return "ea_inode with extended attributes";
+ } else {
+ if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
+ return "unexpected EA_INODE flag";
+ }
+ if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD))
+ return "unexpected bad inode w/o EXT4_IGET_BAD";
+ return NULL;
+}
+
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ext4_iget_flags flags, const char *function,
unsigned int line)
@@ -4880,6 +4919,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
struct ext4_inode *raw_inode;
struct ext4_inode_info *ei;
struct inode *inode;
+ const char *err_str;
journal_t *journal = EXT4_SB(sb)->s_journal;
long ret;
loff_t size;
@@ -4903,8 +4943,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
inode = iget_locked(sb, ino);
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode->i_state & I_NEW)) {
+ if ((err_str = check_igot_inode(inode, flags)) != NULL) {
+ ext4_error_inode(inode, function, line, 0, err_str);
+ iput(inode);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
return inode;
+ }
ei = EXT4_I(inode);
iloc.bh = NULL;
@@ -4914,13 +4960,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
goto bad_inode;
raw_inode = ext4_raw_inode(&iloc);
- if ((ino == EXT4_ROOT_INO) && (raw_inode->i_links_count == 0)) {
- ext4_error_inode(inode, function, line, 0,
- "iget: root inode unallocated");
- ret = -EFSCORRUPTED;
- goto bad_inode;
- }
-
if ((flags & EXT4_IGET_HANDLE) &&
(raw_inode->i_links_count == 0) && (raw_inode->i_mode == 0)) {
ret = -ESTALE;
@@ -4991,11 +5030,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
* NeilBrown 1999oct15
*/
if (inode->i_nlink == 0) {
- if ((inode->i_mode == 0 ||
+ if ((inode->i_mode == 0 || flags & EXT4_IGET_SPECIAL ||
!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) &&
ino != EXT4_BOOT_LOADER_INO) {
- /* this inode is deleted */
- ret = -ESTALE;
+ /* this inode is deleted or unallocated */
+ if (flags & EXT4_IGET_SPECIAL) {
+ ext4_error_inode(inode, function, line, 0,
+ "iget: special inode unallocated");
+ ret = -EFSCORRUPTED;
+ } else
+ ret = -ESTALE;
goto bad_inode;
}
/* The only unlinked inodes we let through here have
@@ -5171,8 +5215,13 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb))
ext4_error_inode(inode, function, line, 0,
"casefold flag without casefold feature");
- brelse(iloc.bh);
+ if ((err_str = check_igot_inode(inode, flags)) != NULL) {
+ ext4_error_inode(inode, function, line, 0, err_str);
+ ret = -EFSCORRUPTED;
+ goto bad_inode;
+ }
+ brelse(iloc.bh);
unlock_new_inode(inode);
return inode;
@@ -6032,6 +6081,14 @@ static int __ext4_expand_extra_isize(struct inode *inode,
return 0;
}
+ /*
+ * We may need to allocate external xattr block so we need quotas
+ * initialized. Here we can be called with various locks held so we
+ * cannot affort to initialize quotas ourselves. So just bail.
+ */
+ if (dquot_initialize_needed(inode))
+ return -EAGAIN;
+
/* try to expand with EAs present */
error = ext4_expand_extra_isize_ea(inode, new_extra_isize,
raw_inode, handle);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 9fa20f9ba52b..ae47505964c4 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -121,7 +121,8 @@ static long swap_inode_boot_loader(struct super_block *sb,
blkcnt_t blocks;
unsigned short bytes;
- inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO, EXT4_IGET_SPECIAL);
+ inode_bl = ext4_iget(sb, EXT4_BOOT_LOADER_INO,
+ EXT4_IGET_SPECIAL | EXT4_IGET_BAD);
if (IS_ERR(inode_bl))
return PTR_ERR(inode_bl);
ei_bl = EXT4_I(inode_bl);
@@ -169,7 +170,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
/* Protect extent tree against block allocations via delalloc */
ext4_double_down_write_data_sem(inode, inode_bl);
- if (inode_bl->i_nlink == 0) {
+ if (is_bad_inode(inode_bl) || !S_ISREG(inode_bl->i_mode)) {
/* this inode has never been used as a BOOT_LOADER */
set_nlink(inode_bl, 1);
i_uid_write(inode_bl, 0);
@@ -178,6 +179,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
ei_bl->i_flags = 0;
inode_set_iversion(inode_bl, 1);
i_size_write(inode_bl, 0);
+ EXT4_I(inode_bl)->i_disksize = inode_bl->i_size;
inode_bl->i_mode = S_IFREG;
if (ext4_has_feature_extents(sb)) {
ext4_set_inode_flag(inode_bl, EXT4_INODE_EXTENTS);
@@ -460,6 +462,10 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
if (ext4_is_quota_file(inode))
return err;
+ err = dquot_initialize(inode);
+ if (err)
+ return err;
+
err = ext4_get_inode_loc(inode, &iloc);
if (err)
return err;
@@ -475,10 +481,6 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
brelse(iloc.bh);
}
- err = dquot_initialize(inode);
- if (err)
- return err;
-
handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
EXT4_QUOTA_INIT_BLOCKS(sb) +
EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
@@ -571,6 +573,7 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
__u32 flags;
+ struct super_block *ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -589,7 +592,9 @@ static int ext4_shutdown(struct super_block *sb, unsigned long arg)
switch (flags) {
case EXT4_GOING_FLAGS_DEFAULT:
- freeze_bdev(sb->s_bdev);
+ ret = freeze_bdev(sb->s_bdev);
+ if (IS_ERR(ret))
+ return PTR_ERR(ret);
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
thaw_bdev(sb->s_bdev, sb);
break;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 3c3166ba4364..b2e7b1907d41 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -16,6 +16,7 @@
#include <linux/slab.h>
#include <linux/nospec.h>
#include <linux/backing-dev.h>
+#include <linux/freezer.h>
#include <trace/events/ext4.h>
#ifdef CONFIG_EXT4_DEBUG
@@ -1801,6 +1802,9 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
return err;
ext4_lock_group(ac->ac_sb, group);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ goto out;
+
max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
if (max > 0) {
@@ -1808,6 +1812,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
ext4_mb_use_best_found(ac, e4b);
}
+out:
ext4_unlock_group(ac->ac_sb, group);
ext4_mb_unload_buddy(e4b);
@@ -1834,12 +1839,10 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
if (err)
return err;
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
- ext4_mb_unload_buddy(e4b);
- return 0;
- }
-
ext4_lock_group(ac->ac_sb, group);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ goto out;
+
max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
ex.fe_logical = 0xDEADFA11; /* debug value */
@@ -1872,6 +1875,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ac->ac_b_ex = ex;
ext4_mb_use_best_found(ac, e4b);
}
+out:
ext4_unlock_group(ac->ac_sb, group);
ext4_mb_unload_buddy(e4b);
@@ -3091,9 +3095,9 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_super_block *es = sbi->s_es;
int bsbits, max;
- ext4_lblk_t end;
- loff_t size, start_off;
+ loff_t size, start_off, end;
loff_t orig_size __maybe_unused;
ext4_lblk_t start;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
@@ -3122,7 +3126,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* first, let's learn actual file size
* given current request is allocated */
- size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
+ size = extent_logical_end(sbi, &ac->ac_o_ex);
size = size << bsbits;
if (size < i_size_read(ac->ac_inode))
size = i_size_read(ac->ac_inode);
@@ -3181,6 +3185,10 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
start = max(start, rounddown(ac->ac_o_ex.fe_logical,
(ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
+ /* avoid unnecessary preallocation that may trigger assertions */
+ if (start + size > EXT_MAX_BLOCKS)
+ size = EXT_MAX_BLOCKS - start;
+
/* don't cover already allocated blocks in selected range */
if (ar->pleft && start <= ar->lleft) {
size -= ar->lleft + 1 - start;
@@ -3201,7 +3209,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* check we don't cross already preallocated blocks */
rcu_read_lock();
list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
- ext4_lblk_t pa_end;
+ loff_t pa_end;
if (pa->pa_deleted)
continue;
@@ -3211,8 +3219,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
continue;
}
- pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
- pa->pa_len);
+ pa_end = pa_logical_end(EXT4_SB(ac->ac_sb), pa);
/* PA must not overlap original request */
BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end ||
@@ -3241,12 +3248,11 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
/* XXX: extra loop to check we really don't overlap preallocations */
rcu_read_lock();
list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
- ext4_lblk_t pa_end;
+ loff_t pa_end;
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0) {
- pa_end = pa->pa_lstart + EXT4_C2B(EXT4_SB(ac->ac_sb),
- pa->pa_len);
+ pa_end = pa_logical_end(EXT4_SB(ac->ac_sb), pa);
BUG_ON(!(start >= pa_end || end <= pa->pa_lstart));
}
spin_unlock(&pa->pa_lock);
@@ -3271,18 +3277,21 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
/* define goal start in order to merge */
- if (ar->pright && (ar->lright == (start + size))) {
+ if (ar->pright && (ar->lright == (start + size)) &&
+ ar->pright >= size &&
+ ar->pright - size >= le32_to_cpu(es->s_first_data_block)) {
/* merge to the right */
ext4_get_group_no_and_offset(ac->ac_sb, ar->pright - size,
- &ac->ac_f_ex.fe_group,
- &ac->ac_f_ex.fe_start);
+ &ac->ac_g_ex.fe_group,
+ &ac->ac_g_ex.fe_start);
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
- if (ar->pleft && (ar->lleft + 1 == start)) {
+ if (ar->pleft && (ar->lleft + 1 == start) &&
+ ar->pleft + 1 < ext4_blocks_count(es)) {
/* merge to the left */
ext4_get_group_no_and_offset(ac->ac_sb, ar->pleft + 1,
- &ac->ac_f_ex.fe_group,
- &ac->ac_f_ex.fe_start);
+ &ac->ac_g_ex.fe_group,
+ &ac->ac_g_ex.fe_start);
ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL;
}
@@ -3374,6 +3383,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
BUG_ON(start < pa->pa_pstart);
BUG_ON(end > pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len));
BUG_ON(pa->pa_free < len);
+ BUG_ON(ac->ac_b_ex.fe_len <= 0);
pa->pa_free -= len;
mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa);
@@ -3456,8 +3466,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/* all fields in this condition don't change,
* so we can skip locking for them */
if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
- ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
- EXT4_C2B(sbi, pa->pa_len)))
+ ac->ac_o_ex.fe_logical >= pa_logical_end(sbi, pa))
continue;
/* non-extent files can't have physical blocks past 2^32 */
@@ -3678,37 +3687,51 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
return -ENOMEM;
if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
- int winl;
- int wins;
- int win;
- int offs;
-
- /* we can't allocate as much as normalizer wants.
- * so, found space must get proper lstart
- * to cover original request */
+ struct ext4_free_extent ex = {
+ .fe_logical = ac->ac_g_ex.fe_logical,
+ .fe_len = ac->ac_g_ex.fe_len,
+ };
+ loff_t orig_goal_end = extent_logical_end(sbi, &ex);
+ loff_t o_ex_end = extent_logical_end(sbi, &ac->ac_o_ex);
+
+ /*
+ * We can't allocate as much as normalizer wants, so we try
+ * to get proper lstart to cover the original request, except
+ * when the goal doesn't cover the original request as below:
+ *
+ * orig_ex:2045/2055(10), isize:8417280 -> normalized:0/2048
+ * best_ex:0/200(200) -> adjusted: 1848/2048(200)
+ */
BUG_ON(ac->ac_g_ex.fe_logical > ac->ac_o_ex.fe_logical);
BUG_ON(ac->ac_g_ex.fe_len < ac->ac_o_ex.fe_len);
- /* we're limited by original request in that
- * logical block must be covered any way
- * winl is window we can move our chunk within */
- winl = ac->ac_o_ex.fe_logical - ac->ac_g_ex.fe_logical;
+ /*
+ * Use the below logic for adjusting best extent as it keeps
+ * fragmentation in check while ensuring logical range of best
+ * extent doesn't overflow out of goal extent:
+ *
+ * 1. Check if best ex can be kept at end of goal and still
+ * cover original start
+ * 2. Else, check if best ex can be kept at start of goal and
+ * still cover original end
+ * 3. Else, keep the best ex at start of original request.
+ */
+ ex.fe_len = ac->ac_b_ex.fe_len;
- /* also, we should cover whole original request */
- wins = EXT4_C2B(sbi, ac->ac_b_ex.fe_len - ac->ac_o_ex.fe_len);
+ ex.fe_logical = orig_goal_end - EXT4_C2B(sbi, ex.fe_len);
+ if (ac->ac_o_ex.fe_logical >= ex.fe_logical)
+ goto adjust_bex;
- /* the smallest one defines real window */
- win = min(winl, wins);
+ ex.fe_logical = ac->ac_g_ex.fe_logical;
+ if (o_ex_end <= extent_logical_end(sbi, &ex))
+ goto adjust_bex;
- offs = ac->ac_o_ex.fe_logical %
- EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
- if (offs && offs < win)
- win = offs;
+ ex.fe_logical = ac->ac_o_ex.fe_logical;
+adjust_bex:
+ ac->ac_b_ex.fe_logical = ex.fe_logical;
- ac->ac_b_ex.fe_logical = ac->ac_o_ex.fe_logical -
- EXT4_NUM_B2C(sbi, win);
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
- BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
+ BUG_ON(extent_logical_end(sbi, &ex) > orig_goal_end);
}
/* preallocation can change ac_b_ex, thus we store actually
@@ -3895,7 +3918,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
trace_ext4_mb_release_group_pa(sb, pa);
BUG_ON(pa->pa_deleted == 0);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit);
- BUG_ON(group != e4b->bd_group && pa->pa_len != 0);
+ if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
+ ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
+ e4b->bd_group, group, pa->pa_pstart);
+ return 0;
+ }
mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
@@ -4212,7 +4239,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
return;
- size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
+ size = extent_logical_end(sbi, &ac->ac_o_ex);
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
@@ -4929,8 +4956,8 @@ do_more:
* them with group lock_held
*/
if (test_opt(sb, DISCARD)) {
- err = ext4_issue_discard(sb, block_group, bit, count,
- NULL);
+ err = ext4_issue_discard(sb, block_group, bit,
+ count_clusters, NULL);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
" group:%d block:%d count:%lu failed"
@@ -5138,19 +5165,19 @@ error_return:
* @sb: super block for the file system
* @start: starting block of the free extent in the alloc. group
* @count: number of blocks to TRIM
- * @group: alloc. group we are working with
* @e4b: ext4 buddy for the group
*
* Trim "count" blocks starting at "start" in the "group". To assure that no
* one will allocate those blocks, mark it as used in buddy bitmap. This must
* be called with under the group lock.
*/
-static int ext4_trim_extent(struct super_block *sb, int start, int count,
- ext4_group_t group, struct ext4_buddy *e4b)
+static int ext4_trim_extent(struct super_block *sb,
+ int start, int count, struct ext4_buddy *e4b)
__releases(bitlock)
__acquires(bitlock)
{
struct ext4_free_extent ex;
+ ext4_group_t group = e4b->bd_group;
int ret = 0;
trace_ext4_trim_extent(sb, group, start, count);
@@ -5173,6 +5200,81 @@ __acquires(bitlock)
return ret;
}
+static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
+ ext4_group_t grp)
+{
+ unsigned long nr_clusters_in_group;
+
+ if (grp < (ext4_get_groups_count(sb) - 1))
+ nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb);
+ else
+ nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) -
+ ext4_group_first_block_no(sb, grp))
+ >> EXT4_CLUSTER_BITS(sb);
+
+ return nr_clusters_in_group - 1;
+}
+
+static bool ext4_trim_interrupted(void)
+{
+ return fatal_signal_pending(current) || freezing(current);
+}
+
+static int ext4_try_to_trim_range(struct super_block *sb,
+ struct ext4_buddy *e4b, ext4_grpblk_t start,
+ ext4_grpblk_t max, ext4_grpblk_t minblocks)
+{
+ ext4_grpblk_t next, count, free_count, last, origin_start;
+ bool set_trimmed = false;
+ void *bitmap;
+
+ last = ext4_last_grp_cluster(sb, e4b->bd_group);
+ bitmap = e4b->bd_bitmap;
+ if (start == 0 && max >= last)
+ set_trimmed = true;
+ origin_start = start;
+ start = max(e4b->bd_info->bb_first_free, start);
+ count = 0;
+ free_count = 0;
+
+ while (start <= max) {
+ start = mb_find_next_zero_bit(bitmap, max + 1, start);
+ if (start > max)
+ break;
+
+ next = mb_find_next_bit(bitmap, last + 1, start);
+ if (origin_start == 0 && next >= last)
+ set_trimmed = true;
+
+ if ((next - start) >= minblocks) {
+ int ret = ext4_trim_extent(sb, start, next - start, e4b);
+
+ if (ret && ret != -EOPNOTSUPP)
+ return count;
+ count += next - start;
+ }
+ free_count += next - start;
+ start = next + 1;
+
+ if (ext4_trim_interrupted())
+ return count;
+
+ if (need_resched()) {
+ ext4_unlock_group(sb, e4b->bd_group);
+ cond_resched();
+ ext4_lock_group(sb, e4b->bd_group);
+ }
+
+ if ((e4b->bd_info->bb_free - free_count) < minblocks)
+ break;
+ }
+
+ if (set_trimmed)
+ EXT4_MB_GRP_SET_TRIMMED(e4b->bd_info);
+
+ return count;
+}
+
/**
* ext4_trim_all_free -- function to trim all free space in alloc. group
* @sb: super block for file system
@@ -5196,10 +5298,8 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ext4_grpblk_t start, ext4_grpblk_t max,
ext4_grpblk_t minblocks)
{
- void *bitmap;
- ext4_grpblk_t next, count = 0, free_count = 0;
struct ext4_buddy e4b;
- int ret = 0;
+ int ret;
trace_ext4_trim_all_free(sb, group, start, max);
@@ -5209,58 +5309,20 @@ ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
ret, group);
return ret;
}
- bitmap = e4b.bd_bitmap;
ext4_lock_group(sb, group);
- if (EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) &&
- minblocks >= atomic_read(&EXT4_SB(sb)->s_last_trim_minblks))
- goto out;
-
- start = (e4b.bd_info->bb_first_free > start) ?
- e4b.bd_info->bb_first_free : start;
- while (start <= max) {
- start = mb_find_next_zero_bit(bitmap, max + 1, start);
- if (start > max)
- break;
- next = mb_find_next_bit(bitmap, max + 1, start);
-
- if ((next - start) >= minblocks) {
- ret = ext4_trim_extent(sb, start,
- next - start, group, &e4b);
- if (ret && ret != -EOPNOTSUPP)
- break;
- ret = 0;
- count += next - start;
- }
- free_count += next - start;
- start = next + 1;
-
- if (fatal_signal_pending(current)) {
- count = -ERESTARTSYS;
- break;
- }
-
- if (need_resched()) {
- ext4_unlock_group(sb, group);
- cond_resched();
- ext4_lock_group(sb, group);
- }
-
- if ((e4b.bd_info->bb_free - free_count) < minblocks)
- break;
- }
+ if (!EXT4_MB_GRP_WAS_TRIMMED(e4b.bd_info) ||
+ minblocks < EXT4_SB(sb)->s_last_trim_minblks)
+ ret = ext4_try_to_trim_range(sb, &e4b, start, max, minblocks);
+ else
+ ret = 0;
- if (!ret) {
- ret = count;
- EXT4_MB_GRP_SET_TRIMMED(e4b.bd_info);
- }
-out:
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
ext4_debug("trimmed %d blocks in the group %d\n",
- count, group);
+ ret, group);
return ret;
}
@@ -5305,7 +5367,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
if (minlen > EXT4_CLUSTERS_PER_GROUP(sb))
goto out;
}
- if (end >= max_blks)
+ if (end >= max_blks - 1)
end = max_blks - 1;
if (end <= first_data_blk)
goto out;
@@ -5322,6 +5384,8 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
for (group = first_group; group <= last_group; group++) {
+ if (ext4_trim_interrupted())
+ break;
grp = ext4_get_group_info(sb, group);
/* We only do this if the grp has never been initialized */
if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
@@ -5338,10 +5402,9 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
*/
if (group == last_group)
end = last_cluster;
-
if (grp->bb_free >= minlen) {
cnt = ext4_trim_all_free(sb, group, first_cluster,
- end, minlen);
+ end, minlen);
if (cnt < 0) {
ret = cnt;
break;
@@ -5357,7 +5420,7 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
}
if (!ret)
- atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
+ EXT4_SB(sb)->s_last_trim_minblks = minlen;
out:
range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits;
@@ -5386,8 +5449,7 @@ ext4_mballoc_query_range(
ext4_lock_group(sb, group);
- start = (e4b.bd_info->bb_first_free > start) ?
- e4b.bd_info->bb_first_free : start;
+ start = max(e4b.bd_info->bb_first_free, start);
if (end >= EXT4_CLUSTERS_PER_GROUP(sb))
end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 88c98f17e3d9..4e442bad6d73 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -199,6 +199,20 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
(fex->fe_start << EXT4_SB(sb)->s_cluster_bits);
}
+static inline loff_t extent_logical_end(struct ext4_sb_info *sbi,
+ struct ext4_free_extent *fex)
+{
+ /* Use loff_t to avoid end exceeding ext4_lblk_t max. */
+ return (loff_t)fex->fe_logical + EXT4_C2B(sbi, fex->fe_len);
+}
+
+static inline loff_t pa_logical_end(struct ext4_sb_info *sbi,
+ struct ext4_prealloc_space *pa)
+{
+ /* Use loff_t to avoid end exceeding ext4_lblk_t max. */
+ return (loff_t)pa->pa_lstart + EXT4_C2B(sbi, pa->pa_len);
+}
+
typedef int (*ext4_mballoc_query_range_fn)(
struct super_block *sb,
ext4_group_t agno,
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index c5b2ea1a9372..dbba3c3a2f06 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -435,7 +435,7 @@ int ext4_ext_migrate(struct inode *inode)
struct inode *tmp_inode = NULL;
struct migrate_struct lb;
unsigned long max_entries;
- __u32 goal;
+ __u32 goal, tmp_csum_seed;
uid_t owner[2];
/*
@@ -443,7 +443,8 @@ int ext4_ext_migrate(struct inode *inode)
* already is extent-based, error out.
*/
if (!ext4_has_feature_extents(inode->i_sb) ||
- (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+ ext4_has_inline_data(inode))
return -EINVAL;
if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
@@ -483,6 +484,7 @@ int ext4_ext_migrate(struct inode *inode)
* the migration.
*/
ei = EXT4_I(inode);
+ tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed;
EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed;
i_size_write(tmp_inode, i_size_read(inode));
/*
@@ -593,6 +595,7 @@ err_out:
* the inode is not visible to user space.
*/
tmp_inode->i_blocks = 0;
+ EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed;
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 30ce3dc69378..e71ab64f1df5 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -615,6 +615,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
goto out;
o_end = o_start + len;
+ *moved_len = 0;
while (o_start < o_end) {
struct ext4_extent *ex;
ext4_lblk_t cur_blk, next_blk;
@@ -670,7 +671,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
*/
ext4_double_up_write_data_sem(orig_inode, donor_inode);
/* Swap original branches with new branches */
- move_extent_per_page(o_filp, donor_inode,
+ *moved_len += move_extent_per_page(o_filp, donor_inode,
orig_page_index, donor_page_index,
offset_in_page, cur_len,
unwritten, &ret);
@@ -680,9 +681,6 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
o_start += cur_len;
d_start += cur_len;
}
- *moved_len = o_start - orig_blk;
- if (*moved_len > len)
- *moved_len = len;
out:
if (*moved_len) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d151892db8b0..87c6d619a564 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -54,6 +54,7 @@ static struct buffer_head *ext4_append(handle_t *handle,
struct inode *inode,
ext4_lblk_t *block)
{
+ struct ext4_map_blocks map;
struct buffer_head *bh;
int err;
@@ -63,6 +64,21 @@ static struct buffer_head *ext4_append(handle_t *handle,
return ERR_PTR(-ENOSPC);
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+ map.m_lblk = *block;
+ map.m_len = 1;
+
+ /*
+ * We're appending new directory block. Make sure the block is not
+ * allocated yet, otherwise we will end up corrupting the
+ * directory.
+ */
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err < 0)
+ return ERR_PTR(err);
+ if (err) {
+ EXT4_ERROR_INODE(inode, "Logical block already allocated");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
if (IS_ERR(bh))
@@ -309,17 +325,17 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
struct buffer_head *bh)
{
struct ext4_dir_entry_tail *t;
+ int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
#ifdef PARANOID
struct ext4_dir_entry *d, *top;
d = (struct ext4_dir_entry *)bh->b_data;
top = (struct ext4_dir_entry *)(bh->b_data +
- (EXT4_BLOCK_SIZE(inode->i_sb) -
- sizeof(struct ext4_dir_entry_tail)));
- while (d < top && d->rec_len)
+ (blocksize - sizeof(struct ext4_dir_entry_tail)));
+ while (d < top && ext4_rec_len_from_disk(d->rec_len, blocksize))
d = (struct ext4_dir_entry *)(((void *)d) +
- le16_to_cpu(d->rec_len));
+ ext4_rec_len_from_disk(d->rec_len, blocksize));
if (d != top)
return NULL;
@@ -330,7 +346,8 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
#endif
if (t->det_reserved_zero1 ||
- le16_to_cpu(t->det_rec_len) != sizeof(struct ext4_dir_entry_tail) ||
+ (ext4_rec_len_from_disk(t->det_rec_len, blocksize) !=
+ sizeof(struct ext4_dir_entry_tail)) ||
t->det_reserved_zero2 ||
t->det_reserved_ft != EXT4_FT_DIR_CSUM)
return NULL;
@@ -411,13 +428,14 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
struct ext4_dir_entry *dp;
struct dx_root_info *root;
int count_offset;
+ int blocksize = EXT4_BLOCK_SIZE(inode->i_sb);
+ unsigned int rlen = ext4_rec_len_from_disk(dirent->rec_len, blocksize);
- if (le16_to_cpu(dirent->rec_len) == EXT4_BLOCK_SIZE(inode->i_sb))
+ if (rlen == blocksize)
count_offset = 8;
- else if (le16_to_cpu(dirent->rec_len) == 12) {
+ else if (rlen == 12) {
dp = (struct ext4_dir_entry *)(((void *)dirent) + 12);
- if (le16_to_cpu(dp->rec_len) !=
- EXT4_BLOCK_SIZE(inode->i_sb) - 12)
+ if (ext4_rec_len_from_disk(dp->rec_len, blocksize) != blocksize - 12)
return NULL;
root = (struct dx_root_info *)(((void *)dp + 12));
if (root->reserved_zero ||
@@ -1230,6 +1248,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
unsigned int buflen = bh->b_size;
char *base = bh->b_data;
struct dx_hash_info h = *hinfo;
+ int blocksize = EXT4_BLOCK_SIZE(dir->i_sb);
if (ext4_has_metadata_csum(dir->i_sb))
buflen -= sizeof(struct ext4_dir_entry_tail);
@@ -1243,11 +1262,12 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh,
map_tail--;
map_tail->hash = h.hash;
map_tail->offs = ((char *) de - base)>>2;
- map_tail->size = le16_to_cpu(de->rec_len);
+ map_tail->size = ext4_rec_len_from_disk(de->rec_len,
+ blocksize);
count++;
cond_resched();
}
- de = ext4_next_entry(de, dir->i_sb->s_blocksize);
+ de = ext4_next_entry(de, blocksize);
}
return count;
}
@@ -1486,11 +1506,10 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir,
int has_inline_data = 1;
ret = ext4_find_inline_entry(dir, fname, res_dir,
&has_inline_data);
- if (has_inline_data) {
- if (inlined)
- *inlined = 1;
+ if (inlined)
+ *inlined = has_inline_data;
+ if (has_inline_data)
goto cleanup_and_exit;
- }
}
if ((namelen <= 2) && (name[0] == '.') &&
@@ -2125,8 +2144,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
memcpy(data2, de, len);
de = (struct ext4_dir_entry_2 *) data2;
top = data2 + len;
- while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
+ while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
+ (data2 + (blocksize - csum_size) -
+ (char *) de))) {
+ brelse(bh2);
+ brelse(bh);
+ return -EFSCORRUPTED;
+ }
de = de2;
+ }
de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
(char *) de, blocksize);
@@ -2913,11 +2940,8 @@ bool ext4_empty_dir(struct inode *inode)
de = (struct ext4_dir_entry_2 *) (bh->b_data +
(offset & (sb->s_blocksize - 1)));
if (ext4_check_dir_entry(inode, NULL, de, bh,
- bh->b_data, bh->b_size, offset)) {
- offset = (offset | (sb->s_blocksize - 1)) + 1;
- continue;
- }
- if (le32_to_cpu(de->inode)) {
+ bh->b_data, bh->b_size, offset) ||
+ le32_to_cpu(de->inode)) {
brelse(bh);
return false;
}
@@ -3609,7 +3633,8 @@ static void ext4_resetent(handle_t *handle, struct ext4_renament *ent,
* so the old->de may no longer valid and need to find it again
* before reset old inode info.
*/
- old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de,
+ &old.inlined);
if (IS_ERR(old.bh))
retval = PTR_ERR(old.bh);
if (!old.bh)
@@ -3759,6 +3784,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
retval = dquot_initialize(old.dir);
if (retval)
return retval;
+ retval = dquot_initialize(old.inode);
+ if (retval)
+ return retval;
retval = dquot_initialize(new.dir);
if (retval)
return retval;
@@ -3771,9 +3799,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
return retval;
}
- old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de, NULL);
+ old.bh = ext4_find_entry(old.dir, &old.dentry->d_name, &old.de,
+ &old.inlined);
if (IS_ERR(old.bh))
return PTR_ERR(old.bh);
+
/*
* Check for inode number is _not_ due to possible IO errors.
* We might rmdir the source, keep it as pwd of some process
@@ -3932,6 +3962,7 @@ release_bh:
brelse(old.dir_bh);
brelse(old.bh);
brelse(new.bh);
+
return retval;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index b66b335a0ca6..5858fb9b5cd5 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -380,7 +380,8 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
static int io_submit_add_bh(struct ext4_io_submit *io,
struct inode *inode,
- struct page *page,
+ struct page *pagecache_page,
+ struct page *bounce_page,
struct buffer_head *bh)
{
int ret;
@@ -395,10 +396,11 @@ submit_and_retry:
return ret;
io->io_bio->bi_write_hint = inode->i_write_hint;
}
- ret = bio_add_page(io->io_bio, page, bh->b_size, bh_offset(bh));
+ ret = bio_add_page(io->io_bio, bounce_page ?: pagecache_page,
+ bh->b_size, bh_offset(bh));
if (ret != bh->b_size)
goto submit_and_retry;
- wbc_account_cgroup_owner(io->io_wbc, page, bh->b_size);
+ wbc_account_cgroup_owner(io->io_wbc, pagecache_page, bh->b_size);
io->io_next_block++;
return 0;
}
@@ -511,7 +513,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
do {
if (!buffer_async_write(bh))
continue;
- ret = io_submit_add_bh(io, inode, bounce_page ?: page, bh);
+ ret = io_submit_add_bh(io, inode, page, bounce_page, bh);
if (ret) {
/*
* We only get here on ENOMEM. Not much else
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 6410c1e098d3..d4431ca0c10e 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -227,17 +227,24 @@ struct ext4_new_flex_group_data {
in the flex group */
__u16 *bg_flags; /* block group flags of groups
in @groups */
+ ext4_group_t resize_bg; /* number of allocated
+ new_group_data */
ext4_group_t count; /* number of groups in @groups
*/
};
/*
+ * Avoiding memory allocation failures due to too many groups added each time.
+ */
+#define MAX_RESIZE_BG 16384
+
+/*
* alloc_flex_gd() allocates a ext4_new_flex_group_data with size of
* @flexbg_size.
*
* Returns NULL on failure otherwise address of the allocated structure.
*/
-static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
+static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size)
{
struct ext4_new_flex_group_data *flex_gd;
@@ -245,17 +252,18 @@ static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
if (flex_gd == NULL)
goto out3;
- if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data))
- goto out2;
- flex_gd->count = flexbg_size;
+ if (unlikely(flexbg_size > MAX_RESIZE_BG))
+ flex_gd->resize_bg = MAX_RESIZE_BG;
+ else
+ flex_gd->resize_bg = flexbg_size;
- flex_gd->groups = kmalloc_array(flexbg_size,
+ flex_gd->groups = kmalloc_array(flex_gd->resize_bg,
sizeof(struct ext4_new_group_data),
GFP_NOFS);
if (flex_gd->groups == NULL)
goto out2;
- flex_gd->bg_flags = kmalloc_array(flexbg_size, sizeof(__u16),
+ flex_gd->bg_flags = kmalloc_array(flex_gd->resize_bg, sizeof(__u16),
GFP_NOFS);
if (flex_gd->bg_flags == NULL)
goto out1;
@@ -292,7 +300,7 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
*/
static int ext4_alloc_group_tables(struct super_block *sb,
struct ext4_new_flex_group_data *flex_gd,
- int flexbg_size)
+ unsigned int flexbg_size)
{
struct ext4_new_group_data *group_data = flex_gd->groups;
ext4_fsblk_t start_blk;
@@ -393,12 +401,12 @@ next_group:
group = group_data[0].group;
printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
- "%d groups, flexbg size is %d:\n", flex_gd->count,
+ "%u groups, flexbg size is %u:\n", flex_gd->count,
flexbg_size);
for (i = 0; i < flex_gd->count; i++) {
ext4_debug(
- "adding %s group %u: %u blocks (%d free, %d mdata blocks)\n",
+ "adding %s group %u: %u blocks (%u free, %u mdata blocks)\n",
ext4_bg_has_super(sb, group + i) ? "normal" :
"no-super", group + i,
group_data[i].blocks_count,
@@ -572,13 +580,8 @@ static int setup_new_flex_group_blocks(struct super_block *sb,
if (meta_bg == 0 && !ext4_bg_has_super(sb, group))
goto handle_itb;
- if (meta_bg == 1) {
- ext4_group_t first_group;
- first_group = ext4_meta_bg_first_group(sb, group);
- if (first_group != group + 1 &&
- first_group != group + EXT4_DESC_PER_BLOCK(sb) - 1)
- goto handle_itb;
- }
+ if (meta_bg == 1)
+ goto handle_itb;
block = start + ext4_bg_has_super(sb, group);
/* Copy all of the GDT blocks into the backup in this group */
@@ -1483,6 +1486,7 @@ static void ext4_update_super(struct super_block *sb,
* Update the fs overhead information
*/
ext4_calculate_overhead(sb);
+ es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: added group %u:"
@@ -1563,11 +1567,14 @@ exit_journal:
int gdb_num = group / EXT4_DESC_PER_BLOCK(sb);
int gdb_num_end = ((group + flex_gd->count - 1) /
EXT4_DESC_PER_BLOCK(sb));
- int meta_bg = ext4_has_feature_meta_bg(sb);
+ int meta_bg = ext4_has_feature_meta_bg(sb) &&
+ gdb_num >= le32_to_cpu(es->s_first_meta_bg);
+ sector_t padding_blocks = meta_bg ? 0 : sbi->s_sbh->b_blocknr -
+ ext4_group_first_block_no(sb, 0);
sector_t old_gdb = 0;
- update_backups(sb, sbi->s_sbh->b_blocknr, (char *)es,
- sizeof(struct ext4_super_block), 0);
+ update_backups(sb, ext4_group_first_block_no(sb, 0),
+ (char *)es, sizeof(struct ext4_super_block), 0);
for (; gdb_num <= gdb_num_end; gdb_num++) {
struct buffer_head *gdb_bh;
@@ -1575,8 +1582,8 @@ exit_journal:
gdb_num);
if (old_gdb == gdb_bh->b_blocknr)
continue;
- update_backups(sb, gdb_bh->b_blocknr, gdb_bh->b_data,
- gdb_bh->b_size, meta_bg);
+ update_backups(sb, gdb_bh->b_blocknr - padding_blocks,
+ gdb_bh->b_data, gdb_bh->b_size, meta_bg);
old_gdb = gdb_bh->b_blocknr;
}
}
@@ -1586,8 +1593,7 @@ exit:
static int ext4_setup_next_flex_gd(struct super_block *sb,
struct ext4_new_flex_group_data *flex_gd,
- ext4_fsblk_t n_blocks_count,
- unsigned long flexbg_size)
+ ext4_fsblk_t n_blocks_count)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
@@ -1611,7 +1617,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
BUG_ON(last);
ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
- last_group = group | (flexbg_size - 1);
+ last_group = group | (flex_gd->resize_bg - 1);
if (last_group > n_group)
last_group = n_group;
@@ -1774,7 +1780,7 @@ errout:
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
"blocks\n", ext4_blocks_count(es));
- update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
+ update_backups(sb, ext4_group_first_block_no(sb, 0),
(char *)es, sizeof(struct ext4_super_block), 0);
}
return err;
@@ -1937,9 +1943,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode)
errout:
ret = ext4_journal_stop(handle);
- if (!err)
- err = ret;
- return ret;
+ return err ? err : ret;
invalid_resize_inode:
ext4_error(sb, "corrupted/inconsistent resize inode");
@@ -1967,8 +1971,9 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
ext4_fsblk_t o_blocks_count;
ext4_fsblk_t n_blocks_count_retry = 0;
unsigned long last_update_time = 0;
- int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex;
+ int err = 0;
int meta_bg;
+ unsigned int flexbg_size = ext4_flex_bg_size(sbi);
/* See if the device is actually as big as what was requested */
bh = sb_bread(sb, n_blocks_count - 1);
@@ -1978,6 +1983,16 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
}
brelse(bh);
+ /*
+ * For bigalloc, trim the requested size to the nearest cluster
+ * boundary to avoid creating an unusable filesystem. We do this
+ * silently, instead of returning an error, to avoid breaking
+ * callers that blindly resize the filesystem to the full size of
+ * the underlying block device.
+ */
+ if (ext4_has_feature_bigalloc(sb))
+ n_blocks_count &= ~((1 << EXT4_CLUSTER_BITS(sb)) - 1);
+
retry:
o_blocks_count = ext4_blocks_count(es);
@@ -2079,7 +2094,7 @@ retry:
goto out;
}
- if (ext4_blocks_count(es) == n_blocks_count)
+ if (ext4_blocks_count(es) == n_blocks_count && n_blocks_count_retry == 0)
goto out;
err = ext4_alloc_flex_bg_array(sb, n_group + 1);
@@ -2099,8 +2114,7 @@ retry:
/* Add flex groups. Note that a regular group is a
* flex group with 1 group.
*/
- while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
- flexbg_size)) {
+ while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count)) {
if (jiffies - last_update_time > HZ * 10) {
if (last_update_time)
ext4_msg(sb, KERN_INFO,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index eba2506f4399..8ad3de7846c5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -906,10 +906,16 @@ static void ext4_blkdev_put(struct block_device *bdev)
static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
{
struct block_device *bdev;
- bdev = sbi->journal_bdev;
+ bdev = sbi->s_journal_bdev;
if (bdev) {
+ /*
+ * Invalidate the journal device's buffers. We don't want them
+ * floating about in memory - the physical journal device may
+ * hotswapped, and it breaks the `ro-after' testing code.
+ */
+ invalidate_bdev(bdev);
ext4_blkdev_put(bdev);
- sbi->journal_bdev = NULL;
+ sbi->s_journal_bdev = NULL;
}
}
@@ -1034,14 +1040,8 @@ static void ext4_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->journal_bdev && sbi->journal_bdev != sb->s_bdev) {
- /*
- * Invalidate the journal device's buffers. We don't want them
- * floating about in memory - the physical journal device may
- * hotswapped, and it breaks the `ro-after' testing code.
- */
- sync_blockdev(sbi->journal_bdev);
- invalidate_bdev(sbi->journal_bdev);
+ if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
+ sync_blockdev(sbi->s_journal_bdev);
ext4_blkdev_remove(sbi);
}
@@ -1085,6 +1085,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
return NULL;
inode_set_iversion(&ei->vfs_inode, 1);
+ ei->i_flags = 0;
spin_lock_init(&ei->i_raw_lock);
INIT_LIST_HEAD(&ei->i_prealloc_list);
spin_lock_init(&ei->i_prealloc_lock);
@@ -2474,11 +2475,9 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
crc = crc16(crc, (__u8 *)gdp, offset);
offset += sizeof(gdp->bg_checksum); /* skip checksum */
/* for checksum of struct ext4_group_desc do the rest...*/
- if (ext4_has_feature_64bit(sb) &&
- offset < le16_to_cpu(sbi->s_es->s_desc_size))
+ if (ext4_has_feature_64bit(sb) && offset < sbi->s_desc_size)
crc = crc16(crc, (__u8 *)gdp + offset,
- le16_to_cpu(sbi->s_es->s_desc_size) -
- offset);
+ sbi->s_desc_size - offset);
out:
return cpu_to_le16(crc);
@@ -3157,6 +3156,7 @@ static int ext4_lazyinit_thread(void *arg)
unsigned long next_wakeup, cur;
BUG_ON(NULL == eli);
+ set_freezable();
cont_thread:
while (true) {
@@ -3582,7 +3582,7 @@ int ext4_calculate_overhead(struct super_block *sb)
* Add the internal journal blocks whether the journal has been
* loaded or not
*/
- if (sbi->s_journal && !sbi->journal_bdev)
+ if (sbi->s_journal && !sbi->s_journal_bdev)
overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen);
else if (ext4_has_feature_journal(sb) && !sbi->s_journal && j_inum) {
/* j_inum for internal journal is non-zero */
@@ -4387,30 +4387,31 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
ext4_has_feature_journal_needs_recovery(sb)) {
ext4_msg(sb, KERN_ERR, "required journal recovery "
"suppressed and not mounted read-only");
- goto failed_mount_wq;
+ goto failed_mount3a;
} else {
/* Nojournal mode, all journal mount options are illegal */
- if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
- ext4_msg(sb, KERN_ERR, "can't mount with "
- "journal_checksum, fs mounted w/o journal");
- goto failed_mount_wq;
- }
if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"journal_async_commit, fs mounted w/o journal");
- goto failed_mount_wq;
+ goto failed_mount3a;
+ }
+
+ if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) {
+ ext4_msg(sb, KERN_ERR, "can't mount with "
+ "journal_checksum, fs mounted w/o journal");
+ goto failed_mount3a;
}
if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"commit=%lu, fs mounted w/o journal",
sbi->s_commit_interval / HZ);
- goto failed_mount_wq;
+ goto failed_mount3a;
}
if (EXT4_MOUNT_DATA_FLAGS &
(sbi->s_mount_opt ^ sbi->s_def_mount_opt)) {
ext4_msg(sb, KERN_ERR, "can't mount with "
"data=, fs mounted w/o journal");
- goto failed_mount_wq;
+ goto failed_mount3a;
}
sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM;
clear_opt(sb, JOURNAL_CHECKSUM);
@@ -4776,6 +4777,7 @@ failed_mount:
ext4_blkdev_remove(sbi);
brelse(bh);
out_fail:
+ invalidate_bdev(sb->s_bdev);
sb->s_fs_info = NULL;
kfree(sbi->s_blockgroup_lock);
out_free_base:
@@ -4834,7 +4836,7 @@ static struct inode *ext4_get_journal_inode(struct super_block *sb,
jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
journal_inode, journal_inode->i_size);
- if (!S_ISREG(journal_inode->i_mode)) {
+ if (!S_ISREG(journal_inode->i_mode) || IS_ENCRYPTED(journal_inode)) {
ext4_msg(sb, KERN_ERR, "invalid journal inode");
iput(journal_inode);
return NULL;
@@ -4951,7 +4953,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
be32_to_cpu(journal->j_superblock->s_nr_users));
goto out_journal;
}
- EXT4_SB(sb)->journal_bdev = bdev;
+ EXT4_SB(sb)->s_journal_bdev = bdev;
ext4_init_journal_params(sb, journal);
return journal;
@@ -5604,9 +5606,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
}
#ifdef CONFIG_QUOTA
- /* Release old quota file names */
- for (i = 0; i < EXT4_MAXQUOTAS; i++)
- kfree(old_opts.s_qf_names[i]);
if (enable_quota) {
if (sb_any_quota_suspended(sb))
dquot_resume(sb, -1);
@@ -5616,6 +5615,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
goto restore_opts;
}
}
+ /* Release old quota file names */
+ for (i = 0; i < EXT4_MAXQUOTAS; i++)
+ kfree(old_opts.s_qf_names[i]);
#endif
if (!test_opt(sb, BLOCK_VALIDITY) && sbi->system_blks)
ext4_release_system_zone(sb);
@@ -5632,6 +5634,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
return 0;
restore_opts:
+ /*
+ * If there was a failing r/w to ro transition, we may need to
+ * re-enable quota
+ */
+ if ((sb->s_flags & SB_RDONLY) && !(old_sb_flags & SB_RDONLY) &&
+ sb_any_quota_suspended(sb))
+ dquot_resume(sb, -1);
sb->s_flags = old_sb_flags;
sbi->s_mount_opt = old_opts.s_mount_opt;
sbi->s_mount_opt2 = old_opts.s_mount_opt2;
@@ -5836,7 +5845,7 @@ static int ext4_write_info(struct super_block *sb, int type)
handle_t *handle;
/* Data block + inode block */
- handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
@@ -5953,6 +5962,20 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
return err;
}
+static inline bool ext4_check_quota_inum(int type, unsigned long qf_inum)
+{
+ switch (type) {
+ case USRQUOTA:
+ return qf_inum == EXT4_USR_QUOTA_INO;
+ case GRPQUOTA:
+ return qf_inum == EXT4_GRP_QUOTA_INO;
+ case PRJQUOTA:
+ return qf_inum >= EXT4_GOOD_OLD_FIRST_INO;
+ default:
+ BUG();
+ }
+}
+
static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
unsigned int flags)
{
@@ -5969,9 +5992,16 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
if (!qf_inums[type])
return -EPERM;
+ if (!ext4_check_quota_inum(type, qf_inums[type])) {
+ ext4_error(sb, "Bad quota inum: %lu, type: %d",
+ qf_inums[type], type);
+ return -EUCLEAN;
+ }
+
qf_inode = ext4_iget(sb, qf_inums[type], EXT4_IGET_SPECIAL);
if (IS_ERR(qf_inode)) {
- ext4_error(sb, "Bad quota inode # %lu", qf_inums[type]);
+ ext4_error(sb, "Bad quota inode: %lu, type: %d",
+ qf_inums[type], type);
return PTR_ERR(qf_inode);
}
@@ -6010,8 +6040,9 @@ static int ext4_enable_quotas(struct super_block *sb)
if (err) {
ext4_warning(sb,
"Failed to enable quota tracking "
- "(type=%d, err=%d). Please run "
- "e2fsck to fix.", type, err);
+ "(type=%d, err=%d, ino=%lu). "
+ "Please run e2fsck to fix.", type,
+ err, qf_inums[type]);
for (type--; type >= 0; type--) {
struct inode *inode;
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 9394360ff137..7768d7146d73 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -383,6 +383,11 @@ static void ext4_sb_release(struct kobject *kobj)
complete(&sbi->s_kobj_unregister);
}
+static void ext4_feat_release(struct kobject *kobj)
+{
+ kfree(kobj);
+}
+
static const struct sysfs_ops ext4_attr_ops = {
.show = ext4_attr_show,
.store = ext4_attr_store,
@@ -397,7 +402,7 @@ static struct kobj_type ext4_sb_ktype = {
static struct kobj_type ext4_feat_ktype = {
.default_groups = ext4_feat_groups,
.sysfs_ops = &ext4_attr_ops,
- .release = (void (*)(struct kobject *))kfree,
+ .release = ext4_feat_release,
};
static struct kobject *ext4_root;
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 6a30e54c1128..9879ea046e5a 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -79,8 +79,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
- void *fsdata;
- void *addr;
+ void *fsdata = NULL;
int res;
res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
@@ -88,9 +87,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
if (res)
return res;
- addr = kmap_atomic(page);
- memcpy(addr + offset_in_page(pos), buf, n);
- kunmap_atomic(addr);
+ memcpy_to_page(page, offset_in_page(pos), buf, n);
res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
page, fsdata);
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 20e40cac819e..cd371ac25f84 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -121,7 +121,11 @@ ext4_expand_inode_array(struct ext4_xattr_inode_array **ea_inode_array,
#ifdef CONFIG_LOCKDEP
void ext4_xattr_inode_set_class(struct inode *ea_inode)
{
+ struct ext4_inode_info *ei = EXT4_I(ea_inode);
+
lockdep_set_subclass(&ea_inode->i_rwsem, 1);
+ (void) ei; /* shut up clang warning if !CONFIG_LOCKDEP */
+ lockdep_set_subclass(&ei->i_data_sem, I_DATA_SEM_EA);
}
#endif
@@ -384,7 +388,18 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
struct inode *inode;
int err;
- inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_NORMAL);
+ /*
+ * We have to check for this corruption early as otherwise
+ * iget_locked() could wait indefinitely for the state of our
+ * parent inode.
+ */
+ if (parent->i_ino == ea_ino) {
+ ext4_error(parent->i_sb,
+ "Parent and EA inode have the same ino %lu", ea_ino);
+ return -EFSCORRUPTED;
+ }
+
+ inode = ext4_iget(parent->i_sb, ea_ino, EXT4_IGET_EA_INODE);
if (IS_ERR(inode)) {
err = PTR_ERR(inode);
ext4_error(parent->i_sb,
@@ -392,23 +407,6 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
err);
return err;
}
-
- if (is_bad_inode(inode)) {
- ext4_error(parent->i_sb,
- "error while reading EA inode %lu is_bad_inode",
- ea_ino);
- err = -EIO;
- goto error;
- }
-
- if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) {
- ext4_error(parent->i_sb,
- "EA inode %lu does not have EXT4_EA_INODE_FL flag",
- ea_ino);
- err = -EINVAL;
- goto error;
- }
-
ext4_xattr_inode_set_class(inode);
/*
@@ -429,9 +427,21 @@ static int ext4_xattr_inode_iget(struct inode *parent, unsigned long ea_ino,
*ea_inode = inode;
return 0;
-error:
- iput(inode);
- return err;
+}
+
+/* Remove entry from mbcache when EA inode is getting evicted */
+void ext4_evict_ea_inode(struct inode *inode)
+{
+ struct mb_cache_entry *oe;
+
+ if (!EA_INODE_CACHE(inode))
+ return;
+ /* Wait for entry to get unused so that we can remove it */
+ while ((oe = mb_cache_entry_delete_or_get(EA_INODE_CACHE(inode),
+ ext4_xattr_inode_get_hash(inode), inode->i_ino))) {
+ mb_cache_entry_wait_unused(oe);
+ mb_cache_entry_put(EA_INODE_CACHE(inode), oe);
+ }
}
static int
@@ -1019,10 +1029,8 @@ static int ext4_xattr_ensure_credits(handle_t *handle, struct inode *inode,
static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
int ref_change)
{
- struct mb_cache *ea_inode_cache = EA_INODE_CACHE(ea_inode);
struct ext4_iloc iloc;
s64 ref_count;
- u32 hash;
int ret;
inode_lock(ea_inode);
@@ -1045,14 +1053,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
set_nlink(ea_inode, 1);
ext4_orphan_del(handle, ea_inode);
-
- if (ea_inode_cache) {
- hash = ext4_xattr_inode_get_hash(ea_inode);
- mb_cache_entry_create(ea_inode_cache,
- GFP_NOFS, hash,
- ea_inode->i_ino,
- true /* reusable */);
- }
}
} else {
WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld",
@@ -1065,12 +1065,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode,
clear_nlink(ea_inode);
ext4_orphan_add(handle, ea_inode);
-
- if (ea_inode_cache) {
- hash = ext4_xattr_inode_get_hash(ea_inode);
- mb_cache_entry_delete(ea_inode_cache, hash,
- ea_inode->i_ino);
- }
}
}
@@ -1249,6 +1243,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
if (error)
goto out;
+retry_ref:
lock_buffer(bh);
hash = le32_to_cpu(BHDR(bh)->h_hash);
ref = le32_to_cpu(BHDR(bh)->h_refcount);
@@ -1258,9 +1253,18 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
* This must happen under buffer lock for
* ext4_xattr_block_set() to reliably detect freed block
*/
- if (ea_block_cache)
- mb_cache_entry_delete(ea_block_cache, hash,
- bh->b_blocknr);
+ if (ea_block_cache) {
+ struct mb_cache_entry *oe;
+
+ oe = mb_cache_entry_delete_or_get(ea_block_cache, hash,
+ bh->b_blocknr);
+ if (oe) {
+ unlock_buffer(bh);
+ mb_cache_entry_wait_unused(oe);
+ mb_cache_entry_put(ea_block_cache, oe);
+ goto retry_ref;
+ }
+ }
get_bh(bh);
unlock_buffer(bh);
@@ -1284,7 +1288,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
ce = mb_cache_entry_get(ea_block_cache, hash,
bh->b_blocknr);
if (ce) {
- ce->e_reusable = 1;
+ set_bit(MBE_REUSABLE_B, &ce->e_flags);
mb_cache_entry_put(ea_block_cache, ce);
}
}
@@ -1423,6 +1427,13 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
uid_t owner[2] = { i_uid_read(inode), i_gid_read(inode) };
int err;
+ if (inode->i_sb->s_root == NULL) {
+ ext4_warning(inode->i_sb,
+ "refuse to create EA inode when umounting");
+ WARN_ON(1);
+ return ERR_PTR(-EINVAL);
+ }
+
/*
* Let the next inode be the goal, so we try and allocate the EA inode
* in the same group, or nearby one.
@@ -1442,6 +1453,9 @@ static struct inode *ext4_xattr_inode_create(handle_t *handle,
if (!err)
err = ext4_inode_attach_jinode(ea_inode);
if (err) {
+ if (ext4_xattr_inode_dec_ref(handle, ea_inode))
+ ext4_warning_inode(ea_inode,
+ "cleanup dec ref error %d", err);
iput(ea_inode);
return ERR_PTR(err);
}
@@ -1487,11 +1501,11 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
while (ce) {
ea_inode = ext4_iget(inode->i_sb, ce->e_value,
- EXT4_IGET_NORMAL);
- if (!IS_ERR(ea_inode) &&
- !is_bad_inode(ea_inode) &&
- (EXT4_I(ea_inode)->i_flags & EXT4_EA_INODE_FL) &&
- i_size_read(ea_inode) == value_len &&
+ EXT4_IGET_EA_INODE);
+ if (IS_ERR(ea_inode))
+ goto next_entry;
+ ext4_xattr_inode_set_class(ea_inode);
+ if (i_size_read(ea_inode) == value_len &&
!ext4_xattr_inode_read(ea_inode, ea_data, value_len) &&
!ext4_xattr_inode_verify_hashes(ea_inode, NULL, ea_data,
value_len) &&
@@ -1501,9 +1515,8 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value,
kvfree(ea_data);
return ea_inode;
}
-
- if (!IS_ERR(ea_inode))
- iput(ea_inode);
+ iput(ea_inode);
+ next_entry:
ce = mb_cache_entry_find_next(ea_inode_cache, ce);
}
kvfree(ea_data);
@@ -1729,6 +1742,20 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i,
memmove(here, (void *)here + size,
(void *)last - (void *)here + sizeof(__u32));
memset(last, 0, size);
+
+ /*
+ * Update i_inline_off - moved ibody region might contain
+ * system.data attribute. Handling a failure here won't
+ * cause other complications for setting an xattr.
+ */
+ if (!is_block && ext4_has_inline_data(inode)) {
+ ret = ext4_find_inline_data_nolock(inode);
+ if (ret) {
+ ext4_warning_inode(inode,
+ "unable to update i_inline_off");
+ goto out;
+ }
+ }
} else if (s->not_found) {
/* Insert new name. */
size_t size = EXT4_XATTR_LEN(name_len);
@@ -1868,6 +1895,8 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
#define header(x) ((struct ext4_xattr_header *)(x))
if (s->base) {
+ int offset = (char *)s->here - bs->bh->b_data;
+
BUFFER_TRACE(bs->bh, "get_write_access");
error = ext4_journal_get_write_access(handle, bs->bh);
if (error)
@@ -1882,9 +1911,20 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
* ext4_xattr_block_set() to reliably detect modified
* block
*/
- if (ea_block_cache)
- mb_cache_entry_delete(ea_block_cache, hash,
- bs->bh->b_blocknr);
+ if (ea_block_cache) {
+ struct mb_cache_entry *oe;
+
+ oe = mb_cache_entry_delete_or_get(ea_block_cache,
+ hash, bs->bh->b_blocknr);
+ if (oe) {
+ /*
+ * Xattr block is getting reused. Leave
+ * it alone.
+ */
+ mb_cache_entry_put(ea_block_cache, oe);
+ goto clone_block;
+ }
+ }
ea_bdebug(bs->bh, "modifying in-place");
error = ext4_xattr_set_entry(i, s, handle, inode,
true /* is_block */);
@@ -1899,50 +1939,47 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode,
if (error)
goto cleanup;
goto inserted;
- } else {
- int offset = (char *)s->here - bs->bh->b_data;
+ }
+clone_block:
+ unlock_buffer(bs->bh);
+ ea_bdebug(bs->bh, "cloning");
+ s->base = kmemdup(BHDR(bs->bh), bs->bh->b_size, GFP_NOFS);
+ error = -ENOMEM;
+ if (s->base == NULL)
+ goto cleanup;
+ s->first = ENTRY(header(s->base)+1);
+ header(s->base)->h_refcount = cpu_to_le32(1);
+ s->here = ENTRY(s->base + offset);
+ s->end = s->base + bs->bh->b_size;
- unlock_buffer(bs->bh);
- ea_bdebug(bs->bh, "cloning");
- s->base = kmalloc(bs->bh->b_size, GFP_NOFS);
- error = -ENOMEM;
- if (s->base == NULL)
+ /*
+ * If existing entry points to an xattr inode, we need
+ * to prevent ext4_xattr_set_entry() from decrementing
+ * ref count on it because the reference belongs to the
+ * original block. In this case, make the entry look
+ * like it has an empty value.
+ */
+ if (!s->not_found && s->here->e_value_inum) {
+ ea_ino = le32_to_cpu(s->here->e_value_inum);
+ error = ext4_xattr_inode_iget(inode, ea_ino,
+ le32_to_cpu(s->here->e_hash),
+ &tmp_inode);
+ if (error)
goto cleanup;
- memcpy(s->base, BHDR(bs->bh), bs->bh->b_size);
- s->first = ENTRY(header(s->base)+1);
- header(s->base)->h_refcount = cpu_to_le32(1);
- s->here = ENTRY(s->base + offset);
- s->end = s->base + bs->bh->b_size;
-
- /*
- * If existing entry points to an xattr inode, we need
- * to prevent ext4_xattr_set_entry() from decrementing
- * ref count on it because the reference belongs to the
- * original block. In this case, make the entry look
- * like it has an empty value.
- */
- if (!s->not_found && s->here->e_value_inum) {
- ea_ino = le32_to_cpu(s->here->e_value_inum);
- error = ext4_xattr_inode_iget(inode, ea_ino,
- le32_to_cpu(s->here->e_hash),
- &tmp_inode);
- if (error)
- goto cleanup;
- if (!ext4_test_inode_state(tmp_inode,
- EXT4_STATE_LUSTRE_EA_INODE)) {
- /*
- * Defer quota free call for previous
- * inode until success is guaranteed.
- */
- old_ea_inode_quota = le32_to_cpu(
- s->here->e_value_size);
- }
- iput(tmp_inode);
-
- s->here->e_value_inum = 0;
- s->here->e_value_size = 0;
+ if (!ext4_test_inode_state(tmp_inode,
+ EXT4_STATE_LUSTRE_EA_INODE)) {
+ /*
+ * Defer quota free call for previous
+ * inode until success is guaranteed.
+ */
+ old_ea_inode_quota = le32_to_cpu(
+ s->here->e_value_size);
}
+ iput(tmp_inode);
+
+ s->here->e_value_inum = 0;
+ s->here->e_value_size = 0;
}
} else {
/* Allocate a buffer where we construct the new block. */
@@ -1993,8 +2030,9 @@ inserted:
else {
u32 ref;
+#ifdef EXT4_XATTR_DEBUG
WARN_ON_ONCE(dquot_initialize_needed(inode));
-
+#endif
/* The old block is released after updating
the inode. */
error = dquot_alloc_block(inode,
@@ -2009,18 +2047,13 @@ inserted:
lock_buffer(new_bh);
/*
* We have to be careful about races with
- * freeing, rehashing or adding references to
- * xattr block. Once we hold buffer lock xattr
- * block's state is stable so we can check
- * whether the block got freed / rehashed or
- * not. Since we unhash mbcache entry under
- * buffer lock when freeing / rehashing xattr
- * block, checking whether entry is still
- * hashed is reliable. Same rules hold for
- * e_reusable handling.
+ * adding references to xattr block. Once we
+ * hold buffer lock xattr block's state is
+ * stable so we can check the additional
+ * reference fits.
*/
- if (hlist_bl_unhashed(&ce->e_hash_list) ||
- !ce->e_reusable) {
+ ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
+ if (ref > EXT4_XATTR_REFCOUNT_MAX) {
/*
* Undo everything and check mbcache
* again.
@@ -2035,10 +2068,9 @@ inserted:
new_bh = NULL;
goto inserted;
}
- ref = le32_to_cpu(BHDR(new_bh)->h_refcount) + 1;
BHDR(new_bh)->h_refcount = cpu_to_le32(ref);
- if (ref >= EXT4_XATTR_REFCOUNT_MAX)
- ce->e_reusable = 0;
+ if (ref == EXT4_XATTR_REFCOUNT_MAX)
+ clear_bit(MBE_REUSABLE_B, &ce->e_flags);
ea_bdebug(new_bh, "reusing; refcount now=%d",
ref);
ext4_xattr_block_csum_set(inode, new_bh);
@@ -2062,23 +2094,16 @@ inserted:
/* We need to allocate a new block */
ext4_fsblk_t goal, block;
+#ifdef EXT4_XATTR_DEBUG
WARN_ON_ONCE(dquot_initialize_needed(inode));
-
+#endif
goal = ext4_group_first_block_no(sb,
EXT4_I(inode)->i_block_group);
-
- /* non-extent files can't have physical blocks past 2^32 */
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
-
block = ext4_new_meta_blocks(handle, inode, goal, 0,
NULL, &error);
if (error)
goto cleanup;
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
-
ea_idebug(inode, "creating block %llu",
(unsigned long long)block);
@@ -2184,8 +2209,9 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
struct ext4_inode *raw_inode;
int error;
- if (EXT4_I(inode)->i_extra_isize == 0)
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
return 0;
+
raw_inode = ext4_raw_inode(&is->iloc);
header = IHDR(inode, raw_inode);
is->s.base = is->s.first = IFIRST(header);
@@ -2205,7 +2231,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
return 0;
}
-int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
+int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_info *i,
struct ext4_xattr_ibody_find *is)
{
@@ -2213,32 +2239,9 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
struct ext4_xattr_search *s = &is->s;
int error;
- if (EXT4_I(inode)->i_extra_isize == 0)
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
return -ENOSPC;
- error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
- if (error)
- return error;
- header = IHDR(inode, ext4_raw_inode(&is->iloc));
- if (!IS_LAST_ENTRY(s->first)) {
- header->h_magic = cpu_to_le32(EXT4_XATTR_MAGIC);
- ext4_set_inode_state(inode, EXT4_STATE_XATTR);
- } else {
- header->h_magic = cpu_to_le32(0);
- ext4_clear_inode_state(inode, EXT4_STATE_XATTR);
- }
- return 0;
-}
-static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
- struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is)
-{
- struct ext4_xattr_ibody_header *header;
- struct ext4_xattr_search *s = &is->s;
- int error;
-
- if (EXT4_I(inode)->i_extra_isize == 0)
- return -ENOSPC;
error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
if (error)
return error;
@@ -2565,13 +2568,13 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
.in_inode = !!entry->e_value_inum,
};
struct ext4_xattr_ibody_header *header = IHDR(inode, raw_inode);
+ int needs_kvfree = 0;
int error;
is = kzalloc(sizeof(struct ext4_xattr_ibody_find), GFP_NOFS);
bs = kzalloc(sizeof(struct ext4_xattr_block_find), GFP_NOFS);
- buffer = kmalloc(value_size, GFP_NOFS);
b_entry_name = kmalloc(entry->e_name_len + 1, GFP_NOFS);
- if (!is || !bs || !buffer || !b_entry_name) {
+ if (!is || !bs || !b_entry_name) {
error = -ENOMEM;
goto out;
}
@@ -2583,12 +2586,18 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
/* Save the entry name and the entry value */
if (entry->e_value_inum) {
+ buffer = kvmalloc(value_size, GFP_NOFS);
+ if (!buffer) {
+ error = -ENOMEM;
+ goto out;
+ }
+ needs_kvfree = 1;
error = ext4_xattr_inode_get(inode, entry, buffer, value_size);
if (error)
goto out;
} else {
size_t value_offs = le16_to_cpu(entry->e_value_offs);
- memcpy(buffer, (void *)IFIRST(header) + value_offs, value_size);
+ buffer = (void *)IFIRST(header) + value_offs;
}
memcpy(b_entry_name, entry->e_name, entry->e_name_len);
@@ -2603,25 +2612,26 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode,
if (error)
goto out;
- /* Remove the chosen entry from the inode */
- error = ext4_xattr_ibody_set(handle, inode, &i, is);
- if (error)
- goto out;
-
i.value = buffer;
i.value_len = value_size;
error = ext4_xattr_block_find(inode, &i, bs);
if (error)
goto out;
- /* Add entry which was removed from the inode into the block */
+ /* Move ea entry from the inode into the block */
error = ext4_xattr_block_set(handle, inode, &i, bs);
if (error)
goto out;
- error = 0;
+
+ /* Remove the chosen entry from the inode */
+ i.value = NULL;
+ i.value_len = 0;
+ error = ext4_xattr_ibody_set(handle, inode, &i, is);
+
out:
kfree(b_entry_name);
- kfree(buffer);
+ if (needs_kvfree && buffer)
+ kvfree(buffer);
if (is)
brelse(is->iloc.bh);
if (bs)
@@ -2796,6 +2806,9 @@ shift:
(void *)header, total_ino);
EXT4_I(inode)->i_extra_isize = new_extra_isize;
+ if (ext4_has_inline_data(inode))
+ error = ext4_find_inline_data_nolock(inode);
+
cleanup:
if (error && (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count))) {
ext4_warning(inode->i_sb, "Unable to expand inode %lu. Delete some EAs or run e2fsck.",
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index f39cad2abe2a..66911f8a11f8 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -95,6 +95,19 @@ struct ext4_xattr_entry {
#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
+/*
+ * If we want to add an xattr to the inode, we should make sure that
+ * i_extra_isize is not 0 and that the inode size is not less than
+ * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad.
+ * EXT4_GOOD_OLD_INODE_SIZE extra_isize header entry pad data
+ * |--------------------------|------------|------|---------|---|-------|
+ */
+#define EXT4_INODE_HAS_XATTR_SPACE(inode) \
+ ((EXT4_I(inode)->i_extra_isize != 0) && \
+ (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize + \
+ sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <= \
+ EXT4_INODE_SIZE((inode)->i_sb)))
+
struct ext4_xattr_info {
const char *name;
const void *value;
@@ -177,6 +190,7 @@ extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array);
extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize,
struct ext4_inode *raw_inode, handle_t *handle);
+extern void ext4_evict_ea_inode(struct inode *inode);
extern const struct xattr_handler *ext4_xattr_handlers[];
@@ -185,9 +199,9 @@ extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i,
extern int ext4_xattr_ibody_get(struct inode *inode, int name_index,
const char *name,
void *buffer, size_t buffer_size);
-extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode,
- struct ext4_xattr_info *i,
- struct ext4_xattr_ibody_find *is);
+extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode,
+ struct ext4_xattr_info *i,
+ struct ext4_xattr_ibody_find *is);
extern struct mb_cache *ext4_xattr_create_cache(void);
extern void ext4_xattr_destroy_cache(struct mb_cache *);
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 44c5110e18f0..84e98dacd452 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -137,7 +137,7 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
unsigned int segno, offset;
bool exist;
- if (type != DATA_GENERIC_ENHANCE && type != DATA_GENERIC_ENHANCE_READ)
+ if (type == DATA_GENERIC)
return true;
segno = GET_SEGNO(sbi, blkaddr);
@@ -145,6 +145,13 @@ static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr,
se = get_seg_entry(sbi, segno);
exist = f2fs_test_bit(offset, se->cur_valid_map);
+ if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) {
+ f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
+ blkaddr, exist);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ return exist;
+ }
+
if (!exist && type == DATA_GENERIC_ENHANCE) {
f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
blkaddr, exist);
@@ -182,6 +189,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
case DATA_GENERIC:
case DATA_GENERIC_ENHANCE:
case DATA_GENERIC_ENHANCE_READ:
+ case DATA_GENERIC_ENHANCE_UPDATE:
if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
blkaddr < MAIN_BLKADDR(sbi))) {
f2fs_warn(sbi, "access invalid blkaddr:%u",
@@ -298,8 +306,15 @@ static int __f2fs_write_meta_page(struct page *page,
trace_f2fs_writepage(page, META);
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi))) {
+ if (is_sbi_flag_set(sbi, SBI_IS_CLOSE)) {
+ ClearPageUptodate(page);
+ dec_page_count(sbi, F2FS_DIRTY_META);
+ unlock_page(page);
+ return 0;
+ }
goto redirty_out;
+ }
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
goto redirty_out;
if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0))
@@ -1269,7 +1284,8 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
if (!get_pages(sbi, type))
break;
- if (unlikely(f2fs_cp_error(sbi)))
+ if (unlikely(f2fs_cp_error(sbi) &&
+ !is_sbi_flag_set(sbi, SBI_IS_CLOSE)))
break;
io_schedule_timeout(HZ/50);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 773028921c48..8f78050c935d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -496,7 +496,7 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio)
}
if (fio->io_wbc && !is_read_io(fio->op))
- wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
bio_set_op_attrs(bio, fio->op, fio->op_flags);
@@ -575,7 +575,7 @@ alloc_new:
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
inc_page_count(fio->sbi, WB_DATA_TYPE(page));
@@ -652,7 +652,7 @@ alloc_new:
}
if (fio->io_wbc)
- wbc_account_cgroup_owner(fio->io_wbc, bio_page, PAGE_SIZE);
+ wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
io->last_block_in_bio = fio->new_blkaddr;
f2fs_trace_ios(fio, 0);
@@ -2130,7 +2130,8 @@ static int __write_data_page(struct page *page, bool *submitted,
* don't drop any dirty dentry pages for keeping lastest
* directory structure.
*/
- if (S_ISDIR(inode->i_mode))
+ if (S_ISDIR(inode->i_mode) &&
+ !is_sbi_flag_set(sbi, SBI_IS_CLOSE))
goto redirty_out;
goto out;
}
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index e60078460ad1..dd53c9294ad9 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -381,7 +381,8 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
struct extent_node *en;
bool ret = false;
- f2fs_bug_on(sbi, !et);
+ if (!et)
+ return false;
trace_f2fs_lookup_extent_tree_start(inode, pgofs);
@@ -729,9 +730,8 @@ void f2fs_drop_extent_tree(struct inode *inode)
if (!f2fs_may_extent_tree(inode))
return;
- set_inode_flag(inode, FI_NO_EXTENT);
-
write_lock(&et->lock);
+ set_inode_flag(inode, FI_NO_EXTENT);
__free_extent_tree(sbi, et);
if (et->largest.len) {
et->largest.len = 0;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index c73a1638c18b..f75d25682734 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -225,6 +225,10 @@ enum {
* condition of read on truncated area
* by extent_cache
*/
+ DATA_GENERIC_ENHANCE_UPDATE, /*
+ * strong check on range and segment
+ * bitmap for update case
+ */
META_GENERIC,
};
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index ef08ef017030..2330600dbe02 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -2821,15 +2821,16 @@ int f2fs_transfer_project_quota(struct inode *inode, kprojid_t kprojid)
struct dquot *transfer_to[MAXQUOTAS] = {};
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct super_block *sb = sbi->sb;
- int err = 0;
+ int err;
transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
- if (!IS_ERR(transfer_to[PRJQUOTA])) {
- err = __dquot_transfer(inode, transfer_to);
- if (err)
- set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
- dqput(transfer_to[PRJQUOTA]);
- }
+ if (IS_ERR(transfer_to[PRJQUOTA]))
+ return PTR_ERR(transfer_to[PRJQUOTA]);
+
+ err = __dquot_transfer(inode, transfer_to);
+ if (err)
+ set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR);
+ dqput(transfer_to[PRJQUOTA]);
return err;
}
@@ -3116,6 +3117,7 @@ int f2fs_precache_extents(struct inode *inode)
return -EOPNOTSUPP;
map.m_lblk = 0;
+ map.m_pblk = 0;
map.m_next_pgofs = NULL;
map.m_next_extent = &m_next_extent;
map.m_seg_type = NO_CHECK_TYPE;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 68d5c73c5ed1..d26b3fe38bd9 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -612,7 +612,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
{
struct page *node_page;
nid_t nid;
- unsigned int ofs_in_node;
+ unsigned int ofs_in_node, max_addrs, base;
block_t source_blkaddr;
nid = le32_to_cpu(sum->nid);
@@ -638,6 +638,21 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
return false;
}
+ if (IS_INODE(node_page)) {
+ base = offset_in_addr(F2FS_INODE(node_page));
+ max_addrs = DEF_ADDRS_PER_INODE;
+ } else {
+ base = 0;
+ max_addrs = DEF_ADDRS_PER_BLOCK;
+ }
+
+ if (base + ofs_in_node >= max_addrs) {
+ f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u",
+ base, ofs_in_node, max_addrs, dni->ino, dni->nid);
+ f2fs_put_page(node_page, 1);
+ return false;
+ }
+
*nofs = ofs_of_node(node_page);
source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
f2fs_put_page(node_page, 1);
@@ -1054,7 +1069,8 @@ next_step:
if (phase == 3) {
inode = f2fs_iget(sb, dni.ino);
- if (IS_ERR(inode) || is_bad_inode(inode))
+ if (IS_ERR(inode) || is_bad_inode(inode) ||
+ special_file(inode->i_mode))
continue;
if (!down_write_trylock(
@@ -1245,8 +1261,9 @@ freed:
seg_freed++;
migrated++;
- if (__is_large_section(sbi) && segno + 1 < end_segno)
- sbi->next_victim_seg[gc_type] = segno + 1;
+ if (__is_large_section(sbi))
+ sbi->next_victim_seg[gc_type] =
+ (segno + 1 < end_segno) ? segno + 1 : NULL_SEGNO;
skip:
f2fs_put_page(sum_page, 0);
}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index c6bd669f4b4e..a700df05cd18 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -43,7 +43,6 @@ bool f2fs_may_inline_dentry(struct inode *inode)
void f2fs_do_read_inline_data(struct page *page, struct page *ipage)
{
struct inode *inode = page->mapping->host;
- void *src_addr, *dst_addr;
if (PageUptodate(page))
return;
@@ -53,11 +52,8 @@ void f2fs_do_read_inline_data(struct page *page, struct page *ipage)
zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE);
/* Copy the whole inline data block */
- src_addr = inline_data_addr(inode, ipage);
- dst_addr = kmap_atomic(page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
- flush_dcache_page(page);
- kunmap_atomic(dst_addr);
+ memcpy_to_page(page, 0, inline_data_addr(inode, ipage),
+ MAX_INLINE_DATA(inode));
if (!PageUptodate(page))
SetPageUptodate(page);
}
@@ -224,7 +220,6 @@ out:
int f2fs_write_inline_data(struct inode *inode, struct page *page)
{
- void *src_addr, *dst_addr;
struct dnode_of_data dn;
int err;
@@ -241,10 +236,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
f2fs_bug_on(F2FS_I_SB(inode), page->index);
f2fs_wait_on_page_writeback(dn.inode_page, NODE, true, true);
- src_addr = kmap_atomic(page);
- dst_addr = inline_data_addr(inode, dn.inode_page);
- memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode));
- kunmap_atomic(src_addr);
+ memcpy_from_page(inline_data_addr(inode, dn.inode_page),
+ page, 0, MAX_INLINE_DATA(inode));
set_page_dirty(dn.inode_page);
f2fs_clear_page_cache_dirty_tag(page);
@@ -402,18 +395,17 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage,
dentry_blk = page_address(page);
+ /*
+ * Start by zeroing the full block, to ensure that all unused space is
+ * zeroed and no uninitialized memory is leaked to disk.
+ */
+ memset(dentry_blk, 0, F2FS_BLKSIZE);
+
make_dentry_ptr_inline(dir, &src, inline_dentry);
make_dentry_ptr_block(dir, &dst, dentry_blk);
/* copy data from inline dentry block to new dentry block */
memcpy(dst.bitmap, src.bitmap, src.nr_bitmap);
- memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap);
- /*
- * we do not need to zero out remainder part of dentry and filename
- * field, since we have used bitmap for marking the usage status of
- * them, besides, we can also ignore copying/zeroing reserved space
- * of dentry block, because them haven't been used so far.
- */
memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max);
memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index ed95c27e9302..99a91c746b39 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -1009,7 +1009,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
}
if (old_dir_entry) {
- if (old_dir != new_dir && !whiteout)
+ if (old_dir != new_dir)
f2fs_set_link(old_inode, old_dir_entry,
old_dir_page, new_dir);
else
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 3dc7cc3d6ac6..8256a2dedae8 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -889,8 +889,10 @@ static int truncate_dnode(struct dnode_of_data *dn)
dn->ofs_in_node = 0;
f2fs_truncate_data_blocks(dn);
err = truncate_node(dn);
- if (err)
+ if (err) {
+ f2fs_put_page(page, 1);
return err;
+ }
return 1;
}
@@ -1240,7 +1242,11 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs)
dec_valid_node_count(sbi, dn->inode, !ofs);
goto fail;
}
- f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR);
+ if (unlikely(new_ni.blk_addr != NULL_ADDR)) {
+ err = -EFSCORRUPTED;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ goto fail;
+ }
#endif
new_ni.nid = dn->nid;
new_ni.ino = dn->inode->i_ino;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 5f230e981c48..7e30326b296c 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -406,7 +406,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
struct dnode_of_data tdn = *dn;
nid_t ino, nid;
struct inode *inode;
- unsigned int offset;
+ unsigned int offset, ofs_in_node, max_addrs;
block_t bidx;
int i;
@@ -432,15 +432,24 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
got_it:
/* Use the locked dnode page and inode */
nid = le32_to_cpu(sum.nid);
+ ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+
+ max_addrs = ADDRS_PER_PAGE(dn->node_page, dn->inode);
+ if (ofs_in_node >= max_addrs) {
+ f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u",
+ ofs_in_node, dn->inode->i_ino, nid, max_addrs);
+ return -EFSCORRUPTED;
+ }
+
if (dn->inode->i_ino == nid) {
tdn.nid = nid;
if (!dn->inode_page_locked)
lock_page(dn->inode_page);
tdn.node_page = dn->inode_page;
- tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ tdn.ofs_in_node = ofs_in_node;
goto truncate_out;
} else if (dn->nid == nid) {
- tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ tdn.ofs_in_node = ofs_in_node;
goto truncate_out;
}
@@ -602,7 +611,16 @@ retry_dn:
*/
if (dest == NEW_ADDR) {
f2fs_truncate_data_blocks_range(&dn, 1);
- f2fs_reserve_new_block(&dn);
+ do {
+ err = f2fs_reserve_new_block(&dn);
+ if (err == -ENOSPC) {
+ f2fs_bug_on(sbi, 1);
+ break;
+ }
+ } while (err &&
+ IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
+ if (err)
+ goto err;
continue;
}
@@ -610,12 +628,14 @@ retry_dn:
if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
if (src == NULL_ADDR) {
- err = f2fs_reserve_new_block(&dn);
- while (err &&
- IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION))
+ do {
err = f2fs_reserve_new_block(&dn);
- /* We should not get -ENOSPC */
- f2fs_bug_on(sbi, err);
+ if (err == -ENOSPC) {
+ f2fs_bug_on(sbi, 1);
+ break;
+ }
+ } while (err &&
+ IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
if (err)
goto err;
}
@@ -630,6 +650,14 @@ retry_prev:
goto err;
}
+ if (f2fs_is_valid_blkaddr(sbi, dest,
+ DATA_GENERIC_ENHANCE_UPDATE)) {
+ f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u",
+ dest, inode->i_ino, dn.ofs_in_node);
+ err = -EFSCORRUPTED;
+ goto err;
+ }
+
/* write dummy data page */
f2fs_replace_block(sbi, &dn, src, dest,
ni.version, false, false);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 7759323bd775..e43b57755a7f 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1486,7 +1486,7 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
if (i + 1 < dpolicy->granularity)
break;
- if (i < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered)
+ if (i + 1 < DEFAULT_DISCARD_GRANULARITY && dpolicy->ordered)
return __issue_discard_cmd_orderly(sbi, dpolicy);
pend_list = &dcc->pend_list[i];
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 232c99e4a1ee..401bc0e7f6eb 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -255,10 +255,10 @@ static int f2fs_sb_read_encoding(const struct f2fs_super_block *sb,
static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
{
- block_t limit = min((sbi->user_block_count << 1) / 1000,
+ block_t limit = min((sbi->user_block_count >> 3),
sbi->user_block_count - sbi->reserved_blocks);
- /* limit is 0.2% */
+ /* limit is 12.5% */
if (test_opt(sbi, RESERVE_ROOT) &&
F2FS_OPTION(sbi).root_reserved_blocks > limit) {
F2FS_OPTION(sbi).root_reserved_blocks = limit;
@@ -1824,7 +1824,6 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data,
size_t toread;
loff_t i_size = i_size_read(inode);
struct page *page;
- char *kaddr;
if (off > i_size)
return 0;
@@ -1857,9 +1856,7 @@ repeat:
return -EIO;
}
- kaddr = kmap_atomic(page);
- memcpy(data, kaddr + offset, tocopy);
- kunmap_atomic(kaddr);
+ memcpy_from_page(data, page, offset, tocopy);
f2fs_put_page(page, 1);
offset = 0;
@@ -1881,7 +1878,6 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
size_t towrite = len;
struct page *page;
void *fsdata = NULL;
- char *kaddr;
int err = 0;
int tocopy;
@@ -1900,10 +1896,7 @@ retry:
break;
}
- kaddr = kmap_atomic(page);
- memcpy(kaddr + offset, data, tocopy);
- kunmap_atomic(kaddr);
- flush_dcache_page(page);
+ memcpy_to_page(page, offset, data, tocopy);
a_ops->write_end(NULL, mapping, off, tocopy, tocopy,
page, fsdata);
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 7944a08a3977..c4bd5a154252 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -45,16 +45,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count,
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
- void *addr;
page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT,
NULL);
if (IS_ERR(page))
return PTR_ERR(page);
- addr = kmap_atomic(page);
- memcpy(buf, addr + offset_in_page(pos), n);
- kunmap_atomic(addr);
+ memcpy_from_page(buf, page, offset_in_page(pos), n);
put_page(page);
@@ -79,8 +76,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
size_t n = min_t(size_t, count,
PAGE_SIZE - offset_in_page(pos));
struct page *page;
- void *fsdata;
- void *addr;
+ void *fsdata = NULL;
int res;
res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0,
@@ -88,9 +84,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count,
if (res)
return res;
- addr = kmap_atomic(page);
- memcpy(addr + offset_in_page(pos), buf, n);
- kunmap_atomic(addr);
+ memcpy_to_page(page, offset_in_page(pos), buf, n);
res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n,
page, fsdata);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index cf1bfed1e662..cef2825ff069 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -722,6 +722,12 @@ static int __f2fs_setxattr(struct inode *inode, int index,
memcpy(pval, value, size);
last->e_value_size = cpu_to_le16(size);
new_hsize += newsize;
+ /*
+ * Explicitly add the null terminator. The unused xattr space
+ * is supposed to always be zeroed, which would make this
+ * unnecessary, but don't depend on that.
+ */
+ *(u32 *)((u8 *)last + newsize) = 0;
}
error = write_all_xattrs(inode, new_hsize, base_addr, ipage);
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 549d91f5245c..16795955acad 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -1303,7 +1303,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
struct super_block *sb = dir->i_sb;
struct msdos_sb_info *sbi = MSDOS_SB(sb);
struct buffer_head *bh, *prev, *bhs[3]; /* 32*slots (672bytes) */
- struct msdos_dir_entry *uninitialized_var(de);
+ struct msdos_dir_entry *de;
int err, free_slots, i, nr_bhs;
loff_t pos, i_pos;
diff --git a/fs/fat/nfs.c b/fs/fat/nfs.c
index af191371c352..bab63eeaf9cb 100644
--- a/fs/fat/nfs.c
+++ b/fs/fat/nfs.c
@@ -130,6 +130,12 @@ fat_encode_fh_nostale(struct inode *inode, __u32 *fh, int *lenp,
fid->parent_i_gen = parent->i_generation;
type = FILEID_FAT_WITH_PARENT;
*lenp = FAT_FID_SIZE_WITH_PARENT;
+ } else {
+ /*
+ * We need to initialize this field because the fh is actually
+ * 12 bytes long
+ */
+ fid->parent_i_pos_hi = 0;
}
return type;
diff --git a/fs/fhandle.c b/fs/fhandle.c
index 01263ffbc4c0..9a5f153c8919 100644
--- a/fs/fhandle.c
+++ b/fs/fhandle.c
@@ -37,7 +37,7 @@ static long do_sys_name_to_handle(struct path *path,
if (f_handle.handle_bytes > MAX_HANDLE_SZ)
return -EINVAL;
- handle = kmalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
+ handle = kzalloc(sizeof(struct file_handle) + f_handle.handle_bytes,
GFP_KERNEL);
if (!handle)
return -ENOMEM;
diff --git a/fs/file.c b/fs/file.c
index 51f53a7dc221..e56059fa1b30 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -654,6 +654,7 @@ int __close_fd_get_file(unsigned int fd, struct file **res)
fdt = files_fdtable(files);
if (fd >= fdt->max_fds)
goto out_unlock;
+ fd = array_index_nospec(fd, fdt->max_fds);
file = fdt->fd[fd];
if (!file)
goto out_unlock;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5b3a288e0f14..3ec2c16abece 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -700,7 +700,7 @@ void wbc_detach_inode(struct writeback_control *wbc)
* is okay. The main goal is avoiding keeping an inode on
* the wrong wb for an extended period of time.
*/
- if (hweight32(history) > WB_FRN_HIST_THR_SLOTS)
+ if (hweight16(history) > WB_FRN_HIST_THR_SLOTS)
inode_switch_wbs(inode, max_id);
}
diff --git a/fs/fs_context.c b/fs/fs_context.c
index e492a83fa100..412712eb59ee 100644
--- a/fs/fs_context.c
+++ b/fs/fs_context.c
@@ -598,7 +598,8 @@ static int legacy_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -ENOMEM;
}
- ctx->legacy_data[size++] = ',';
+ if (size)
+ ctx->legacy_data[size++] = ',';
len = strlen(param->key);
memcpy(ctx->legacy_data + size, param->key, len);
size += len;
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index c23f6f243ad4..7092958c43ed 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -120,7 +120,7 @@ static ssize_t fuse_conn_max_background_write(struct file *file,
const char __user *buf,
size_t count, loff_t *ppos)
{
- unsigned uninitialized_var(val);
+ unsigned val;
ssize_t ret;
ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
@@ -162,7 +162,7 @@ static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
const char __user *buf,
size_t count, loff_t *ppos)
{
- unsigned uninitialized_var(val);
+ unsigned val;
struct fuse_conn *fc;
ssize_t ret;
@@ -265,7 +265,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
struct dentry *parent;
char name[32];
- if (!fuse_control_sb)
+ if (!fuse_control_sb || fc->no_control)
return 0;
parent = fuse_control_sb->s_root;
@@ -303,7 +303,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc)
{
int i;
- if (!fuse_control_sb)
+ if (!fuse_control_sb || fc->no_control)
return;
for (i = fc->ctl_ndents - 1; i >= 0; i--) {
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index e51b7019e887..072b68fbf8f4 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -270,7 +270,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
{
char *end = p + len;
- char *uninitialized_var(key), *uninitialized_var(val);
+ char *key, *val;
int rc;
while (true) {
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 34487bf1d791..b2f37809fa9b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -246,7 +246,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
spin_unlock(&fi->lock);
}
kfree(forget);
- if (ret == -ENOMEM)
+ if (ret == -ENOMEM || ret == -EINTR)
goto out;
if (ret || fuse_invalid_attr(&outarg.attr) ||
(outarg.attr.mode ^ inode->i_mode) & S_IFMT)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index efb2a4871291..f6c499dbfd06 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -2994,7 +2994,7 @@ static void fuse_register_polled_file(struct fuse_conn *fc,
{
spin_lock(&fc->lock);
if (RB_EMPTY_NODE(&ff->polled_node)) {
- struct rb_node **link, *uninitialized_var(parent);
+ struct rb_node **link, *parent;
link = fuse_find_polled_node(fc, ff->kh, &parent);
BUG_ON(*link);
@@ -3212,24 +3212,19 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
.mode = mode
};
int err;
- bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
- (mode & FALLOC_FL_PUNCH_HOLE);
-
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
if (fc->no_fallocate)
return -EOPNOTSUPP;
- if (lock_inode) {
- inode_lock(inode);
- if (mode & FALLOC_FL_PUNCH_HOLE) {
- loff_t endbyte = offset + length - 1;
+ inode_lock(inode);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ loff_t endbyte = offset + length - 1;
- err = fuse_writeback_range(inode, offset, endbyte);
- if (err)
- goto out;
- }
+ err = fuse_writeback_range(inode, offset, endbyte);
+ if (err)
+ goto out;
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
@@ -3239,6 +3234,10 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
goto out;
}
+ err = file_modified(file);
+ if (err)
+ goto out;
+
if (!(mode & FALLOC_FL_KEEP_SIZE))
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
@@ -3272,8 +3271,7 @@ out:
if (!(mode & FALLOC_FL_KEEP_SIZE))
clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
- if (lock_inode)
- inode_unlock(inode);
+ inode_unlock(inode);
return err;
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 83c2855bc740..e3b3dc6861c5 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -792,7 +792,6 @@ static inline u64 fuse_get_attr_version(struct fuse_conn *fc)
static inline void fuse_make_bad(struct inode *inode)
{
- remove_inode_hash(inode);
set_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state);
}
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index aa1d5cf1bc3a..287e850fbd64 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -162,6 +162,12 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
inode->i_uid = make_kuid(fc->user_ns, attr->uid);
inode->i_gid = make_kgid(fc->user_ns, attr->gid);
inode->i_blocks = attr->blocks;
+
+ /* Sanitize nsecs */
+ attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1);
+ attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1);
+ attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1);
+
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
/* mtime from server may be stale due to local buffered write */
@@ -307,8 +313,11 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
/* Inode has changed type, any I/O on the old should fail */
fuse_make_bad(inode);
- iput(inode);
- goto retry;
+ if (inode != d_inode(sb->s_root)) {
+ remove_inode_hash(inode);
+ iput(inode);
+ goto retry;
+ }
}
fi = get_fuse_inode(inode);
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 70f685b61e3a..512609da8590 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -238,8 +238,16 @@ retry:
dput(dentry);
dentry = alias;
}
- if (IS_ERR(dentry))
+ if (IS_ERR(dentry)) {
+ if (!IS_ERR(inode)) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fi->lock);
+ fi->nlookup--;
+ spin_unlock(&fi->lock);
+ }
return PTR_ERR(dentry);
+ }
}
if (fc->readdirplus_auto)
set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index b9fe975d7625..b1ce0061f127 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -156,7 +156,6 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w
{
struct inode *inode = page->mapping->host;
struct gfs2_inode *ip = GFS2_I(inode);
- struct gfs2_sbd *sdp = GFS2_SB(inode);
if (PageChecked(page)) {
ClearPageChecked(page);
@@ -164,7 +163,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w
create_empty_buffers(page, inode->i_sb->s_blocksize,
BIT(BH_Dirty)|BIT(BH_Uptodate));
}
- gfs2_page_add_databufs(ip, page, 0, sdp->sd_vfs->s_blocksize);
+ gfs2_page_add_databufs(ip, page, 0, PAGE_SIZE);
}
return gfs2_write_full_page(page, gfs2_get_block_noalloc, wbc);
}
@@ -337,7 +336,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping,
int done = 0;
struct pagevec pvec;
int nr_pages;
- pgoff_t uninitialized_var(writeback_index);
+ pgoff_t writeback_index;
pgoff_t index;
pgoff_t end;
pgoff_t done_index;
@@ -457,8 +456,6 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page)
return error;
kaddr = kmap_atomic(page);
- if (dsize > gfs2_max_stuffed_size(ip))
- dsize = gfs2_max_stuffed_size(ip);
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
kunmap_atomic(kaddr);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 77a497a4b236..63e925aa12a7 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -70,9 +70,6 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
void *kaddr = kmap(page);
u64 dsize = i_size_read(inode);
- if (dsize > gfs2_max_stuffed_size(ip))
- dsize = gfs2_max_stuffed_size(ip);
-
memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
kunmap(page);
@@ -1766,7 +1763,7 @@ static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
__u16 start_list[GFS2_MAX_META_HEIGHT];
__u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
- unsigned int start_aligned, uninitialized_var(end_aligned);
+ unsigned int start_aligned, end_aligned;
unsigned int strip_h = ip->i_height - 1;
u32 btotal = 0;
int ret, state;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index d5b9274662db..092223a8b120 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -362,6 +362,7 @@ static int inode_go_demote_ok(const struct gfs2_glock *gl)
static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
{
+ struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
const struct gfs2_dinode *str = buf;
struct timespec64 atime;
u16 height, depth;
@@ -401,7 +402,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
/* i_diskflags and i_eattr must be set before gfs2_set_inode_flags() */
gfs2_set_inode_flags(&ip->i_inode);
height = be16_to_cpu(str->di_height);
- if (unlikely(height > GFS2_MAX_META_HEIGHT))
+ if (unlikely(height > sdp->sd_max_height))
goto corrupt;
ip->i_height = (u8)height;
@@ -411,6 +412,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
ip->i_depth = (u8)depth;
ip->i_entries = be32_to_cpu(str->di_entries);
+ if (gfs2_is_stuffed(ip) && ip->i_inode.i_size > gfs2_max_stuffed_size(ip))
+ goto corrupt;
+
if (S_ISREG(ip->i_inode.i_mode))
gfs2_set_aops(&ip->i_inode);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index d2ed4dc4434c..4aaaf3a4203c 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -414,7 +414,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd,
struct page *page)
{
struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
- struct gfs2_log_header_host uninitialized_var(lh);
+ struct gfs2_log_header_host lh;
void *kaddr = kmap_atomic(page);
unsigned int offset;
bool ret = false;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 29b27d769860..2112ff7a0172 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -180,7 +180,10 @@ static int gfs2_check_sb(struct gfs2_sbd *sdp, int silent)
pr_warn("Invalid superblock size\n");
return -EINVAL;
}
-
+ if (sb->sb_bsize_shift != ffs(sb->sb_bsize) - 1) {
+ pr_warn("Invalid block size shift\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -377,8 +380,10 @@ static int init_names(struct gfs2_sbd *sdp, int silent)
if (!table[0])
table = sdp->sd_vfs->s_id;
- strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN);
- strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN);
+ BUILD_BUG_ON(GFS2_LOCKNAME_LEN > GFS2_FSNAME_LEN);
+
+ strscpy(sdp->sd_proto_name, proto, GFS2_LOCKNAME_LEN);
+ strscpy(sdp->sd_table_name, table, GFS2_LOCKNAME_LEN);
table = sdp->sd_table_name;
while ((table = strchr(table, '/')))
@@ -1349,13 +1354,13 @@ static int gfs2_parse_param(struct fs_context *fc, struct fs_parameter *param)
switch (o) {
case Opt_lockproto:
- strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN);
+ strscpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN);
break;
case Opt_locktable:
- strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN);
+ strscpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN);
break;
case Opt_hostdata:
- strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN);
+ strscpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN);
break;
case Opt_spectator:
args->ar_spectator = 1;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index cbee745169b8..ce3d65787e01 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -431,6 +431,17 @@ static int qd_check_sync(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd,
(sync_gen && (qd->qd_sync_gen >= *sync_gen)))
return 0;
+ /*
+ * If qd_change is 0 it means a pending quota change was negated.
+ * We should not sync it, but we still have a qd reference and slot
+ * reference taken by gfs2_quota_change -> do_qc that need to be put.
+ */
+ if (!qd->qd_change && test_and_clear_bit(QDF_CHANGE, &qd->qd_flags)) {
+ slot_put(qd);
+ qd_put(qd);
+ return 0;
+ }
+
if (!lockref_get_not_dead(&qd->qd_lockref))
return 0;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 8153a3eac540..f0a135a48cc3 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -2275,7 +2275,7 @@ void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_glock *gl,
(unsigned long long)rgd->rd_addr, rgd->rd_flags,
rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
rgd->rd_reserved, rgd->rd_extfail_pt);
- if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
+ if (rgd->rd_sbd->sd_args.ar_rgrplvb && rgd->rd_rgl) {
struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
gfs2_print_dbg(seq, "%s L: f:%02x b:%u i:%u\n", fs_id_buf,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 9c593fd50c6a..15e757f76380 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1046,7 +1046,14 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
{
struct gfs2_sbd *sdp = root->d_sb->s_fs_info;
struct gfs2_args *args = &sdp->sd_args;
- int val;
+ unsigned int logd_secs, statfs_slow, statfs_quantum, quota_quantum;
+
+ spin_lock(&sdp->sd_tune.gt_spin);
+ logd_secs = sdp->sd_tune.gt_logd_secs;
+ quota_quantum = sdp->sd_tune.gt_quota_quantum;
+ statfs_quantum = sdp->sd_tune.gt_statfs_quantum;
+ statfs_slow = sdp->sd_tune.gt_statfs_slow;
+ spin_unlock(&sdp->sd_tune.gt_spin);
if (is_ancestor(root, sdp->sd_master_dir))
seq_puts(s, ",meta");
@@ -1101,17 +1108,14 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
}
if (args->ar_discard)
seq_puts(s, ",discard");
- val = sdp->sd_tune.gt_logd_secs;
- if (val != 30)
- seq_printf(s, ",commit=%d", val);
- val = sdp->sd_tune.gt_statfs_quantum;
- if (val != 30)
- seq_printf(s, ",statfs_quantum=%d", val);
- else if (sdp->sd_tune.gt_statfs_slow)
+ if (logd_secs != 30)
+ seq_printf(s, ",commit=%d", logd_secs);
+ if (statfs_quantum != 30)
+ seq_printf(s, ",statfs_quantum=%d", statfs_quantum);
+ else if (statfs_slow)
seq_puts(s, ",statfs_quantum=0");
- val = sdp->sd_tune.gt_quota_quantum;
- if (val != 60)
- seq_printf(s, ",quota_quantum=%d", val);
+ if (quota_quantum != 60)
+ seq_printf(s, ",quota_quantum=%d", quota_quantum);
if (args->ar_statfs_percent)
seq_printf(s, ",statfs_percent=%d", args->ar_statfs_percent);
if (args->ar_errors != GFS2_ERRORS_DEFAULT) {
@@ -1258,6 +1262,14 @@ static void gfs2_evict_inode(struct inode *inode)
if (inode->i_nlink || sb_rdonly(sb))
goto out;
+ /*
+ * In case of an incomplete mount, gfs2_evict_inode() may be called for
+ * system files without having an active journal to write to. In that
+ * case, skip the filesystem evict.
+ */
+ if (!sdp->sd_jdesc)
+ goto out;
+
if (test_bit(GIF_ALLOC_FAILED, &ip->i_flags)) {
BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));
gfs2_holder_mark_uninitialized(&gh);
diff --git a/fs/hfs/bnode.c b/fs/hfs/bnode.c
index c0a73a6ffb28..397e02a56697 100644
--- a/fs/hfs/bnode.c
+++ b/fs/hfs/bnode.c
@@ -281,6 +281,7 @@ static struct hfs_bnode *__hfs_bnode_create(struct hfs_btree *tree, u32 cnid)
tree->node_hash[hash] = node;
tree->node_hash_cnt++;
} else {
+ hfs_bnode_get(node2);
spin_unlock(&tree->hash_lock);
kfree(node);
wait_event(node2->lock_wq, !test_bit(HFS_BNODE_NEW, &node2->flags));
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index da243c84e93b..ee2ea5532e69 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -453,14 +453,16 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
/* panic? */
return -EIO;
+ res = -EIO;
+ if (HFS_I(main_inode)->cat_key.CName.len > HFS_NAMELEN)
+ goto out;
fd.search_key->cat = HFS_I(main_inode)->cat_key;
if (hfs_brec_find(&fd))
- /* panic? */
goto out;
if (S_ISDIR(main_inode->i_mode)) {
if (fd.entrylength < sizeof(struct hfs_cat_dir))
- /* panic? */;
+ goto out;
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_dir));
if (rec.type != HFS_CDR_DIR ||
@@ -473,6 +475,8 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_dir));
} else if (HFS_IS_RSRC(inode)) {
+ if (fd.entrylength < sizeof(struct hfs_cat_file))
+ goto out;
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
hfs_inode_write_fork(inode, rec.file.RExtRec,
@@ -481,7 +485,7 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
sizeof(struct hfs_cat_file));
} else {
if (fd.entrylength < sizeof(struct hfs_cat_file))
- /* panic? */;
+ goto out;
hfs_bnode_read(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
if (rec.type != HFS_CDR_FIL ||
@@ -498,9 +502,10 @@ int hfs_write_inode(struct inode *inode, struct writeback_control *wbc)
hfs_bnode_write(fd.bnode, &rec, fd.entryoffset,
sizeof(struct hfs_cat_file));
}
+ res = 0;
out:
hfs_find_exit(&fd);
- return 0;
+ return res;
}
static struct dentry *hfs_file_lookup(struct inode *dir, struct dentry *dentry,
diff --git a/fs/hfs/trans.c b/fs/hfs/trans.c
index 39f5e343bf4d..fdb0edb8a607 100644
--- a/fs/hfs/trans.c
+++ b/fs/hfs/trans.c
@@ -109,7 +109,7 @@ void hfs_asc2mac(struct super_block *sb, struct hfs_name *out, const struct qstr
if (nls_io) {
wchar_t ch;
- while (srclen > 0) {
+ while (srclen > 0 && dstlen > 0) {
size = nls_io->char2uni(src, srclen, &ch);
if (size < 0) {
ch = '?';
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index b8471bf05def..9580179ff535 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -198,6 +198,8 @@ struct hfsplus_sb_info {
#define HFSPLUS_SB_HFSX 3
#define HFSPLUS_SB_CASEFOLD 4
#define HFSPLUS_SB_NOBARRIER 5
+#define HFSPLUS_SB_UID 6
+#define HFSPLUS_SB_GID 7
static inline struct hfsplus_sb_info *HFSPLUS_SB(struct super_block *sb)
{
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index d131c8ea7eb6..15c14a6a9f7f 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -187,11 +187,11 @@ static void hfsplus_get_perms(struct inode *inode,
mode = be16_to_cpu(perms->mode);
i_uid_write(inode, be32_to_cpu(perms->owner));
- if (!i_uid_read(inode) && !mode)
+ if ((test_bit(HFSPLUS_SB_UID, &sbi->flags)) || (!i_uid_read(inode) && !mode))
inode->i_uid = sbi->uid;
i_gid_write(inode, be32_to_cpu(perms->group));
- if (!i_gid_read(inode) && !mode)
+ if ((test_bit(HFSPLUS_SB_GID, &sbi->flags)) || (!i_gid_read(inode) && !mode))
inode->i_gid = sbi->gid;
if (dir) {
@@ -497,8 +497,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
if (type == HFSPLUS_FOLDER) {
struct hfsplus_cat_folder *folder = &entry.folder;
- if (fd->entrylength < sizeof(struct hfsplus_cat_folder))
- /* panic? */;
+ if (fd->entrylength < sizeof(struct hfsplus_cat_folder)) {
+ pr_err("bad catalog folder entry\n");
+ res = -EIO;
+ goto out;
+ }
hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
sizeof(struct hfsplus_cat_folder));
hfsplus_get_perms(inode, &folder->permissions, 1);
@@ -518,8 +521,11 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
} else if (type == HFSPLUS_FILE) {
struct hfsplus_cat_file *file = &entry.file;
- if (fd->entrylength < sizeof(struct hfsplus_cat_file))
- /* panic? */;
+ if (fd->entrylength < sizeof(struct hfsplus_cat_file)) {
+ pr_err("bad catalog file entry\n");
+ res = -EIO;
+ goto out;
+ }
hfs_bnode_read(fd->bnode, &entry, fd->entryoffset,
sizeof(struct hfsplus_cat_file));
@@ -550,6 +556,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
pr_err("bad catalog entry used to create inode\n");
res = -EIO;
}
+out:
return res;
}
@@ -558,6 +565,7 @@ int hfsplus_cat_write_inode(struct inode *inode)
struct inode *main_inode = inode;
struct hfs_find_data fd;
hfsplus_cat_entry entry;
+ int res = 0;
if (HFSPLUS_IS_RSRC(inode))
main_inode = HFSPLUS_I(inode)->rsrc_inode;
@@ -576,8 +584,11 @@ int hfsplus_cat_write_inode(struct inode *inode)
if (S_ISDIR(main_inode->i_mode)) {
struct hfsplus_cat_folder *folder = &entry.folder;
- if (fd.entrylength < sizeof(struct hfsplus_cat_folder))
- /* panic? */;
+ if (fd.entrylength < sizeof(struct hfsplus_cat_folder)) {
+ pr_err("bad catalog folder entry\n");
+ res = -EIO;
+ goto out;
+ }
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_folder));
/* simple node checks? */
@@ -602,8 +613,11 @@ int hfsplus_cat_write_inode(struct inode *inode)
} else {
struct hfsplus_cat_file *file = &entry.file;
- if (fd.entrylength < sizeof(struct hfsplus_cat_file))
- /* panic? */;
+ if (fd.entrylength < sizeof(struct hfsplus_cat_file)) {
+ pr_err("bad catalog file entry\n");
+ res = -EIO;
+ goto out;
+ }
hfs_bnode_read(fd.bnode, &entry, fd.entryoffset,
sizeof(struct hfsplus_cat_file));
hfsplus_inode_write_fork(inode, &file->data_fork);
@@ -624,5 +638,5 @@ int hfsplus_cat_write_inode(struct inode *inode)
set_bit(HFSPLUS_I_CAT_DIRTY, &HFSPLUS_I(inode)->flags);
out:
hfs_find_exit(&fd);
- return 0;
+ return res;
}
diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c
index 047e05c57560..c94a58762ad6 100644
--- a/fs/hfsplus/options.c
+++ b/fs/hfsplus/options.c
@@ -140,6 +140,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
if (!uid_valid(sbi->uid)) {
pr_err("invalid uid specified\n");
return 0;
+ } else {
+ set_bit(HFSPLUS_SB_UID, &sbi->flags);
}
break;
case opt_gid:
@@ -151,6 +153,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi)
if (!gid_valid(sbi->gid)) {
pr_err("invalid gid specified\n");
return 0;
+ } else {
+ set_bit(HFSPLUS_SB_GID, &sbi->flags);
}
break;
case opt_part:
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 2b9e5743105e..29a39afe2653 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -295,11 +295,11 @@ static void hfsplus_put_super(struct super_block *sb)
hfsplus_sync_fs(sb, 1);
}
+ iput(sbi->alloc_file);
+ iput(sbi->hidden_dir);
hfs_btree_close(sbi->attr_tree);
hfs_btree_close(sbi->cat_tree);
hfs_btree_close(sbi->ext_tree);
- iput(sbi->alloc_file);
- iput(sbi->hidden_dir);
kfree(sbi->s_vhdr_buf);
kfree(sbi->s_backup_vhdr_buf);
unload_nls(sbi->nls);
diff --git a/fs/hfsplus/unicode.c b/fs/hfsplus/unicode.c
index c8d1b2be7854..73342c925a4b 100644
--- a/fs/hfsplus/unicode.c
+++ b/fs/hfsplus/unicode.c
@@ -398,7 +398,7 @@ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
astr = str->name;
len = str->len;
while (len > 0) {
- int uninitialized_var(dsize);
+ int dsize;
size = asc2unichar(sb, astr, len, &c);
astr += size;
len -= size;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7d039ba5ae28..47b292f9b4f8 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1205,6 +1205,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
{
struct hugetlbfs_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
+ struct hstate *h;
char *rest;
unsigned long ps;
int opt;
@@ -1232,7 +1233,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_size:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->max_size_opt = memparse(param->string, &rest);
ctx->max_val_type = SIZE_STD;
@@ -1242,23 +1243,24 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_nr_inodes:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->nr_inodes = memparse(param->string, &rest);
return 0;
case Opt_pagesize:
ps = memparse(param->string, &rest);
- ctx->hstate = size_to_hstate(ps);
- if (!ctx->hstate) {
+ h = size_to_hstate(ps);
+ if (!h) {
pr_err("Unsupported page size %lu MB\n", ps >> 20);
return -EINVAL;
}
+ ctx->hstate = h;
return 0;
case Opt_min_size:
/* memparse() will accept a K/M/G without a digit */
- if (!isdigit(param->string[0]))
+ if (!param->string || !isdigit(param->string[0]))
goto bad_val;
ctx->min_size_opt = memparse(param->string, &rest);
ctx->min_val_type = SIZE_STD;
diff --git a/fs/inode.c b/fs/inode.c
index 52cc64adb75c..78d7aba27ebd 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -167,8 +167,6 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_wb_frn_history = 0;
#endif
- if (security_inode_alloc(inode))
- goto out;
spin_lock_init(&inode->i_lock);
lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -199,11 +197,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_fsnotify_mask = 0;
#endif
inode->i_flctx = NULL;
+
+ if (unlikely(security_inode_alloc(inode)))
+ return -ENOMEM;
this_cpu_inc(nr_inodes);
return 0;
-out:
- return -ENOMEM;
}
EXPORT_SYMBOL(inode_init_always);
@@ -1014,6 +1013,48 @@ void discard_new_inode(struct inode *inode)
EXPORT_SYMBOL(discard_new_inode);
/**
+ * lock_two_inodes - lock two inodes (may be regular files but also dirs)
+ *
+ * Lock any non-NULL argument. The caller must make sure that if he is passing
+ * in two directories, one is not ancestor of the other. Zero, one or two
+ * objects may be locked by this function.
+ *
+ * @inode1: first inode to lock
+ * @inode2: second inode to lock
+ * @subclass1: inode lock subclass for the first lock obtained
+ * @subclass2: inode lock subclass for the second lock obtained
+ */
+void lock_two_inodes(struct inode *inode1, struct inode *inode2,
+ unsigned subclass1, unsigned subclass2)
+{
+ if (!inode1 || !inode2) {
+ /*
+ * Make sure @subclass1 will be used for the acquired lock.
+ * This is not strictly necessary (no current caller cares) but
+ * let's keep things consistent.
+ */
+ if (!inode1)
+ swap(inode1, inode2);
+ goto lock;
+ }
+
+ /*
+ * If one object is directory and the other is not, we must make sure
+ * to lock directory first as the other object may be its child.
+ */
+ if (S_ISDIR(inode2->i_mode) == S_ISDIR(inode1->i_mode)) {
+ if (inode1 > inode2)
+ swap(inode1, inode2);
+ } else if (!S_ISDIR(inode1->i_mode))
+ swap(inode1, inode2);
+lock:
+ if (inode1)
+ inode_lock_nested(inode1, subclass1);
+ if (inode2 && inode2 != inode1)
+ inode_lock_nested(inode2, subclass2);
+}
+
+/**
* lock_two_nondirectories - take two i_mutexes on non-directory objects
*
* Lock any non-NULL argument that is not a directory.
@@ -2060,10 +2101,6 @@ void inode_init_owner(struct inode *inode, const struct inode *dir,
/* Directories are special, and always inherit S_ISGID */
if (S_ISDIR(mode))
mode |= S_ISGID;
- else if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP) &&
- !in_group_p(inode->i_gid) &&
- !capable_wrt_inode_uidgid(dir, CAP_FSETID))
- mode &= ~S_ISGID;
} else
inode->i_gid = current_fsgid();
inode->i_mode = mode;
@@ -2319,3 +2356,31 @@ int vfs_ioc_fssetxattr_check(struct inode *inode, const struct fsxattr *old_fa,
return 0;
}
EXPORT_SYMBOL(vfs_ioc_fssetxattr_check);
+
+/**
+ * mode_strip_sgid - handle the sgid bit for non-directories
+ * @dir: parent directory inode
+ * @mode: mode of the file to be created in @dir
+ *
+ * If the @mode of the new file has both the S_ISGID and S_IXGRP bit
+ * raised and @dir has the S_ISGID bit raised ensure that the caller is
+ * either in the group of the parent directory or they have CAP_FSETID
+ * in their user namespace and are privileged over the parent directory.
+ * In all other cases, strip the S_ISGID bit from @mode.
+ *
+ * Return: the new mode to use for the file
+ */
+umode_t mode_strip_sgid(const struct inode *dir, umode_t mode)
+{
+ if ((mode & (S_ISGID | S_IXGRP)) != (S_ISGID | S_IXGRP))
+ return mode;
+ if (S_ISDIR(mode) || !dir || !(dir->i_mode & S_ISGID))
+ return mode;
+ if (in_group_p(dir->i_gid))
+ return mode;
+ if (capable_wrt_inode_uidgid(dir, CAP_FSETID))
+ return mode;
+
+ return mode & ~S_ISGID;
+}
+EXPORT_SYMBOL(mode_strip_sgid);
diff --git a/fs/internal.h b/fs/internal.h
index 61aed95f83d1..377f984e9226 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -138,6 +138,8 @@ extern int vfs_open(const struct path *, struct file *);
extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
extern void inode_add_lru(struct inode *inode);
extern int dentry_needs_remove_privs(struct dentry *dentry);
+void lock_two_inodes(struct inode *inode1, struct inode *inode2,
+ unsigned subclass1, unsigned subclass2);
/*
* fs-writeback.c
diff --git a/fs/io_uring.c b/fs/io_uring.c
index e73969fa96bc..2c793e4ccf09 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -63,7 +63,6 @@
#include <linux/net.h>
#include <net/sock.h>
#include <net/af_unix.h>
-#include <net/scm.h>
#include <linux/anon_inodes.h>
#include <linux/sched/mm.h>
#include <linux/uaccess.h>
@@ -264,10 +263,6 @@ struct io_ring_ctx {
struct async_list pending_async[2];
-#if defined(CONFIG_UNIX)
- struct socket *ring_sock;
-#endif
-
struct list_head task_list;
spinlock_t task_lock;
};
@@ -381,19 +376,6 @@ static struct kmem_cache *req_cachep;
static const struct file_operations io_uring_fops;
-struct sock *io_uring_get_socket(struct file *file)
-{
-#if defined(CONFIG_UNIX)
- if (file->f_op == &io_uring_fops) {
- struct io_ring_ctx *ctx = file->private_data;
-
- return ctx->ring_sock->sk;
- }
-#endif
- return NULL;
-}
-EXPORT_SYMBOL(io_uring_get_socket);
-
static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
@@ -551,7 +533,8 @@ static void io_kill_timeout(struct io_kiocb *req)
atomic_inc(&req->ctx->cq_timeouts);
list_del(&req->list);
io_cqring_fill_event(req->ctx, req->user_data, 0);
- __io_free_req(req);
+ if (refcount_dec_and_test(&req->refs))
+ __io_free_req(req);
}
}
@@ -1255,7 +1238,7 @@ static int io_import_fixed(struct io_ring_ctx *ctx, int rw,
*/
const struct bio_vec *bvec = imu->bvec;
- if (offset <= bvec->bv_len) {
+ if (offset < bvec->bv_len) {
iov_iter_advance(iter, offset);
} else {
unsigned long seg_skip;
@@ -1908,6 +1891,9 @@ static int io_poll_add(struct io_kiocb *req, const struct io_uring_sqe *sqe)
__poll_t mask;
u16 events;
+ if (req->file->f_op->may_pollfree)
+ return -EOPNOTSUPP;
+
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
if (sqe->addr || sqe->ioprio || sqe->off || sqe->len || sqe->buf_index)
@@ -2076,12 +2062,12 @@ static int io_timeout(struct io_kiocb *req, const struct io_uring_sqe *sqe)
req->sequence -= span;
add:
list_add(&req->list, entry);
- spin_unlock_irq(&ctx->completion_lock);
hrtimer_init(&req->timeout.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
req->timeout.timer.function = io_timeout_fn;
hrtimer_start(&req->timeout.timer, timespec64_to_ktime(ts),
HRTIMER_MODE_REL);
+ spin_unlock_irq(&ctx->completion_lock);
return 0;
}
@@ -3076,20 +3062,10 @@ static int io_cqring_wait(struct io_ring_ctx *ctx, int min_events,
static void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
{
-#if defined(CONFIG_UNIX)
- if (ctx->ring_sock) {
- struct sock *sock = ctx->ring_sock->sk;
- struct sk_buff *skb;
-
- while ((skb = skb_dequeue(&sock->sk_receive_queue)) != NULL)
- kfree_skb(skb);
- }
-#else
int i;
for (i = 0; i < ctx->nr_user_files; i++)
fput(ctx->user_files[i]);
-#endif
}
static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
@@ -3133,100 +3109,6 @@ static void io_finish_async(struct io_ring_ctx *ctx)
}
}
-#if defined(CONFIG_UNIX)
-static void io_destruct_skb(struct sk_buff *skb)
-{
- struct io_ring_ctx *ctx = skb->sk->sk_user_data;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(ctx->sqo_wq); i++)
- if (ctx->sqo_wq[i])
- flush_workqueue(ctx->sqo_wq[i]);
-
- unix_destruct_scm(skb);
-}
-
-/*
- * Ensure the UNIX gc is aware of our file set, so we are certain that
- * the io_uring can be safely unregistered on process exit, even if we have
- * loops in the file referencing.
- */
-static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset)
-{
- struct sock *sk = ctx->ring_sock->sk;
- struct scm_fp_list *fpl;
- struct sk_buff *skb;
- int i;
-
- fpl = kzalloc(sizeof(*fpl), GFP_KERNEL);
- if (!fpl)
- return -ENOMEM;
-
- skb = alloc_skb(0, GFP_KERNEL);
- if (!skb) {
- kfree(fpl);
- return -ENOMEM;
- }
-
- skb->sk = sk;
- skb->destructor = io_destruct_skb;
-
- fpl->user = get_uid(ctx->user);
- for (i = 0; i < nr; i++) {
- fpl->fp[i] = get_file(ctx->user_files[i + offset]);
- unix_inflight(fpl->user, fpl->fp[i]);
- }
-
- fpl->max = fpl->count = nr;
- UNIXCB(skb).fp = fpl;
- refcount_add(skb->truesize, &sk->sk_wmem_alloc);
- skb_queue_head(&sk->sk_receive_queue, skb);
-
- for (i = 0; i < nr; i++)
- fput(fpl->fp[i]);
-
- return 0;
-}
-
-/*
- * If UNIX sockets are enabled, fd passing can cause a reference cycle which
- * causes regular reference counting to break down. We rely on the UNIX
- * garbage collection to take care of this problem for us.
- */
-static int io_sqe_files_scm(struct io_ring_ctx *ctx)
-{
- unsigned left, total;
- int ret = 0;
-
- total = 0;
- left = ctx->nr_user_files;
- while (left) {
- unsigned this_files = min_t(unsigned, left, SCM_MAX_FD);
-
- ret = __io_sqe_files_scm(ctx, this_files, total);
- if (ret)
- break;
- left -= this_files;
- total += this_files;
- }
-
- if (!ret)
- return 0;
-
- while (total < ctx->nr_user_files) {
- fput(ctx->user_files[total]);
- total++;
- }
-
- return ret;
-}
-#else
-static int io_sqe_files_scm(struct io_ring_ctx *ctx)
-{
- return 0;
-}
-#endif
-
static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
unsigned nr_args)
{
@@ -3280,11 +3162,7 @@ static int io_sqe_files_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
- ret = io_sqe_files_scm(ctx);
- if (ret)
- io_sqe_files_unregister(ctx);
-
- return ret;
+ return 0;
}
static int io_sq_offload_start(struct io_ring_ctx *ctx,
@@ -3682,13 +3560,6 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
io_sqe_files_unregister(ctx);
io_eventfd_unregister(ctx);
-#if defined(CONFIG_UNIX)
- if (ctx->ring_sock) {
- ctx->ring_sock->file = NULL; /* so that iput() is called */
- sock_release(ctx->ring_sock);
- }
-#endif
-
io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
@@ -3734,9 +3605,6 @@ static void io_cancel_async_work(struct io_ring_ctx *ctx,
{
struct io_kiocb *req;
- if (list_empty(&ctx->task_list))
- return;
-
spin_lock_irq(&ctx->task_lock);
list_for_each_entry(req, &ctx->task_list, task_list) {
@@ -3891,6 +3759,11 @@ static const struct file_operations io_uring_fops = {
.fasync = io_uring_fasync,
};
+bool io_is_uring_fops(struct file *file)
+{
+ return file->f_op == &io_uring_fops;
+}
+
static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
struct io_uring_params *p)
{
@@ -3938,45 +3811,26 @@ static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
/*
* Allocate an anonymous fd, this is what constitutes the application
* visible backing of an io_uring instance. The application mmaps this
- * fd to gain access to the SQ/CQ ring details. If UNIX sockets are enabled,
- * we have to tie this fd to a socket for file garbage collection purposes.
+ * fd to gain access to the SQ/CQ ring details.
*/
static int io_uring_get_fd(struct io_ring_ctx *ctx)
{
struct file *file;
int ret;
-#if defined(CONFIG_UNIX)
- ret = sock_create_kern(&init_net, PF_UNIX, SOCK_RAW, IPPROTO_IP,
- &ctx->ring_sock);
- if (ret)
- return ret;
-#endif
-
ret = get_unused_fd_flags(O_RDWR | O_CLOEXEC);
if (ret < 0)
- goto err;
+ return ret;
file = anon_inode_getfile("[io_uring]", &io_uring_fops, ctx,
O_RDWR | O_CLOEXEC);
if (IS_ERR(file)) {
put_unused_fd(ret);
- ret = PTR_ERR(file);
- goto err;
+ return PTR_ERR(file);
}
-#if defined(CONFIG_UNIX)
- ctx->ring_sock->file = file;
- ctx->ring_sock->sk->sk_user_data = ctx;
-#endif
fd_install(ret, file);
return ret;
-err:
-#if defined(CONFIG_UNIX)
- sock_release(ctx->ring_sock);
- ctx->ring_sock = NULL;
-#endif
- return ret;
}
static int io_uring_create(unsigned entries, struct io_uring_params *p)
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 53cd7b2bb580..c28ba474a25e 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -23,6 +23,7 @@ static struct iomap_page *
iomap_page_create(struct inode *inode, struct page *page)
{
struct iomap_page *iop = to_iomap_page(page);
+ unsigned int nr_blocks = PAGE_SIZE / i_blocksize(inode);
if (iop || i_blocksize(inode) == PAGE_SIZE)
return iop;
@@ -32,6 +33,8 @@ iomap_page_create(struct inode *inode, struct page *page)
atomic_set(&iop->write_count, 0);
spin_lock_init(&iop->uptodate_lock);
bitmap_zero(iop->uptodate, PAGE_SIZE / SECTOR_SIZE);
+ if (PageUptodate(page))
+ bitmap_fill(iop->uptodate, nr_blocks);
/*
* migrate_page_move_mapping() assumes that pages with private data have
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 74e487d63c62..95c08f3c4b35 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -912,8 +912,22 @@ root_found:
* we then decide whether to use the Joliet descriptor.
*/
inode = isofs_iget(s, sbi->s_firstdatazone, 0);
- if (IS_ERR(inode))
- goto out_no_root;
+
+ /*
+ * Fix for broken CDs with a corrupt root inode but a correct Joliet
+ * root directory.
+ */
+ if (IS_ERR(inode)) {
+ if (joliet_level && sbi->s_firstdatazone != first_data_zone) {
+ printk(KERN_NOTICE
+ "ISOFS: root inode is unusable. "
+ "Disabling Rock Ridge and switching to Joliet.");
+ sbi->s_rock = 0;
+ inode = NULL;
+ } else {
+ goto out_no_root;
+ }
+ }
/*
* Fix for broken CDs with Rock Ridge and empty ISO root directory but
diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c
index 558e7c51ce0d..58f80e1b3ac0 100644
--- a/fs/isofs/namei.c
+++ b/fs/isofs/namei.c
@@ -153,8 +153,8 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry,
struct dentry *isofs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
int found;
- unsigned long uninitialized_var(block);
- unsigned long uninitialized_var(offset);
+ unsigned long block;
+ unsigned long offset;
struct inode *inode;
struct page *page;
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 5ef99b9ec8be..edb17822f8e6 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -58,28 +58,6 @@ static inline void __buffer_unlink(struct journal_head *jh)
}
/*
- * Move a buffer from the checkpoint list to the checkpoint io list
- *
- * Called with j_list_lock held
- */
-static inline void __buffer_relink_io(struct journal_head *jh)
-{
- transaction_t *transaction = jh->b_cp_transaction;
-
- __buffer_unlink_first(jh);
-
- if (!transaction->t_checkpoint_io_list) {
- jh->b_cpnext = jh->b_cpprev = jh;
- } else {
- jh->b_cpnext = transaction->t_checkpoint_io_list;
- jh->b_cpprev = transaction->t_checkpoint_io_list->b_cpprev;
- jh->b_cpprev->b_cpnext = jh;
- jh->b_cpnext->b_cpprev = jh;
- }
- transaction->t_checkpoint_io_list = jh;
-}
-
-/*
* Try to release a checkpointed buffer from its transaction.
* Returns 1 if we released it and 2 if we also released the
* whole transaction.
@@ -91,8 +69,7 @@ static int __try_to_free_cp_buf(struct journal_head *jh)
int ret = 0;
struct buffer_head *bh = jh2bh(jh);
- if (jh->b_transaction == NULL && !buffer_locked(bh) &&
- !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
+ if (!jh->b_transaction && !buffer_locked(bh) && !buffer_dirty(bh)) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
ret = __jbd2_journal_remove_checkpoint(jh) + 1;
}
@@ -191,6 +168,7 @@ __flush_batch(journal_t *journal, int *batch_count)
struct buffer_head *bh = journal->j_chkpt_bhs[i];
BUFFER_TRACE(bh, "brelse");
__brelse(bh);
+ journal->j_chkpt_bhs[i] = NULL;
}
*batch_count = 0;
}
@@ -228,7 +206,6 @@ int jbd2_log_do_checkpoint(journal_t *journal)
* OK, we need to start writing disk blocks. Take one transaction
* and write it.
*/
- result = 0;
spin_lock(&journal->j_list_lock);
if (!journal->j_checkpoint_transactions)
goto out;
@@ -251,15 +228,6 @@ restart:
jh = transaction->t_checkpoint_list;
bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- goto retry;
- }
if (jh->b_transaction != NULL) {
transaction_t *t = jh->b_transaction;
tid_t tid = t->t_tid;
@@ -294,32 +262,50 @@ restart:
spin_lock(&journal->j_list_lock);
goto restart;
}
- if (!buffer_dirty(bh)) {
- if (unlikely(buffer_write_io_error(bh)) && !result)
- result = -EIO;
+ if (!trylock_buffer(bh)) {
+ /*
+ * The buffer is locked, it may be writing back, or
+ * flushing out in the last couple of cycles, or
+ * re-adding into a new transaction, need to check
+ * it again until it's unlocked.
+ */
+ get_bh(bh);
+ spin_unlock(&journal->j_list_lock);
+ wait_on_buffer(bh);
+ /* the journal_head may have gone by now */
+ BUFFER_TRACE(bh, "brelse");
+ __brelse(bh);
+ goto retry;
+ } else if (!buffer_dirty(bh)) {
+ unlock_buffer(bh);
BUFFER_TRACE(bh, "remove from checkpoint");
- if (__jbd2_journal_remove_checkpoint(jh))
- /* The transaction was released; we're done */
+ /*
+ * If the transaction was released or the checkpoint
+ * list was empty, we're done.
+ */
+ if (__jbd2_journal_remove_checkpoint(jh) ||
+ !transaction->t_checkpoint_list)
goto out;
- continue;
+ } else {
+ unlock_buffer(bh);
+ /*
+ * We are about to write the buffer, it could be
+ * raced by some other transaction shrink or buffer
+ * re-log logic once we release the j_list_lock,
+ * leave it on the checkpoint list and check status
+ * again to make sure it's clean.
+ */
+ BUFFER_TRACE(bh, "queue");
+ get_bh(bh);
+ J_ASSERT_BH(bh, !buffer_jwrite(bh));
+ journal->j_chkpt_bhs[batch_count++] = bh;
+ transaction->t_chp_stats.cs_written++;
+ transaction->t_checkpoint_list = jh->b_cpnext;
}
- /*
- * Important: we are about to write the buffer, and
- * possibly block, while still holding the journal
- * lock. We cannot afford to let the transaction
- * logic start messing around with this buffer before
- * we write it to disk, as that would break
- * recoverability.
- */
- BUFFER_TRACE(bh, "queue");
- get_bh(bh);
- J_ASSERT_BH(bh, !buffer_jwrite(bh));
- journal->j_chkpt_bhs[batch_count++] = bh;
- __buffer_relink_io(jh);
- transaction->t_chp_stats.cs_written++;
+
if ((batch_count == JBD2_NR_BATCH) ||
- need_resched() ||
- spin_needbreak(&journal->j_list_lock))
+ need_resched() || spin_needbreak(&journal->j_list_lock) ||
+ jh2bh(transaction->t_checkpoint_list) == journal->j_chkpt_bhs[0])
goto unlock_and_flush;
}
@@ -333,46 +319,9 @@ restart:
goto restart;
}
- /*
- * Now we issued all of the transaction's buffers, let's deal
- * with the buffers that are out for I/O.
- */
-restart2:
- /* Did somebody clean up the transaction in the meanwhile? */
- if (journal->j_checkpoint_transactions != transaction ||
- transaction->t_tid != this_tid)
- goto out;
-
- while (transaction->t_checkpoint_io_list) {
- jh = transaction->t_checkpoint_io_list;
- bh = jh2bh(jh);
- if (buffer_locked(bh)) {
- get_bh(bh);
- spin_unlock(&journal->j_list_lock);
- wait_on_buffer(bh);
- /* the journal_head may have gone by now */
- BUFFER_TRACE(bh, "brelse");
- __brelse(bh);
- spin_lock(&journal->j_list_lock);
- goto restart2;
- }
- if (unlikely(buffer_write_io_error(bh)) && !result)
- result = -EIO;
-
- /*
- * Now in whatever state the buffer currently is, we
- * know that it has been written out and so we can
- * drop it from the list
- */
- if (__jbd2_journal_remove_checkpoint(jh))
- break;
- }
out:
spin_unlock(&journal->j_list_lock);
- if (result < 0)
- jbd2_journal_abort(journal, result);
- else
- result = jbd2_cleanup_journal_tail(journal);
+ result = jbd2_cleanup_journal_tail(journal);
return (result < 0) ? result : 0;
}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d45ceb2e2149..7bd4b1d4224e 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -514,13 +514,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
*/
jbd2_journal_switch_revoke_table(journal);
+ write_lock(&journal->j_state_lock);
/*
* Reserved credits cannot be claimed anymore, free them
*/
atomic_sub(atomic_read(&journal->j_reserved_credits),
&commit_transaction->t_outstanding_credits);
- write_lock(&journal->j_state_lock);
trace_jbd2_commit_flushing(journal, commit_transaction);
stats.run.rs_flushing = jiffies;
stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -531,7 +531,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
journal->j_running_transaction = NULL;
start_time = ktime_get();
commit_transaction->t_log_start = journal->j_head;
- wake_up(&journal->j_wait_transaction_locked);
+ wake_up_all(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
jbd_debug(3, "JBD2: commit phase 2a\n");
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b7c5819bfc41..81bd7b29a10b 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -562,12 +562,14 @@ static int __jbd2_journal_force_commit(journal_t *journal)
}
/**
- * Force and wait upon a commit if the calling process is not within
- * transaction. This is used for forcing out undo-protected data which contains
- * bitmaps, when the fs is running out of space.
+ * jbd2_journal_force_commit_nested - Force and wait upon a commit if the
+ * calling process is not within transaction.
*
* @journal: journal to force
* Returns true if progress was made.
+ *
+ * This is used for forcing out undo-protected data which contains
+ * bitmaps, when the fs is running out of space.
*/
int jbd2_journal_force_commit_nested(journal_t *journal)
{
@@ -578,7 +580,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal)
}
/**
- * int journal_force_commit() - force any uncommitted transactions
+ * jbd2_journal_force_commit() - force any uncommitted transactions
* @journal: journal to force
*
* Caller want unconditional commit. We can only force the running transaction
@@ -1266,7 +1268,7 @@ journal_t *jbd2_journal_init_inode(struct inode *inode)
* superblock as being NULL to prevent the journal destroy from writing
* back a bogus superblock.
*/
-static void journal_fail_superblock (journal_t *journal)
+static void journal_fail_superblock(journal_t *journal)
{
struct buffer_head *bh = journal->j_sb_buffer;
brelse(bh);
@@ -1353,9 +1355,11 @@ static int jbd2_write_superblock(journal_t *journal, int write_flags)
return -EIO;
}
- trace_jbd2_write_superblock(journal, write_flags);
if (!(journal->j_flags & JBD2_BARRIER))
write_flags &= ~(REQ_FUA | REQ_PREFLUSH);
+
+ trace_jbd2_write_superblock(journal, write_flags);
+
if (buffer_write_io_error(bh)) {
/*
* Oh, dear. A previous attempt to write the journal
@@ -1634,7 +1638,7 @@ static int load_superblock(journal_t *journal)
/**
- * int jbd2_journal_load() - Read journal from disk.
+ * jbd2_journal_load() - Read journal from disk.
* @journal: Journal to act on.
*
* Given a journal_t structure which tells us which disk blocks contain
@@ -1704,7 +1708,7 @@ recovery_error:
}
/**
- * void jbd2_journal_destroy() - Release a journal_t structure.
+ * jbd2_journal_destroy() - Release a journal_t structure.
* @journal: Journal to act on.
*
* Release a journal_t structure once it is no longer in use by the
@@ -1780,7 +1784,7 @@ int jbd2_journal_destroy(journal_t *journal)
/**
- *int jbd2_journal_check_used_features () - Check if features specified are used.
+ * jbd2_journal_check_used_features() - Check if features specified are used.
* @journal: Journal to check.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -1790,7 +1794,7 @@ int jbd2_journal_destroy(journal_t *journal)
* features. Return true (non-zero) if it does.
**/
-int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_check_used_features(journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
journal_superblock_t *sb;
@@ -1815,7 +1819,7 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
}
/**
- * int jbd2_journal_check_available_features() - Check feature set in journalling layer
+ * jbd2_journal_check_available_features() - Check feature set in journalling layer
* @journal: Journal to check.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -1825,7 +1829,7 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat,
* all of a given set of features on this journal. Return true
* (non-zero) if it can. */
-int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_check_available_features(journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
if (!compat && !ro && !incompat)
@@ -1847,7 +1851,7 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com
}
/**
- * int jbd2_journal_set_features () - Mark a given journal feature in the superblock
+ * jbd2_journal_set_features() - Mark a given journal feature in the superblock
* @journal: Journal to act on.
* @compat: bitmask of compatible features
* @ro: bitmask of features that force read-only mount
@@ -1858,7 +1862,7 @@ int jbd2_journal_check_available_features (journal_t *journal, unsigned long com
*
*/
-int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
+int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
unsigned long ro, unsigned long incompat)
{
#define INCOMPAT_FEATURE_ON(f) \
@@ -1929,7 +1933,7 @@ int jbd2_journal_set_features (journal_t *journal, unsigned long compat,
}
/*
- * jbd2_journal_clear_features () - Clear a given journal feature in the
+ * jbd2_journal_clear_features() - Clear a given journal feature in the
* superblock
* @journal: Journal to act on.
* @compat: bitmask of compatible features
@@ -1956,7 +1960,7 @@ void jbd2_journal_clear_features(journal_t *journal, unsigned long compat,
EXPORT_SYMBOL(jbd2_journal_clear_features);
/**
- * int jbd2_journal_flush () - Flush journal
+ * jbd2_journal_flush() - Flush journal
* @journal: Journal to act on.
*
* Flush all data for a given journal to disk and empty the journal.
@@ -2031,7 +2035,7 @@ out:
}
/**
- * int jbd2_journal_wipe() - Wipe journal contents
+ * jbd2_journal_wipe() - Wipe journal contents
* @journal: Journal to act on.
* @write: flag (see below)
*
@@ -2072,7 +2076,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
}
/**
- * void jbd2_journal_abort () - Shutdown the journal immediately.
+ * jbd2_journal_abort () - Shutdown the journal immediately.
* @journal: the journal to shutdown.
* @errno: an error number to record in the journal indicating
* the reason for the shutdown.
@@ -2158,7 +2162,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
}
/**
- * int jbd2_journal_errno () - returns the journal's error state.
+ * jbd2_journal_errno() - returns the journal's error state.
* @journal: journal to examine.
*
* This is the errno number set with jbd2_journal_abort(), the last
@@ -2182,7 +2186,7 @@ int jbd2_journal_errno(journal_t *journal)
}
/**
- * int jbd2_journal_clear_err () - clears the journal's error state
+ * jbd2_journal_clear_err() - clears the journal's error state
* @journal: journal to act on.
*
* An error must be cleared or acked to take a FS out of readonly
@@ -2202,7 +2206,7 @@ int jbd2_journal_clear_err(journal_t *journal)
}
/**
- * void jbd2_journal_ack_err() - Ack journal err.
+ * jbd2_journal_ack_err() - Ack journal err.
* @journal: journal to act on.
*
* An error must be cleared or acked to take a FS out of readonly
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index a4967b27ffb6..ed923a9765c2 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -247,6 +247,8 @@ int jbd2_journal_recover(journal_t *journal)
journal_superblock_t * sb;
struct recovery_info info;
+ errseq_t wb_err;
+ struct address_space *mapping;
memset(&info, 0, sizeof(info));
sb = journal->j_superblock;
@@ -264,6 +266,9 @@ int jbd2_journal_recover(journal_t *journal)
return 0;
}
+ wb_err = 0;
+ mapping = journal->j_fs_dev->bd_inode->i_mapping;
+ errseq_check_and_advance(&mapping->wb_err, &wb_err);
err = do_one_pass(journal, &info, PASS_SCAN);
if (!err)
err = do_one_pass(journal, &info, PASS_REVOKE);
@@ -284,6 +289,9 @@ int jbd2_journal_recover(journal_t *journal)
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
+ err2 = errseq_check_and_advance(&mapping->wb_err, &wb_err);
+ if (!err)
+ err = err2;
/* Make sure all replayed data is on permanent storage */
if (journal->j_flags & JBD2_BARRIER) {
err2 = blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index be05fb96757c..91c2d3f6d1b3 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -149,7 +149,7 @@ static void wait_transaction_locked(journal_t *journal)
int need_to_start;
tid_t tid = journal->j_running_transaction->t_tid;
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
@@ -175,7 +175,7 @@ static void wait_transaction_switching(journal_t *journal)
read_unlock(&journal->j_state_lock);
return;
}
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
read_unlock(&journal->j_state_lock);
/*
@@ -490,7 +490,7 @@ EXPORT_SYMBOL(jbd2__journal_start);
/**
- * handle_t *jbd2_journal_start() - Obtain a new handle.
+ * jbd2_journal_start() - Obtain a new handle.
* @journal: Journal to start transaction on.
* @nblocks: number of block buffer we might modify
*
@@ -525,7 +525,7 @@ void jbd2_journal_free_reserved(handle_t *handle)
EXPORT_SYMBOL(jbd2_journal_free_reserved);
/**
- * int jbd2_journal_start_reserved() - start reserved handle
+ * jbd2_journal_start_reserved() - start reserved handle
* @handle: handle to start
* @type: for handle statistics
* @line_no: for handle statistics
@@ -579,7 +579,7 @@ int jbd2_journal_start_reserved(handle_t *handle, unsigned int type,
EXPORT_SYMBOL(jbd2_journal_start_reserved);
/**
- * int jbd2_journal_extend() - extend buffer credits.
+ * jbd2_journal_extend() - extend buffer credits.
* @handle: handle to 'extend'
* @nblocks: nr blocks to try to extend by.
*
@@ -659,7 +659,7 @@ error_out:
/**
- * int jbd2_journal_restart() - restart a handle .
+ * jbd2__journal_restart() - restart a handle .
* @handle: handle to restart
* @nblocks: nr credits requested
* @gfp_mask: memory allocation flags (for start_this_handle)
@@ -736,7 +736,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks)
EXPORT_SYMBOL(jbd2_journal_restart);
/**
- * void jbd2_journal_lock_updates () - establish a transaction barrier.
+ * jbd2_journal_lock_updates () - establish a transaction barrier.
* @journal: Journal to establish a barrier on.
*
* This locks out any further updates from being started, and blocks
@@ -795,7 +795,7 @@ void jbd2_journal_lock_updates(journal_t *journal)
}
/**
- * void jbd2_journal_unlock_updates (journal_t* journal) - release barrier
+ * jbd2_journal_unlock_updates () - release barrier
* @journal: Journal to release the barrier on.
*
* Release a transaction barrier obtained with jbd2_journal_lock_updates().
@@ -810,7 +810,7 @@ void jbd2_journal_unlock_updates (journal_t *journal)
write_lock(&journal->j_state_lock);
--journal->j_barrier_count;
write_unlock(&journal->j_state_lock);
- wake_up(&journal->j_wait_transaction_locked);
+ wake_up_all(&journal->j_wait_transaction_locked);
}
static void warn_dirty_buffer(struct buffer_head *bh)
@@ -1103,7 +1103,8 @@ out:
}
/**
- * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update.
+ * jbd2_journal_get_write_access() - notify intent to modify a buffer
+ * for metadata (not data) update.
* @handle: transaction to add buffer modifications to
* @bh: bh to be used for metadata writes
*
@@ -1147,7 +1148,7 @@ int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
* unlocked buffer beforehand. */
/**
- * int jbd2_journal_get_create_access () - notify intent to use newly created bh
+ * jbd2_journal_get_create_access () - notify intent to use newly created bh
* @handle: transaction to new buffer to
* @bh: new buffer.
*
@@ -1227,7 +1228,7 @@ out:
}
/**
- * int jbd2_journal_get_undo_access() - Notify intent to modify metadata with
+ * jbd2_journal_get_undo_access() - Notify intent to modify metadata with
* non-rewindable consequences
* @handle: transaction
* @bh: buffer to undo
@@ -1304,7 +1305,7 @@ out:
}
/**
- * void jbd2_journal_set_triggers() - Add triggers for commit writeout
+ * jbd2_journal_set_triggers() - Add triggers for commit writeout
* @bh: buffer to trigger on
* @type: struct jbd2_buffer_trigger_type containing the trigger(s).
*
@@ -1346,7 +1347,7 @@ void jbd2_buffer_abort_trigger(struct journal_head *jh,
}
/**
- * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
+ * jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
*
@@ -1375,8 +1376,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
struct journal_head *jh;
int ret = 0;
- if (is_handle_aborted(handle))
- return -EROFS;
if (!buffer_jbd(bh))
return -EUCLEAN;
@@ -1423,6 +1422,18 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
journal = transaction->t_journal;
jbd_lock_bh_state(bh);
+ if (is_handle_aborted(handle)) {
+ /*
+ * Check journal aborting with @jh->b_state_lock locked,
+ * since 'jh->b_transaction' could be replaced with
+ * 'jh->b_next_transaction' during old transaction
+ * committing if journal aborted, which may fail
+ * assertion on 'jh->b_frozen_data == NULL'.
+ */
+ ret = -EROFS;
+ goto out_unlock_bh;
+ }
+
if (jh->b_modified == 0) {
/*
* This buffer's got modified and becoming part
@@ -1514,7 +1525,7 @@ out:
}
/**
- * void jbd2_journal_forget() - bforget() for potentially-journaled buffers.
+ * jbd2_journal_forget() - bforget() for potentially-journaled buffers.
* @handle: transaction handle
* @bh: bh to 'forget'
*
@@ -1689,7 +1700,7 @@ not_jbd:
}
/**
- * int jbd2_journal_stop() - complete a transaction
+ * jbd2_journal_stop() - complete a transaction
* @handle: transaction to complete.
*
* All done for a particular handle.
@@ -2037,7 +2048,7 @@ out:
}
/**
- * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
+ * jbd2_journal_try_to_free_buffers() - try to free page buffers.
* @journal: journal for operation
* @page: to try and free
* @gfp_mask: we use the mask to detect how hard should we try to release
@@ -2376,7 +2387,7 @@ zap_buffer_unlocked:
}
/**
- * void jbd2_journal_invalidatepage()
+ * jbd2_journal_invalidatepage()
* @journal: journal to use for flush...
* @page: page to flush
* @offset: start of the range to invalidate
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index 837cd55fd4c5..6ae9d6fefb86 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -211,7 +211,10 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c)
ic->scan_dents = NULL;
cond_resched();
}
- jffs2_build_xattr_subsystem(c);
+ ret = jffs2_build_xattr_subsystem(c);
+ if (ret)
+ goto exit;
+
c->flags &= ~JFFS2_SB_FLAG_BUILDING;
dbg_fsbuild("FS build complete\n");
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 83b8f06b4a64..7e9abdb89712 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -401,7 +401,7 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb
{
size_t retlen;
int ret;
- uint32_t uninitialized_var(bad_offset);
+ uint32_t bad_offset;
switch (jffs2_block_check_erase(c, jeb, &bad_offset)) {
case -EAGAIN: goto refile;
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 34880a4c2173..94bd4bbd3787 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -137,19 +137,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
pgoff_t index = pos >> PAGE_SHIFT;
- uint32_t pageofs = index << PAGE_SHIFT;
int ret = 0;
jffs2_dbg(1, "%s()\n", __func__);
- if (pageofs > inode->i_size) {
- /* Make new hole frag from old EOF to new page */
+ if (pos > inode->i_size) {
+ /* Make new hole frag from old EOF to new position */
struct jffs2_raw_inode ri;
struct jffs2_full_dnode *fn;
uint32_t alloc_len;
- jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
- (unsigned int)inode->i_size, pageofs);
+ jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new position\n",
+ (unsigned int)inode->i_size, (uint32_t)pos);
ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
@@ -169,10 +168,10 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
ri.mode = cpu_to_jemode(inode->i_mode);
ri.uid = cpu_to_je16(i_uid_read(inode));
ri.gid = cpu_to_je16(i_gid_read(inode));
- ri.isize = cpu_to_je32(max((uint32_t)inode->i_size, pageofs));
+ ri.isize = cpu_to_je32((uint32_t)pos);
ri.atime = ri.ctime = ri.mtime = cpu_to_je32(JFFS2_NOW());
ri.offset = cpu_to_je32(inode->i_size);
- ri.dsize = cpu_to_je32(pageofs - inode->i_size);
+ ri.dsize = cpu_to_je32((uint32_t)pos - inode->i_size);
ri.csize = cpu_to_je32(0);
ri.compr = JFFS2_COMPR_ZERO;
ri.node_crc = cpu_to_je32(crc32(0, &ri, sizeof(ri)-8));
@@ -202,7 +201,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
goto out_err;
}
jffs2_complete_reservation(c);
- inode->i_size = pageofs;
+ inode->i_size = pos;
mutex_unlock(&f->sem);
}
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index da3e18503c65..acb4492f5970 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -772,10 +772,10 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c)
}
#define XREF_TMPHASH_SIZE (128)
-void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
+int jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
{
struct jffs2_xattr_ref *ref, *_ref;
- struct jffs2_xattr_ref *xref_tmphash[XREF_TMPHASH_SIZE];
+ struct jffs2_xattr_ref **xref_tmphash;
struct jffs2_xattr_datum *xd, *_xd;
struct jffs2_inode_cache *ic;
struct jffs2_raw_node_ref *raw;
@@ -784,9 +784,12 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING));
+ xref_tmphash = kcalloc(XREF_TMPHASH_SIZE,
+ sizeof(struct jffs2_xattr_ref *), GFP_KERNEL);
+ if (!xref_tmphash)
+ return -ENOMEM;
+
/* Phase.1 : Merge same xref */
- for (i=0; i < XREF_TMPHASH_SIZE; i++)
- xref_tmphash[i] = NULL;
for (ref=c->xref_temp; ref; ref=_ref) {
struct jffs2_xattr_ref *tmp;
@@ -884,6 +887,8 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c)
"%u of xref (%u dead, %u orphan) found.\n",
xdatum_count, xdatum_unchecked_count, xdatum_orphan_count,
xref_count, xref_dead_count, xref_orphan_count);
+ kfree(xref_tmphash);
+ return 0;
}
struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h
index 720007b2fd65..1b5030a3349d 100644
--- a/fs/jffs2/xattr.h
+++ b/fs/jffs2/xattr.h
@@ -71,7 +71,7 @@ static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref)
#ifdef CONFIG_JFFS2_FS_XATTR
extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c);
-extern void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c);
+extern int jffs2_build_xattr_subsystem(struct jffs2_sb_info *c);
extern void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c);
extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c,
@@ -103,7 +103,7 @@ extern ssize_t jffs2_listxattr(struct dentry *, char *, size_t);
#else
#define jffs2_init_xattr_subsystem(c)
-#define jffs2_build_xattr_subsystem(c)
+#define jffs2_build_xattr_subsystem(c) (0)
#define jffs2_clear_xattr_subsystem(c)
#define jffs2_xattr_do_crccheck_inode(c, ic)
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index d3cb27487c70..deb54efb5601 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -63,10 +63,10 @@
*/
static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
int nblocks);
-static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
-static int dbBackSplit(dmtree_t * tp, int leafno);
-static int dbJoin(dmtree_t * tp, int leafno, int newval);
-static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
+static void dbSplit(dmtree_t *tp, int leafno, int splitsz, int newval, bool is_ctl);
+static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl);
+static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl);
+static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl);
static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
int level);
static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results);
@@ -87,7 +87,7 @@ static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno,
static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
static int dbFindBits(u32 word, int l2nb);
static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno);
-static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx);
+static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl);
static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
int nblocks);
static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
@@ -155,7 +155,7 @@ int dbMount(struct inode *ipbmap)
struct bmap *bmp;
struct dbmap_disk *dbmp_le;
struct metapage *mp;
- int i;
+ int i, err;
/*
* allocate/initialize the in-memory bmap descriptor
@@ -170,30 +170,53 @@ int dbMount(struct inode *ipbmap)
BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
PSIZE, 0);
if (mp == NULL) {
- kfree(bmp);
- return -EIO;
+ err = -EIO;
+ goto err_kfree_bmp;
}
/* copy the on-disk bmap descriptor to its in-memory version. */
dbmp_le = (struct dbmap_disk *) mp->data;
bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
+
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
+ if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE ||
+ bmp->db_l2nbperpage < 0) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
if (!bmp->db_numag) {
- release_metapage(mp);
- kfree(bmp);
- return -EINVAL;
+ err = -EINVAL;
+ goto err_release_metapage;
}
bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
+ if (bmp->db_maxag >= MAXAG || bmp->db_maxag < 0 ||
+ bmp->db_agpref >= MAXAG || bmp->db_agpref < 0) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight);
bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
+ if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG ||
+ bmp->db_agl2size < 0) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
+ if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) {
+ err = -EINVAL;
+ goto err_release_metapage;
+ }
+
for (i = 0; i < MAXAG; i++)
bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
@@ -214,6 +237,12 @@ int dbMount(struct inode *ipbmap)
BMAP_LOCK_INIT(bmp);
return (0);
+
+err_release_metapage:
+ release_metapage(mp);
+err_kfree_bmp:
+ kfree(bmp);
+ return err;
}
@@ -247,6 +276,7 @@ int dbUnmount(struct inode *ipbmap, int mounterror)
/* free the memory for the in-memory bmap. */
kfree(bmp);
+ JFS_SBI(ipbmap->i_sb)->bmap = NULL;
return (0);
}
@@ -1755,7 +1785,7 @@ static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
* dbFindLeaf() returns the index of the leaf at which
* free space was found.
*/
- rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx);
+ rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx, true);
/* release the buffer.
*/
@@ -2002,9 +2032,12 @@ dbAllocDmapLev(struct bmap * bmp,
* free space. if sufficient free space is found, dbFindLeaf()
* returns the index of the leaf at which free space was found.
*/
- if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx))
+ if (dbFindLeaf((dmtree_t *) &dp->tree, l2nb, &leafidx, false))
return -ENOSPC;
+ if (leafidx < 0)
+ return -EIO;
+
/* determine the block number within the file system corresponding
* to the leaf at which free space was found.
*/
@@ -2138,7 +2171,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
* system.
*/
if (dp->tree.stree[word] == NOFREE)
- dbBackSplit((dmtree_t *) & dp->tree, word);
+ dbBackSplit((dmtree_t *)&dp->tree, word, false);
dbAllocBits(bmp, dp, blkno, nblocks);
}
@@ -2224,7 +2257,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
* the binary system of the leaves if need be.
*/
dbSplit(tp, word, BUDMIN,
- dbMaxBud((u8 *) & dp->wmap[word]));
+ dbMaxBud((u8 *)&dp->wmap[word]), false);
word += 1;
} else {
@@ -2264,7 +2297,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
* system of the leaves to reflect the current
* allocation (size).
*/
- dbSplit(tp, word, size, NOFREE);
+ dbSplit(tp, word, size, NOFREE, false);
/* get the number of dmap words handled */
nw = BUDSIZE(size, BUDMIN);
@@ -2371,7 +2404,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
/* update the leaf for this dmap word.
*/
rc = dbJoin(tp, word,
- dbMaxBud((u8 *) & dp->wmap[word]));
+ dbMaxBud((u8 *)&dp->wmap[word]), false);
if (rc)
return rc;
@@ -2404,7 +2437,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
/* update the leaf.
*/
- rc = dbJoin(tp, word, size);
+ rc = dbJoin(tp, word, size, false);
if (rc)
return rc;
@@ -2556,14 +2589,14 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
* that it is at the front of a binary buddy system.
*/
if (oldval == NOFREE) {
- rc = dbBackSplit((dmtree_t *) dcp, leafno);
+ rc = dbBackSplit((dmtree_t *)dcp, leafno, true);
if (rc)
return rc;
oldval = dcp->stree[ti];
}
- dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
+ dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval, true);
} else {
- rc = dbJoin((dmtree_t *) dcp, leafno, newval);
+ rc = dbJoin((dmtree_t *) dcp, leafno, newval, true);
if (rc)
return rc;
}
@@ -2592,7 +2625,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
*/
if (alloc) {
dbJoin((dmtree_t *) dcp, leafno,
- oldval);
+ oldval, true);
} else {
/* the dbJoin() above might have
* caused a larger binary buddy system
@@ -2602,9 +2635,9 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
*/
if (dcp->stree[ti] == NOFREE)
dbBackSplit((dmtree_t *)
- dcp, leafno);
+ dcp, leafno, true);
dbSplit((dmtree_t *) dcp, leafno,
- dcp->budmin, oldval);
+ dcp->budmin, oldval, true);
}
/* release the buffer and return the error.
@@ -2652,7 +2685,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
*
* serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
*/
-static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
+static void dbSplit(dmtree_t *tp, int leafno, int splitsz, int newval, bool is_ctl)
{
int budsz;
int cursz;
@@ -2674,7 +2707,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
while (cursz >= splitsz) {
/* update the buddy's leaf with its new value.
*/
- dbAdjTree(tp, leafno ^ budsz, cursz);
+ dbAdjTree(tp, leafno ^ budsz, cursz, is_ctl);
/* on to the next size and buddy.
*/
@@ -2686,7 +2719,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
/* adjust the dmap tree to reflect the specified leaf's new
* value.
*/
- dbAdjTree(tp, leafno, newval);
+ dbAdjTree(tp, leafno, newval, is_ctl);
}
@@ -2717,7 +2750,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
*
* serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
*/
-static int dbBackSplit(dmtree_t * tp, int leafno)
+static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl)
{
int budsz, bud, w, bsz, size;
int cursz;
@@ -2768,7 +2801,7 @@ static int dbBackSplit(dmtree_t * tp, int leafno)
* system in two.
*/
cursz = leaf[bud] - 1;
- dbSplit(tp, bud, cursz, cursz);
+ dbSplit(tp, bud, cursz, cursz, is_ctl);
break;
}
}
@@ -2796,7 +2829,7 @@ static int dbBackSplit(dmtree_t * tp, int leafno)
*
* RETURN VALUES: none
*/
-static int dbJoin(dmtree_t * tp, int leafno, int newval)
+static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl)
{
int budsz, buddy;
s8 *leaf;
@@ -2851,12 +2884,12 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
if (leafno < buddy) {
/* leafno is the left buddy.
*/
- dbAdjTree(tp, buddy, NOFREE);
+ dbAdjTree(tp, buddy, NOFREE, is_ctl);
} else {
/* buddy is the left buddy and becomes
* leafno.
*/
- dbAdjTree(tp, leafno, NOFREE);
+ dbAdjTree(tp, leafno, NOFREE, is_ctl);
leafno = buddy;
}
@@ -2869,7 +2902,7 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
/* update the leaf value.
*/
- dbAdjTree(tp, leafno, newval);
+ dbAdjTree(tp, leafno, newval, is_ctl);
return 0;
}
@@ -2890,15 +2923,20 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
*
* RETURN VALUES: none
*/
-static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
+static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl)
{
int lp, pp, k;
- int max;
+ int max, size;
+
+ size = is_ctl ? CTLTREESIZE : TREESIZE;
/* pick up the index of the leaf for this leafno.
*/
lp = leafno + le32_to_cpu(tp->dmt_leafidx);
+ if (WARN_ON_ONCE(lp >= size || lp < 0))
+ return;
+
/* is the current value the same as the old value ? if so,
* there is nothing to do.
*/
@@ -2959,14 +2997,18 @@ static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
* leafidx - return pointer to be set to the index of the leaf
* describing at least l2nb free blocks if sufficient
* free blocks are found.
+ * is_ctl - determines if the tree is of type ctl
*
* RETURN VALUES:
* 0 - success
* -ENOSPC - insufficient free blocks.
*/
-static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
+static int dbFindLeaf(dmtree_t *tp, int l2nb, int *leafidx, bool is_ctl)
{
int ti, n = 0, k, x = 0;
+ int max_size;
+
+ max_size = is_ctl ? CTLTREESIZE : TREESIZE;
/* first check the root of the tree to see if there is
* sufficient free space.
@@ -2987,6 +3029,8 @@ static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
/* sufficient free space found. move to the next
* level (or quit if this is the last level).
*/
+ if (x + n > max_size)
+ return -ENOSPC;
if (l2nb <= tp->dmt_stree[x + n])
break;
}
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 3acc954f7c04..077a87e53020 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -633,6 +633,11 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
index = base + (lim >> 1);
+ if (stbl[index] < 0) {
+ rc = -EIO;
+ goto out;
+ }
+
if (p->header.flag & BT_LEAF) {
/* uppercase leaf name to compare */
cmp =
@@ -1970,7 +1975,7 @@ static int dtSplitRoot(tid_t tid,
do {
f = &rp->slot[fsi];
fsi = f->next;
- } while (fsi != -1);
+ } while (fsi >= 0);
f->next = n;
}
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index f65bd6b35412..d4e063dbb9a0 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -508,6 +508,11 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
* blocks in the map. in that case, we'll start off with the
* maximum free.
*/
+
+ /* give up if no space left */
+ if (bmp->db_maxfreebud == -1)
+ return -ENOSPC;
+
max = (s64) 1 << bmp->db_maxfreebud;
if (*nblocks >= max && *nblocks > nbperpage)
nb = nblks = (max > nbperpage) ? max : nbperpage;
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index b5d702df7111..33ef13a0b110 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -122,7 +122,9 @@
#define NUM_INODE_PER_IAG INOSPERIAG
#define MINBLOCKSIZE 512
+#define L2MINBLOCKSIZE 9
#define MAXBLOCKSIZE 4096
+#define L2MAXBLOCKSIZE 12
#define MAXFILESIZE ((s64)1 << 52)
#define JFS_LINK_MAX 0xffffffff
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 937ca07b58b1..b0965f3ef186 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -195,6 +195,7 @@ int diUnmount(struct inode *ipimap, int mounterror)
* free in-memory control structure
*/
kfree(imap);
+ JFS_IP(ipimap)->i_imap = NULL;
return (0);
}
@@ -1321,7 +1322,7 @@ diInitInode(struct inode *ip, int iagno, int ino, int extno, struct iag * iagp)
int diAlloc(struct inode *pip, bool dir, struct inode *ip)
{
int rc, ino, iagno, addext, extno, bitno, sword;
- int nwords, rem, i, agno;
+ int nwords, rem, i, agno, dn_numag;
u32 mask, inosmap, extsmap;
struct inode *ipimap;
struct metapage *mp;
@@ -1357,6 +1358,9 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
/* get the ag number of this iag */
agno = BLKTOAG(JFS_IP(pip)->agstart, JFS_SBI(pip->i_sb));
+ dn_numag = JFS_SBI(pip->i_sb)->bmap->db_numag;
+ if (agno < 0 || agno > dn_numag)
+ return -EIO;
if (atomic_read(&JFS_SBI(pip->i_sb)->bmap->db_active[agno])) {
/*
@@ -2177,6 +2181,9 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
/* get the ag and iag numbers for this iag.
*/
agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
+ if (agno >= MAXAG || agno < 0)
+ return -EIO;
+
iagno = le32_to_cpu(iagp->iagnum);
/* check if this is the last free extent within the
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index d41733540df9..459324f3570a 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -171,15 +171,15 @@ int jfs_mount(struct super_block *sb)
}
jfs_info("jfs_mount: ipimap:0x%p", ipimap);
- /* map further access of per fileset inodes by the fileset inode */
- sbi->ipimap = ipimap;
-
/* initialize fileset inode allocation map */
if ((rc = diMount(ipimap))) {
jfs_err("jfs_mount: diMount failed w/rc = %d", rc);
goto err_ipimap;
}
+ /* map further access of per fileset inodes by the fileset inode */
+ sbi->ipimap = ipimap;
+
return rc;
/*
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index c8ce7f1bc594..6f6a5b9203d3 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -354,6 +354,11 @@ tid_t txBegin(struct super_block *sb, int flag)
jfs_info("txBegin: flag = 0x%x", flag);
log = JFS_SBI(sb)->log;
+ if (!log) {
+ jfs_error(sb, "read-only filesystem\n");
+ return 0;
+ }
+
TXN_LOCK();
INCREMENT(TxStat.txBegin);
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 7a55d14cc1af..f155ad6650bd 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -798,6 +798,11 @@ static int jfs_link(struct dentry *old_dentry,
if (rc)
goto out;
+ if (isReadOnly(ip)) {
+ jfs_error(ip->i_sb, "read-only filesystem\n");
+ return -EROFS;
+ }
+
tid = txBegin(ip->i_sb, 0);
mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 99ee657596b5..d3a602ea795b 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -702,6 +702,18 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
{
struct kernfs_node *kn;
+ if (parent->mode & S_ISGID) {
+ /* this code block imitates inode_init_owner() for
+ * kernfs
+ */
+
+ if (parent->iattr)
+ gid = parent->iattr->ia_gid;
+
+ if (flags & KERNFS_DIR)
+ mode |= S_ISGID;
+ }
+
kn = __kernfs_new_node(kernfs_root(parent), parent,
name, mode, uid, gid, flags);
if (kn) {
@@ -1515,8 +1527,11 @@ int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name,
mutex_lock(&kernfs_mutex);
kn = kernfs_find_ns(parent, name, ns);
- if (kn)
+ if (kn) {
+ kernfs_get(kn);
__kernfs_remove(kn);
+ kernfs_put(kn);
+ }
mutex_unlock(&kernfs_mutex);
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 6c12fac2c287..d62cec6d838d 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -200,7 +200,7 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn,
dput(dentry);
return ERR_PTR(-EINVAL);
}
- dtmp = lookup_one_len_unlocked(kntmp->name, dentry,
+ dtmp = lookup_positive_unlocked(kntmp->name, dentry,
strlen(kntmp->name));
dput(dentry);
if (IS_ERR(dtmp))
diff --git a/fs/libfs.c b/fs/libfs.c
index 247b58a68240..e6f986da2a65 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -883,8 +883,8 @@ out:
EXPORT_SYMBOL_GPL(simple_attr_read);
/* interpret the buffer as a number to call the set function with */
-ssize_t simple_attr_write(struct file *file, const char __user *buf,
- size_t len, loff_t *ppos)
+static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos, bool is_signed)
{
struct simple_attr *attr;
unsigned long long val;
@@ -905,7 +905,10 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
goto out;
attr->set_buf[size] = '\0';
- ret = kstrtoull(attr->set_buf, 0, &val);
+ if (is_signed)
+ ret = kstrtoll(attr->set_buf, 0, &val);
+ else
+ ret = kstrtoull(attr->set_buf, 0, &val);
if (ret)
goto out;
ret = attr->set(attr->data, val);
@@ -915,8 +918,21 @@ out:
mutex_unlock(&attr->mutex);
return ret;
}
+
+ssize_t simple_attr_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return simple_attr_write_xsigned(file, buf, len, ppos, false);
+}
EXPORT_SYMBOL_GPL(simple_attr_write);
+ssize_t simple_attr_write_signed(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return simple_attr_write_xsigned(file, buf, len, ppos, true);
+}
+EXPORT_SYMBOL_GPL(simple_attr_write_signed);
+
/**
* generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
* @sb: filesystem to do the file handle conversion on
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 1eabd91870e6..d770a41f8569 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -276,6 +276,9 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
{
struct nsm_handle *new;
+ if (!hostname)
+ return NULL;
+
new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL);
if (unlikely(new == NULL))
return NULL;
diff --git a/fs/locks.c b/fs/locks.c
index b8a31c1c4fff..90f92784aa55 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1338,6 +1338,7 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
out:
spin_unlock(&ctx->flc_lock);
percpu_up_read(&file_rwsem);
+ trace_posix_lock_inode(inode, request, error);
/*
* Free any unused locks.
*/
@@ -1346,7 +1347,6 @@ static int posix_lock_inode(struct inode *inode, struct file_lock *request,
if (new_fl2)
locks_free_lock(new_fl2);
locks_dispose_list(&dispose);
- trace_posix_lock_inode(inode, request, error);
return error;
}
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 97c54d3a2227..95b047256d09 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -11,7 +11,7 @@
/*
* Mbcache is a simple key-value store. Keys need not be unique, however
* key-value pairs are expected to be unique (we use this fact in
- * mb_cache_entry_delete()).
+ * mb_cache_entry_delete_or_get()).
*
* Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
* Ext4 also uses it for deduplication of xattr values stored in inodes.
@@ -90,12 +90,19 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
return -ENOMEM;
INIT_LIST_HEAD(&entry->e_list);
- /* One ref for hash, one ref returned */
- atomic_set(&entry->e_refcnt, 1);
+ /*
+ * We create entry with two references. One reference is kept by the
+ * hash table, the other reference is used to protect us from
+ * mb_cache_entry_delete_or_get() until the entry is fully setup. This
+ * avoids nesting of cache->c_list_lock into hash table bit locks which
+ * is problematic for RT.
+ */
+ atomic_set(&entry->e_refcnt, 2);
entry->e_key = key;
entry->e_value = value;
- entry->e_reusable = reusable;
- entry->e_referenced = 0;
+ entry->e_flags = 0;
+ if (reusable)
+ set_bit(MBE_REUSABLE_B, &entry->e_flags);
head = mb_cache_entry_head(cache, key);
hlist_bl_lock(head);
hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
@@ -107,24 +114,41 @@ int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
}
hlist_bl_add_head(&entry->e_hash_list, head);
hlist_bl_unlock(head);
-
spin_lock(&cache->c_list_lock);
list_add_tail(&entry->e_list, &cache->c_list);
- /* Grab ref for LRU list */
- atomic_inc(&entry->e_refcnt);
cache->c_entry_count++;
spin_unlock(&cache->c_list_lock);
+ mb_cache_entry_put(cache, entry);
return 0;
}
EXPORT_SYMBOL(mb_cache_entry_create);
-void __mb_cache_entry_free(struct mb_cache_entry *entry)
+void __mb_cache_entry_free(struct mb_cache *cache, struct mb_cache_entry *entry)
{
+ struct hlist_bl_head *head;
+
+ head = mb_cache_entry_head(cache, entry->e_key);
+ hlist_bl_lock(head);
+ hlist_bl_del(&entry->e_hash_list);
+ hlist_bl_unlock(head);
kmem_cache_free(mb_entry_cache, entry);
}
EXPORT_SYMBOL(__mb_cache_entry_free);
+/*
+ * mb_cache_entry_wait_unused - wait to be the last user of the entry
+ *
+ * @entry - entry to work on
+ *
+ * Wait to be the last user of the entry.
+ */
+void mb_cache_entry_wait_unused(struct mb_cache_entry *entry)
+{
+ wait_var_event(&entry->e_refcnt, atomic_read(&entry->e_refcnt) <= 2);
+}
+EXPORT_SYMBOL(mb_cache_entry_wait_unused);
+
static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
struct mb_cache_entry *entry,
u32 key)
@@ -142,10 +166,10 @@ static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
while (node) {
entry = hlist_bl_entry(node, struct mb_cache_entry,
e_hash_list);
- if (entry->e_key == key && entry->e_reusable) {
- atomic_inc(&entry->e_refcnt);
+ if (entry->e_key == key &&
+ test_bit(MBE_REUSABLE_B, &entry->e_flags) &&
+ atomic_inc_not_zero(&entry->e_refcnt))
goto out;
- }
node = node->next;
}
entry = NULL;
@@ -205,10 +229,9 @@ struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
head = mb_cache_entry_head(cache, key);
hlist_bl_lock(head);
hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
- if (entry->e_key == key && entry->e_value == value) {
- atomic_inc(&entry->e_refcnt);
+ if (entry->e_key == key && entry->e_value == value &&
+ atomic_inc_not_zero(&entry->e_refcnt))
goto out;
- }
}
entry = NULL;
out:
@@ -217,7 +240,7 @@ out:
}
EXPORT_SYMBOL(mb_cache_entry_get);
-/* mb_cache_entry_delete - remove a cache entry
+/* mb_cache_entry_delete - try to remove a cache entry
* @cache - cache we work with
* @key - key
* @value - value
@@ -254,6 +277,43 @@ void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
}
EXPORT_SYMBOL(mb_cache_entry_delete);
+/* mb_cache_entry_delete_or_get - remove a cache entry if it has no users
+ * @cache - cache we work with
+ * @key - key
+ * @value - value
+ *
+ * Remove entry from cache @cache with key @key and value @value. The removal
+ * happens only if the entry is unused. The function returns NULL in case the
+ * entry was successfully removed or there's no entry in cache. Otherwise the
+ * function grabs reference of the entry that we failed to delete because it
+ * still has users and return it.
+ */
+struct mb_cache_entry *mb_cache_entry_delete_or_get(struct mb_cache *cache,
+ u32 key, u64 value)
+{
+ struct mb_cache_entry *entry;
+
+ entry = mb_cache_entry_get(cache, key, value);
+ if (!entry)
+ return NULL;
+
+ /*
+ * Drop the ref we got from mb_cache_entry_get() and the initial hash
+ * ref if we are the last user
+ */
+ if (atomic_cmpxchg(&entry->e_refcnt, 2, 0) != 2)
+ return entry;
+
+ spin_lock(&cache->c_list_lock);
+ if (!list_empty(&entry->e_list))
+ list_del_init(&entry->e_list);
+ cache->c_entry_count--;
+ spin_unlock(&cache->c_list_lock);
+ __mb_cache_entry_free(cache, entry);
+ return NULL;
+}
+EXPORT_SYMBOL(mb_cache_entry_delete_or_get);
+
/* mb_cache_entry_touch - cache entry got used
* @cache - cache the entry belongs to
* @entry - entry that got used
@@ -263,7 +323,7 @@ EXPORT_SYMBOL(mb_cache_entry_delete);
void mb_cache_entry_touch(struct mb_cache *cache,
struct mb_cache_entry *entry)
{
- entry->e_referenced = 1;
+ set_bit(MBE_REFERENCED_B, &entry->e_flags);
}
EXPORT_SYMBOL(mb_cache_entry_touch);
@@ -281,34 +341,24 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache,
unsigned long nr_to_scan)
{
struct mb_cache_entry *entry;
- struct hlist_bl_head *head;
unsigned long shrunk = 0;
spin_lock(&cache->c_list_lock);
while (nr_to_scan-- && !list_empty(&cache->c_list)) {
entry = list_first_entry(&cache->c_list,
struct mb_cache_entry, e_list);
- if (entry->e_referenced) {
- entry->e_referenced = 0;
+ /* Drop initial hash reference if there is no user */
+ if (test_bit(MBE_REFERENCED_B, &entry->e_flags) ||
+ atomic_cmpxchg(&entry->e_refcnt, 1, 0) != 1) {
+ clear_bit(MBE_REFERENCED_B, &entry->e_flags);
list_move_tail(&entry->e_list, &cache->c_list);
continue;
}
list_del_init(&entry->e_list);
cache->c_entry_count--;
- /*
- * We keep LRU list reference so that entry doesn't go away
- * from under us.
- */
spin_unlock(&cache->c_list_lock);
- head = mb_cache_entry_head(cache, entry->e_key);
- hlist_bl_lock(head);
- if (!hlist_bl_unhashed(&entry->e_hash_list)) {
- hlist_bl_del_init(&entry->e_hash_list);
- atomic_dec(&entry->e_refcnt);
- }
- hlist_bl_unlock(head);
- if (mb_cache_entry_put(cache, entry))
- shrunk++;
+ __mb_cache_entry_free(cache, entry);
+ shrunk++;
cond_resched();
spin_lock(&cache->c_list_lock);
}
@@ -400,11 +450,6 @@ void mb_cache_destroy(struct mb_cache *cache)
* point.
*/
list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
- if (!hlist_bl_unhashed(&entry->e_hash_list)) {
- hlist_bl_del_init(&entry->e_hash_list);
- atomic_dec(&entry->e_refcnt);
- } else
- WARN_ON(1);
list_del(&entry->e_list);
WARN_ON(atomic_read(&entry->e_refcnt) != 1);
mb_cache_entry_put(cache, entry);
diff --git a/fs/namei.c b/fs/namei.c
index b952ecbd49c2..a4cba6991a4d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -52,8 +52,8 @@
* The new code replaces the old recursive symlink resolution with
* an iterative one (in case of non-nested symlink chains). It does
* this with calls to <fs>_follow_link().
- * As a side effect, dir_namei(), _namei() and follow_link() are now
- * replaced with a single function lookup_dentry() that can handle all
+ * As a side effect, dir_namei(), _namei() and follow_link() are now
+ * replaced with a single function lookup_dentry() that can handle all
* the special cases of the former code.
*
* With the new dcache, the pathname is stored at each inode, at least as
@@ -2565,6 +2565,26 @@ struct dentry *lookup_one_len_unlocked(const char *name,
}
EXPORT_SYMBOL(lookup_one_len_unlocked);
+/*
+ * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT)
+ * on negatives. Returns known positive or ERR_PTR(); that's what
+ * most of the users want. Note that pinned negative with unlocked parent
+ * _can_ become positive at any time, so callers of lookup_one_len_unlocked()
+ * need to be very careful; pinned positives have ->d_inode stable, so
+ * this one avoids such problems.
+ */
+struct dentry *lookup_positive_unlocked(const char *name,
+ struct dentry *base, int len)
+{
+ struct dentry *ret = lookup_one_len_unlocked(name, base, len);
+ if (!IS_ERR(ret) && d_is_negative(ret)) {
+ dput(ret);
+ ret = ERR_PTR(-ENOENT);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(lookup_positive_unlocked);
+
#ifdef CONFIG_UNIX98_PTYS
int path_pts(struct path *path)
{
@@ -2583,7 +2603,7 @@ int path_pts(struct path *path)
this.name = "pts";
this.len = 3;
child = d_hash_and_lookup(parent, &this);
- if (!child)
+ if (IS_ERR_OR_NULL(child))
return -ENOENT;
path->dentry = child;
@@ -2859,20 +2879,14 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
p = d_ancestor(p2, p1);
if (p) {
inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
- inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
+ inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2);
return p;
}
p = d_ancestor(p1, p2);
- if (p) {
- inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
- inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
- return p;
- }
-
inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
- return NULL;
+ return p;
}
EXPORT_SYMBOL(lock_rename);
@@ -2886,6 +2900,63 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
}
EXPORT_SYMBOL(unlock_rename);
+/**
+ * mode_strip_umask - handle vfs umask stripping
+ * @dir: parent directory of the new inode
+ * @mode: mode of the new inode to be created in @dir
+ *
+ * Umask stripping depends on whether or not the filesystem supports POSIX
+ * ACLs. If the filesystem doesn't support it umask stripping is done directly
+ * in here. If the filesystem does support POSIX ACLs umask stripping is
+ * deferred until the filesystem calls posix_acl_create().
+ *
+ * Returns: mode
+ */
+static inline umode_t mode_strip_umask(const struct inode *dir, umode_t mode)
+{
+ if (!IS_POSIXACL(dir))
+ mode &= ~current_umask();
+ return mode;
+}
+
+/**
+ * vfs_prepare_mode - prepare the mode to be used for a new inode
+ * @dir: parent directory of the new inode
+ * @mode: mode of the new inode
+ * @mask_perms: allowed permission by the vfs
+ * @type: type of file to be created
+ *
+ * This helper consolidates and enforces vfs restrictions on the @mode of a new
+ * object to be created.
+ *
+ * Umask stripping depends on whether the filesystem supports POSIX ACLs (see
+ * the kernel documentation for mode_strip_umask()). Moving umask stripping
+ * after setgid stripping allows the same ordering for both non-POSIX ACL and
+ * POSIX ACL supporting filesystems.
+ *
+ * Note that it's currently valid for @type to be 0 if a directory is created.
+ * Filesystems raise that flag individually and we need to check whether each
+ * filesystem can deal with receiving S_IFDIR from the vfs before we enforce a
+ * non-zero type.
+ *
+ * Returns: mode to be passed to the filesystem
+ */
+static inline umode_t vfs_prepare_mode(const struct inode *dir, umode_t mode,
+ umode_t mask_perms, umode_t type)
+{
+ mode = mode_strip_sgid(dir, mode);
+ mode = mode_strip_umask(dir, mode);
+
+ /*
+ * Apply the vfs mandated allowed permission mask and set the type of
+ * file to be created before we call into the filesystem.
+ */
+ mode &= (mask_perms & ~S_IFMT);
+ mode |= (type & S_IFMT);
+
+ return mode;
+}
+
int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
bool want_excl)
{
@@ -2895,8 +2966,8 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
if (!dir->i_op->create)
return -EACCES; /* shouldn't it be ENOSYS? */
- mode &= S_IALLUGO;
- mode |= S_IFREG;
+
+ mode = vfs_prepare_mode(dir, mode, S_IALLUGO, S_IFREG);
error = security_inode_create(dir, dentry, mode);
if (error)
return error;
@@ -3166,8 +3237,7 @@ static int lookup_open(struct nameidata *nd, struct path *path,
* O_EXCL open we want to return EEXIST not EROFS).
*/
if (open_flag & O_CREAT) {
- if (!IS_POSIXACL(dir->d_inode))
- mode &= ~current_umask();
+ mode = vfs_prepare_mode(dir->d_inode, mode, mode, mode);
if (unlikely(!got_write)) {
create_error = -EROFS;
open_flag &= ~O_CREAT;
@@ -3443,6 +3513,7 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
child = d_alloc(dentry, &slash_name);
if (unlikely(!child))
goto out_err;
+ mode = vfs_prepare_mode(dir, mode, mode, mode);
error = dir->i_op->tmpfile(dir, child, mode);
if (error)
goto out_err;
@@ -3701,6 +3772,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
if (!dir->i_op->mknod)
return -EPERM;
+ mode = vfs_prepare_mode(dir, mode, mode, mode);
error = devcgroup_inode_mknod(mode, dev);
if (error)
return error;
@@ -3749,9 +3821,8 @@ retry:
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
- error = security_path_mknod(&path, dentry, mode, dev);
+ error = security_path_mknod(&path, dentry,
+ mode_strip_umask(path.dentry->d_inode, mode), dev);
if (error)
goto out;
switch (mode & S_IFMT) {
@@ -3799,7 +3870,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
if (!dir->i_op->mkdir)
return -EPERM;
- mode &= (S_IRWXUGO|S_ISVTX);
+ mode = vfs_prepare_mode(dir, mode, S_IRWXUGO | S_ISVTX, 0);
error = security_inode_mkdir(dir, dentry, mode);
if (error)
return error;
@@ -3826,9 +3897,8 @@ retry:
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- if (!IS_POSIXACL(path.dentry->d_inode))
- mode &= ~current_umask();
- error = security_path_mkdir(&path, dentry, mode);
+ error = security_path_mkdir(&path, dentry,
+ mode_strip_umask(path.dentry->d_inode, mode));
if (!error)
error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
done_path_create(&path, dentry);
@@ -4361,11 +4431,12 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
*
* a) we can get into loop creation.
* b) race potential - two innocent renames can create a loop together.
- * That's where 4.4 screws up. Current fix: serialization on
+ * That's where 4.4BSD screws up. Current fix: serialization on
* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
* story.
- * c) we have to lock _four_ objects - parents and victim (if it exists),
- * and source (if it is not a directory).
+ * c) we may have to lock up to _four_ objects - parents and victim (if it exists),
+ * and source (if it's a non-directory or a subdirectory that moves to
+ * different parent).
* And that - after we got ->i_mutex on parents (until then we don't know
* whether the target exists). Solution: try to be smart with locking
* order for inodes. We rely on the fact that tree topology may change
@@ -4394,6 +4465,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
bool new_is_dir = false;
unsigned max_links = new_dir->i_sb->s_max_links;
struct name_snapshot old_name;
+ bool lock_old_subdir, lock_new_subdir;
if (source == target)
return 0;
@@ -4442,10 +4514,33 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
take_dentry_name_snapshot(&old_name, old_dentry);
dget(new_dentry);
- if (!is_dir || (flags & RENAME_EXCHANGE))
+ /*
+ * Lock children.
+ * The source subdirectory needs to be locked on cross-directory
+ * rename or cross-directory exchange since its parent changes.
+ * The target subdirectory needs to be locked on cross-directory
+ * exchange due to parent change and on any rename due to becoming
+ * a victim.
+ * Non-directories need locking in all cases (for NFS reasons);
+ * they get locked after any subdirectories (in inode address order).
+ *
+ * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
+ * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
+ */
+ lock_old_subdir = new_dir != old_dir;
+ lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE);
+ if (is_dir) {
+ if (lock_old_subdir)
+ inode_lock_nested(source, I_MUTEX_CHILD);
+ if (target && (!new_is_dir || lock_new_subdir))
+ inode_lock(target);
+ } else if (new_is_dir) {
+ if (lock_new_subdir)
+ inode_lock_nested(target, I_MUTEX_CHILD);
+ inode_lock(source);
+ } else {
lock_two_nondirectories(source, target);
- else if (target)
- inode_lock(target);
+ }
error = -EBUSY;
if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
@@ -4489,9 +4584,9 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
d_exchange(old_dentry, new_dentry);
}
out:
- if (!is_dir || (flags & RENAME_EXCHANGE))
- unlock_two_nondirectories(source, target);
- else if (target)
+ if (!is_dir || lock_old_subdir)
+ inode_unlock(source);
+ if (target && (!new_is_dir || lock_new_subdir))
inode_unlock(target);
dput(new_dentry);
if (!error) {
@@ -4817,7 +4912,7 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
int err;
unsigned int flags = 0;
if (nofs)
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 690221747b47..9f10b90debec 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -604,6 +604,8 @@ retry:
nfs4_delete_deviceid(node->ld, node->nfs_client, id);
goto retry;
}
+
+ nfs4_put_deviceid_node(node);
return ERR_PTR(-ENODEV);
}
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index dec5880ac6de..6e3a14fdff9c 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -422,7 +422,7 @@ bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
int ret, i;
d->children = kcalloc(v->concat.volumes_count,
- sizeof(struct pnfs_block_dev), GFP_KERNEL);
+ sizeof(struct pnfs_block_dev), gfp_mask);
if (!d->children)
return -ENOMEM;
@@ -451,7 +451,7 @@ bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
int ret, i;
d->children = kcalloc(v->stripe.volumes_count,
- sizeof(struct pnfs_block_dev), GFP_KERNEL);
+ sizeof(struct pnfs_block_dev), gfp_mask);
if (!d->children)
return -ENOMEM;
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 98b74cdabb99..8c72718d9fe0 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -837,6 +837,12 @@ filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
return &fl->generic_hdr;
}
+static bool
+filelayout_lseg_is_striped(const struct nfs4_filelayout_segment *flseg)
+{
+ return flseg->num_fh > 1;
+}
+
/*
* filelayout_pg_test(). Called by nfs_can_coalesce_requests()
*
@@ -857,6 +863,8 @@ filelayout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
size = pnfs_generic_pg_test(pgio, prev, req);
if (!size)
return 0;
+ else if (!filelayout_lseg_is_striped(FILELAYOUT_LSEG(pgio->pg_lseg)))
+ return size;
/* see if req and prev are in the same stripe */
if (prev) {
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index fa1c920afb49..1b88b78f40be 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1280,6 +1280,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
case -EPFNOSUPPORT:
case -EPROTONOSUPPORT:
case -EOPNOTSUPP:
+ case -EINVAL:
case -ECONNREFUSED:
case -ECONNRESET:
case -EHOSTDOWN:
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 53f3374acd0d..d4453d97c789 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -414,7 +414,9 @@ extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
extern bool nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);
-
+extern int nfs_client_for_each_server(struct nfs_client *clp,
+ int (*fn)(struct nfs_server *, void *),
+ void *data);
/* io.c */
extern void nfs_start_io_read(struct inode *inode);
extern void nfs_end_io_read(struct inode *inode);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index af557dc2cfe1..6b783e2d2855 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -953,7 +953,7 @@ int nfs2_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
error = decode_filename_inline(xdr, &entry->name, &entry->len);
if (unlikely(error))
- return -EAGAIN;
+ return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN;
/*
* The type (size and byte order) of nfscookie isn't defined in
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 84369d51353a..6d8768ce370d 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1991,7 +1991,7 @@ int nfs3_decode_dirent(struct xdr_stream *xdr, struct nfs_entry *entry,
error = decode_inline_filename3(xdr, &entry->name, &entry->len);
if (unlikely(error))
- return -EAGAIN;
+ return error == -ENAMETOOLONG ? -ENAMETOOLONG : -EAGAIN;
error = decode_cookie3(xdr, &new_cookie);
if (unlikely(error))
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 3671a51fe5eb..1f4bdcda3fda 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -346,6 +346,7 @@ int nfs40_init_client(struct nfs_client *clp)
ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE,
"NFSv4.0 transport Slot table");
if (ret) {
+ nfs4_shutdown_slot_table(tbl);
kfree(tbl);
return ret;
}
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 1e7296395d71..4e6dd267ac50 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -560,22 +560,20 @@ nfs_idmap_prepare_pipe_upcall(struct idmap *idmap,
return true;
}
-static void
-nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
+static void nfs_idmap_complete_pipe_upcall(struct idmap_legacy_upcalldata *data,
+ int ret)
{
- struct key *authkey = idmap->idmap_upcall_data->authkey;
-
- kfree(idmap->idmap_upcall_data);
- idmap->idmap_upcall_data = NULL;
- complete_request_key(authkey, ret);
- key_put(authkey);
+ complete_request_key(data->authkey, ret);
+ key_put(data->authkey);
+ kfree(data);
}
-static void
-nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)
+static void nfs_idmap_abort_pipe_upcall(struct idmap *idmap,
+ struct idmap_legacy_upcalldata *data,
+ int ret)
{
- if (idmap->idmap_upcall_data != NULL)
- nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
+ if (cmpxchg(&idmap->idmap_upcall_data, data, NULL) == data)
+ nfs_idmap_complete_pipe_upcall(data, ret);
}
static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
@@ -612,7 +610,7 @@ static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
if (ret < 0)
- nfs_idmap_abort_pipe_upcall(idmap, ret);
+ nfs_idmap_abort_pipe_upcall(idmap, data, ret);
return ret;
out2:
@@ -668,6 +666,7 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
struct request_key_auth *rka;
struct rpc_inode *rpci = RPC_I(file_inode(filp));
struct idmap *idmap = (struct idmap *)rpci->private;
+ struct idmap_legacy_upcalldata *data;
struct key *authkey;
struct idmap_msg im;
size_t namelen_in;
@@ -677,10 +676,11 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
* will have been woken up and someone else may now have used
* idmap_key_cons - so after this point we may no longer touch it.
*/
- if (idmap->idmap_upcall_data == NULL)
+ data = xchg(&idmap->idmap_upcall_data, NULL);
+ if (data == NULL)
goto out_noupcall;
- authkey = idmap->idmap_upcall_data->authkey;
+ authkey = data->authkey;
rka = get_request_key_auth(authkey);
if (mlen != sizeof(im)) {
@@ -702,18 +702,17 @@ idmap_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) {
ret = -EINVAL;
goto out;
-}
+ }
- ret = nfs_idmap_read_and_verify_message(&im,
- &idmap->idmap_upcall_data->idmap_msg,
- rka->target_key, authkey);
+ ret = nfs_idmap_read_and_verify_message(&im, &data->idmap_msg,
+ rka->target_key, authkey);
if (ret >= 0) {
key_set_timeout(rka->target_key, nfs_idmap_cache_timeout);
ret = mlen;
}
out:
- nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
+ nfs_idmap_complete_pipe_upcall(data, ret);
out_noupcall:
return ret;
}
@@ -727,7 +726,7 @@ idmap_pipe_destroy_msg(struct rpc_pipe_msg *msg)
struct idmap *idmap = data->idmap;
if (msg->errno)
- nfs_idmap_abort_pipe_upcall(idmap, msg->errno);
+ nfs_idmap_abort_pipe_upcall(idmap, data, msg->errno);
}
static void
@@ -735,8 +734,11 @@ idmap_release_pipe(struct inode *inode)
{
struct rpc_inode *rpci = RPC_I(inode);
struct idmap *idmap = (struct idmap *)rpci->private;
+ struct idmap_legacy_upcalldata *data;
- nfs_idmap_abort_pipe_upcall(idmap, -EPIPE);
+ data = xchg(&idmap->idmap_upcall_data, NULL);
+ if (data)
+ nfs_idmap_complete_pipe_upcall(data, -EPIPE);
}
int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ba4a03a69fbf..31503bade335 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -121,6 +121,11 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
if (nfs_server_capable(dir, NFS_CAP_SECURITY_LABEL) == 0)
return NULL;
+ label->lfs = 0;
+ label->pi = 0;
+ label->len = 0;
+ label->label = NULL;
+
err = security_dentry_init_security(dentry, sattr->ia_mode,
&dentry->d_name, (void **)&label->label, &label->len);
if (err == 0)
@@ -163,6 +168,7 @@ static int nfs4_map_errors(int err)
case -NFS4ERR_RESOURCE:
case -NFS4ERR_LAYOUTTRYLATER:
case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
return -EREMOTEIO;
case -NFS4ERR_WRONGSEC:
case -NFS4ERR_WRONG_CRED:
@@ -547,6 +553,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
case -NFS4ERR_GRACE:
case -NFS4ERR_LAYOUTTRYLATER:
case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
exception->delay = 1;
return 0;
@@ -779,10 +786,9 @@ static void nfs4_slot_sequence_record_sent(struct nfs4_slot *slot,
if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0)
slot->seq_nr_highest_sent = seqnr;
}
-static void nfs4_slot_sequence_acked(struct nfs4_slot *slot,
- u32 seqnr)
+static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, u32 seqnr)
{
- slot->seq_nr_highest_sent = seqnr;
+ nfs4_slot_sequence_record_sent(slot, seqnr);
slot->seq_nr_last_acked = seqnr;
}
@@ -849,7 +855,6 @@ static int nfs41_sequence_process(struct rpc_task *task,
__func__,
slot->slot_nr,
slot->seq_nr);
- nfs4_slot_sequence_acked(slot, slot->seq_nr);
goto out_retry;
case -NFS4ERR_RETRY_UNCACHED_REP:
case -NFS4ERR_SEQ_FALSE_RETRY:
@@ -912,6 +917,7 @@ out:
out_noaction:
return ret;
session_recover:
+ set_bit(NFS4_SLOT_TBL_DRAINING, &session->fc_slot_table.slot_tbl_state);
nfs4_schedule_session_recovery(session, status);
dprintk("%s ERROR: %d Reset session\n", __func__, status);
nfs41_sequence_free_slot(res);
@@ -1931,8 +1937,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
if (!data->rpc_done) {
if (data->rpc_status)
return ERR_PTR(data->rpc_status);
- /* cached opens have already been processed */
- goto update;
+ return nfs4_try_open_cached(data);
}
ret = nfs_refresh_inode(inode, &data->f_attr);
@@ -1941,7 +1946,7 @@ _nfs4_opendata_reclaim_to_nfs4_state(struct nfs4_opendata *data)
if (data->o_res.delegation_type != 0)
nfs4_opendata_check_deleg(data, state);
-update:
+
if (!update_open_stateid(state, &data->o_res.stateid,
NULL, data->o_arg.fmode))
return ERR_PTR(-EAGAIN);
@@ -2082,18 +2087,18 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
}
static int nfs4_open_recover_helper(struct nfs4_opendata *opendata,
- fmode_t fmode)
+ fmode_t fmode)
{
struct nfs4_state *newstate;
+ struct nfs_server *server = NFS_SB(opendata->dentry->d_sb);
+ int openflags = opendata->o_arg.open_flags;
int ret;
if (!nfs4_mode_match_open_stateid(opendata->state, fmode))
return 0;
- opendata->o_arg.open_flags = 0;
opendata->o_arg.fmode = fmode;
- opendata->o_arg.share_access = nfs4_map_atomic_open_share(
- NFS_SB(opendata->dentry->d_sb),
- fmode, 0);
+ opendata->o_arg.share_access =
+ nfs4_map_atomic_open_share(server, fmode, openflags);
memset(&opendata->o_res, 0, sizeof(opendata->o_res));
memset(&opendata->c_res, 0, sizeof(opendata->c_res));
nfs4_init_opendata_res(opendata);
@@ -2668,10 +2673,15 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s
struct nfs4_opendata *opendata;
int ret;
- opendata = nfs4_open_recoverdata_alloc(ctx, state,
- NFS4_OPEN_CLAIM_FH);
+ opendata = nfs4_open_recoverdata_alloc(ctx, state, NFS4_OPEN_CLAIM_FH);
if (IS_ERR(opendata))
return PTR_ERR(opendata);
+ /*
+ * We're not recovering a delegation, so ask for no delegation.
+ * Otherwise the recovery thread could deadlock with an outstanding
+ * delegation return.
+ */
+ opendata->o_arg.open_flags = O_DIRECT;
ret = nfs4_open_recover(opendata, state);
if (ret == -ESTALE)
d_drop(ctx->dentry);
@@ -3041,12 +3051,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
}
out:
- if (opendata->lgp) {
- nfs4_lgopen_release(opendata->lgp);
- opendata->lgp = NULL;
- }
- if (!opendata->cancelled)
+ if (!opendata->cancelled) {
+ if (opendata->lgp) {
+ nfs4_lgopen_release(opendata->lgp);
+ opendata->lgp = NULL;
+ }
nfs4_sequence_free_slot(&opendata->o_res.seq_res);
+ }
return ret;
}
@@ -3743,7 +3754,7 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx,
int open_flags, struct iattr *attr, int *opened)
{
struct nfs4_state *state;
- struct nfs4_label l = {0, 0, 0, NULL}, *label = NULL;
+ struct nfs4_label l, *label;
label = nfs4_label_init_security(dir, ctx->dentry, attr, &l);
@@ -4498,7 +4509,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
int flags)
{
struct nfs_server *server = NFS_SERVER(dir);
- struct nfs4_label l, *ilabel = NULL;
+ struct nfs4_label l, *ilabel;
struct nfs_open_context *ctx;
struct nfs4_state *state;
int status = 0;
@@ -4851,7 +4862,7 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
struct nfs4_exception exception = {
.interruptible = true,
};
- struct nfs4_label l, *label = NULL;
+ struct nfs4_label l, *label;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
@@ -4892,7 +4903,7 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
struct nfs4_exception exception = {
.interruptible = true,
};
- struct nfs4_label l, *label = NULL;
+ struct nfs4_label l, *label;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
@@ -5013,7 +5024,7 @@ static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry,
struct nfs4_exception exception = {
.interruptible = true,
};
- struct nfs4_label l, *label = NULL;
+ struct nfs4_label l, *label;
int err;
label = nfs4_label_init_security(dir, dentry, sattr, &l);
@@ -5360,7 +5371,7 @@ static void nfs4_proc_write_setup(struct nfs_pgio_header *hdr,
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_WRITE];
nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, 0);
- nfs4_state_protect_write(server->nfs_client, clnt, msg, hdr);
+ nfs4_state_protect_write(hdr->ds_clp ? hdr->ds_clp : server->nfs_client, clnt, msg, hdr);
}
static void nfs4_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commit_data *data)
@@ -5401,7 +5412,8 @@ static void nfs4_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess
data->res.server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1, 0);
- nfs4_state_protect(server->nfs_client, NFS_SP4_MACH_CRED_COMMIT, clnt, msg);
+ nfs4_state_protect(data->ds_clp ? data->ds_clp : server->nfs_client,
+ NFS_SP4_MACH_CRED_COMMIT, clnt, msg);
}
static int _nfs4_proc_commit(struct file *dst, struct nfs_commitargs *args,
@@ -6855,6 +6867,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
{
struct nfs4_lockdata *data = calldata;
struct nfs4_lock_state *lsp = data->lsp;
+ struct nfs_server *server = NFS_SERVER(d_inode(data->ctx->dentry));
dprintk("%s: begin!\n", __func__);
@@ -6864,8 +6877,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
data->rpc_status = task->tk_status;
switch (task->tk_status) {
case 0:
- renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
- data->timestamp);
+ renew_lease(server, data->timestamp);
if (data->arg.new_lock && !data->cancelled) {
data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0)
@@ -6878,14 +6890,23 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
} else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid))
goto out_restart;
break;
- case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_OLD_STATEID:
+ if (data->arg.new_lock_owner != 0 &&
+ nfs4_refresh_open_old_stateid(&data->arg.open_stateid,
+ lsp->ls_state))
+ goto out_restart;
+ if (nfs4_refresh_lock_old_stateid(&data->arg.lock_stateid, lsp))
+ goto out_restart;
+ fallthrough;
+ case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
if (data->arg.new_lock_owner != 0) {
if (!nfs4_stateid_match(&data->arg.open_stateid,
&lsp->ls_state->open_stateid))
goto out_restart;
+ else if (nfs4_async_handle_error(task, server, lsp->ls_state, NULL) == -EAGAIN)
+ goto out_restart;
} else if (!nfs4_stateid_match(&data->arg.lock_stateid,
&lsp->ls_stateid))
goto out_restart;
@@ -8988,6 +9009,9 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf
rpc_delay(task, NFS4_POLL_RETRY_MAX);
/* fall through */
case -NFS4ERR_RETRY_UNCACHED_REP:
+ case -EACCES:
+ dprintk("%s: failed to reclaim complete error %d for server %s, retrying\n",
+ __func__, task->tk_status, clp->cl_hostname);
return -EAGAIN;
case -NFS4ERR_BADSESSION:
case -NFS4ERR_DEADSESSION:
@@ -9137,6 +9161,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
status = -EBUSY;
break;
case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
status = -ERECALLCONFLICT;
break;
case -NFS4ERR_DELEG_REVOKED:
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 1d2b81a233bb..1aacb0aa07f0 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -61,11 +61,14 @@
#include "nfs4session.h"
#include "pnfs.h"
#include "netns.h"
+#include "nfs4trace.h"
#define NFSDBG_FACILITY NFSDBG_STATE
#define OPENOWNER_POOL_SIZE 8
+static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp);
+
const nfs4_stateid zero_stateid = {
{ .data = { 0 } },
.type = NFS4_SPECIAL_STATEID_TYPE,
@@ -329,6 +332,8 @@ do_confirm:
status = nfs4_proc_create_session(clp, cred);
if (status != 0)
goto out;
+ if (!(clp->cl_exchange_flags & EXCHGID4_FLAG_CONFIRMED_R))
+ nfs4_state_start_reclaim_reboot(clp);
nfs41_finish_session_reset(clp);
nfs_mark_client_ready(clp, NFS_CS_READY);
out:
@@ -1224,6 +1229,8 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
if (IS_ERR(task)) {
printk(KERN_ERR "%s: kthread_run: %ld\n",
__func__, PTR_ERR(task));
+ if (!nfs_client_init_is_complete(clp))
+ nfs_mark_client_ready(clp, PTR_ERR(task));
nfs4_clear_state_manager_bit(clp);
nfs_put_client(clp);
module_put(THIS_MODULE);
@@ -1743,6 +1750,7 @@ static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp,
static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
{
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
/* Mark all delegations for reclaim */
nfs_delegation_mark_reclaim(clp);
nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
@@ -2518,6 +2526,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
/* Ensure exclusive access to NFSv4 state */
do {
+ trace_nfs4_state_mgr(clp);
clear_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
section = "purge state";
@@ -2588,6 +2597,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
if (status < 0)
goto out_error;
nfs4_state_end_reclaim_reboot(clp);
+ continue;
}
/* Detect expired delegations... */
@@ -2613,6 +2623,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
+ if (test_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state) &&
+ !test_and_set_bit(NFS4CLNT_MANAGER_RUNNING,
+ &clp->cl_state)) {
+ memflags = memalloc_nofs_save();
+ continue;
+ }
+
if (!test_and_set_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state)) {
if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
nfs_client_return_marked_delegations(clp);
@@ -2633,6 +2650,7 @@ static void nfs4_state_manager(struct nfs_client *clp)
out_error:
if (strlen(section))
section_sep = ": ";
+ trace_nfs4_state_mgr_failed(clp, section, status);
pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s"
" with error %d\n", section_sep, section,
clp->cl_hostname, -status);
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 2295a934a154..010ee5e6fa32 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -563,6 +563,99 @@ TRACE_EVENT(nfs4_setup_sequence,
)
);
+TRACE_DEFINE_ENUM(NFS4CLNT_MANAGER_RUNNING);
+TRACE_DEFINE_ENUM(NFS4CLNT_CHECK_LEASE);
+TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_EXPIRED);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECLAIM_REBOOT);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECLAIM_NOGRACE);
+TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN);
+TRACE_DEFINE_ENUM(NFS4CLNT_SESSION_RESET);
+TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_CONFIRM);
+TRACE_DEFINE_ENUM(NFS4CLNT_SERVER_SCOPE_MISMATCH);
+TRACE_DEFINE_ENUM(NFS4CLNT_PURGE_STATE);
+TRACE_DEFINE_ENUM(NFS4CLNT_BIND_CONN_TO_SESSION);
+TRACE_DEFINE_ENUM(NFS4CLNT_MOVED);
+TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED);
+TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED);
+TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER);
+TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING);
+
+#define show_nfs4_clp_state(state) \
+ __print_flags(state, "|", \
+ { NFS4CLNT_MANAGER_RUNNING, "MANAGER_RUNNING" }, \
+ { NFS4CLNT_CHECK_LEASE, "CHECK_LEASE" }, \
+ { NFS4CLNT_LEASE_EXPIRED, "LEASE_EXPIRED" }, \
+ { NFS4CLNT_RECLAIM_REBOOT, "RECLAIM_REBOOT" }, \
+ { NFS4CLNT_RECLAIM_NOGRACE, "RECLAIM_NOGRACE" }, \
+ { NFS4CLNT_DELEGRETURN, "DELEGRETURN" }, \
+ { NFS4CLNT_SESSION_RESET, "SESSION_RESET" }, \
+ { NFS4CLNT_LEASE_CONFIRM, "LEASE_CONFIRM" }, \
+ { NFS4CLNT_SERVER_SCOPE_MISMATCH, \
+ "SERVER_SCOPE_MISMATCH" }, \
+ { NFS4CLNT_PURGE_STATE, "PURGE_STATE" }, \
+ { NFS4CLNT_BIND_CONN_TO_SESSION, \
+ "BIND_CONN_TO_SESSION" }, \
+ { NFS4CLNT_MOVED, "MOVED" }, \
+ { NFS4CLNT_LEASE_MOVED, "LEASE_MOVED" }, \
+ { NFS4CLNT_DELEGATION_EXPIRED, "DELEGATION_EXPIRED" }, \
+ { NFS4CLNT_RUN_MANAGER, "RUN_MANAGER" }, \
+ { NFS4CLNT_DELEGRETURN_RUNNING, "DELEGRETURN_RUNNING" })
+
+TRACE_EVENT(nfs4_state_mgr,
+ TP_PROTO(
+ const struct nfs_client *clp
+ ),
+
+ TP_ARGS(clp),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, state)
+ __string(hostname, clp->cl_hostname)
+ ),
+
+ TP_fast_assign(
+ __entry->state = clp->cl_state;
+ __assign_str(hostname, clp->cl_hostname)
+ ),
+
+ TP_printk(
+ "hostname=%s clp state=%s", __get_str(hostname),
+ show_nfs4_clp_state(__entry->state)
+ )
+)
+
+TRACE_EVENT(nfs4_state_mgr_failed,
+ TP_PROTO(
+ const struct nfs_client *clp,
+ const char *section,
+ int status
+ ),
+
+ TP_ARGS(clp, section, status),
+
+ TP_STRUCT__entry(
+ __field(unsigned long, error)
+ __field(unsigned long, state)
+ __string(hostname, clp->cl_hostname)
+ __string(section, section)
+ ),
+
+ TP_fast_assign(
+ __entry->error = status;
+ __entry->state = clp->cl_state;
+ __assign_str(hostname, clp->cl_hostname);
+ __assign_str(section, section);
+ ),
+
+ TP_printk(
+ "hostname=%s clp state=%s error=%ld (%s) section=%s",
+ __get_str(hostname),
+ show_nfs4_clp_state(__entry->state), -__entry->error,
+ show_nfsv4_errors(__entry->error), __get_str(section)
+
+ )
+)
+
TRACE_EVENT(nfs4_xdr_status,
TP_PROTO(
const struct xdr_stream *xdr,
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 2b7741fe42ea..a3592becae4a 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -4169,19 +4169,17 @@ static int decode_attr_security_label(struct xdr_stream *xdr, uint32_t *bitmap,
p = xdr_inline_decode(xdr, len);
if (unlikely(!p))
return -EIO;
+ bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
if (len < NFS4_MAXLABELLEN) {
- if (label) {
- if (label->len) {
- if (label->len < len)
- return -ERANGE;
- memcpy(label->label, p, len);
- }
+ if (label && label->len) {
+ if (label->len < len)
+ return -ERANGE;
+ memcpy(label->label, p, len);
label->len = len;
label->pi = pi;
label->lfs = lfs;
status = NFS_ATTR_FATTR_V4_SECURITY_LABEL;
}
- bitmap[2] &= ~FATTR4_WORD2_SECURITY_LABEL;
} else
printk(KERN_WARNING "%s: label too long (%u)!\n",
__func__, len);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index effaa4247b91..c0f2e1751c33 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -169,10 +169,10 @@ static int __init root_nfs_cat(char *dest, const char *src,
size_t len = strlen(dest);
if (len && dest[len - 1] != ',')
- if (strlcat(dest, ",", destlen) > destlen)
+ if (strlcat(dest, ",", destlen) >= destlen)
return -1;
- if (strlcat(dest, src, destlen) > destlen)
+ if (strlcat(dest, src, destlen) >= destlen)
return -1;
return 0;
}
diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c
index 537b80d693f1..d4829f3f2293 100644
--- a/fs/nfs/pnfs_dev.c
+++ b/fs/nfs/pnfs_dev.c
@@ -152,7 +152,7 @@ nfs4_get_device_info(struct nfs_server *server,
set_bit(NFS_DEVICEID_NOCACHE, &d->flags);
out_free_pages:
- for (i = 0; i < max_pages; i++)
+ while (--i >= 0)
__free_page(pages[i]);
kfree(pages);
out_free_pdev:
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 999b4162abba..e7e779cfdbaf 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -440,6 +440,41 @@ void nfs_sb_deactive(struct super_block *sb)
}
EXPORT_SYMBOL_GPL(nfs_sb_deactive);
+static int __nfs_list_for_each_server(struct list_head *head,
+ int (*fn)(struct nfs_server *, void *),
+ void *data)
+{
+ struct nfs_server *server, *last = NULL;
+ int ret = 0;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, head, client_link) {
+ if (!(server->super && nfs_sb_active(server->super)))
+ continue;
+ rcu_read_unlock();
+ if (last)
+ nfs_sb_deactive(last->super);
+ last = server;
+ ret = fn(server, data);
+ if (ret)
+ goto out;
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+out:
+ if (last)
+ nfs_sb_deactive(last->super);
+ return ret;
+}
+
+int nfs_client_for_each_server(struct nfs_client *clp,
+ int (*fn)(struct nfs_server *, void *),
+ void *data)
+{
+ return __nfs_list_for_each_server(&clp->cl_superblocks, fn, data);
+}
+EXPORT_SYMBOL_GPL(nfs_client_for_each_server);
+
/*
* Deliver file system statistics to userspace
*/
@@ -2405,22 +2440,31 @@ void nfs_fill_super(struct super_block *sb, struct nfs_mount_info *mount_info)
if (data && data->bsize)
sb->s_blocksize = nfs_block_size(data->bsize, &sb->s_blocksize_bits);
- if (server->nfs_client->rpc_ops->version != 2) {
- /* The VFS shouldn't apply the umask to mode bits. We will do
- * so ourselves when necessary.
+ switch (server->nfs_client->rpc_ops->version) {
+ case 2:
+ sb->s_time_gran = 1000;
+ sb->s_time_min = 0;
+ sb->s_time_max = U32_MAX;
+ break;
+ case 3:
+ /*
+ * The VFS shouldn't apply the umask to mode bits.
+ * We will do so ourselves when necessary.
*/
sb->s_flags |= SB_POSIXACL;
sb->s_time_gran = 1;
- sb->s_export_op = &nfs_export_ops;
- } else
- sb->s_time_gran = 1000;
-
- if (server->nfs_client->rpc_ops->version != 4) {
sb->s_time_min = 0;
sb->s_time_max = U32_MAX;
- } else {
+ sb->s_export_op = &nfs_export_ops;
+ break;
+ case 4:
+ sb->s_flags |= SB_POSIXACL;
+ sb->s_time_gran = 1;
sb->s_time_min = S64_MIN;
sb->s_time_max = S64_MAX;
+ if (server->caps & NFS_CAP_ATOMIC_OPEN_V1)
+ sb->s_export_op = &nfs_export_ops;
+ break;
}
nfs_initialise_sb(sb);
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 442543304930..2455dc8be18a 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -82,6 +82,15 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
int len = sizeof(__be32), ret, i;
__be32 *p;
+ /*
+ * See paragraph 5 of RFC 8881 S18.40.3.
+ */
+ if (!gdp->gd_maxcount) {
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
+ }
+
p = xdr_reserve_space(xdr, len + sizeof(__be32));
if (!p)
return nfserr_resource;
diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c
index e81d2a5cf381..bb205328e043 100644
--- a/fs/nfsd/flexfilelayoutxdr.c
+++ b/fs/nfsd/flexfilelayoutxdr.c
@@ -85,6 +85,15 @@ nfsd4_ff_encode_getdeviceinfo(struct xdr_stream *xdr,
int addr_len;
__be32 *p;
+ /*
+ * See paragraph 5 of RFC 8881 S18.40.3.
+ */
+ if (!gdp->gd_maxcount) {
+ if (xdr_stream_encode_u32(xdr, 0) != XDR_UNIT)
+ return nfserr_resource;
+ return nfs_ok;
+ }
+
/* len + padding for two strings */
addr_len = 16 + da->netaddr.netid_len + da->netaddr.addr_len;
ver_len = 20;
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index ed53e206a299..82329b5102c6 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -42,9 +42,6 @@ struct nfsd_net {
bool grace_ended;
time_t boot_time;
- /* internal mount of the "nfsd" pseudofilesystem: */
- struct vfsmount *nfsd_mnt;
-
struct dentry *nfsd_client_dir;
/*
@@ -121,6 +118,9 @@ struct nfsd_net {
wait_queue_head_t ntf_wq;
atomic_t ntf_refcnt;
+ /* Allow umount to wait for nfsd state cleanup */
+ struct completion nfsd_shutdown_complete;
+
/*
* clientid and stateid data for construction of net unique COPY
* stateids.
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 8f077e66e613..03e8c45a52f3 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -868,13 +868,11 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
} else
dchild = dget(dparent);
} else
- dchild = lookup_one_len_unlocked(name, dparent, namlen);
+ dchild = lookup_positive_unlocked(name, dparent, namlen);
if (IS_ERR(dchild))
return rv;
if (d_mountpoint(dchild))
goto out;
- if (d_really_is_negative(dchild))
- goto out;
if (dchild->d_inode->i_ino != ino)
goto out;
rv = fh_compose(fhp, exp, dchild, &cd->fh);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 3c50d18fe8a9..771733396eab 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -840,8 +840,8 @@ static const struct cred *get_backchannel_cred(struct nfs4_client *clp, struct r
if (!kcred)
return NULL;
- kcred->uid = ses->se_cb_sec.uid;
- kcred->gid = ses->se_cb_sec.gid;
+ kcred->fsuid = ses->se_cb_sec.uid;
+ kcred->fsgid = ses->se_cb_sec.gid;
return kcred;
}
}
@@ -880,7 +880,6 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
} else {
if (!conn->cb_xprt)
return -EINVAL;
- clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
clp->cl_cb_session = ses;
args.bc_xprt = conn->cb_xprt;
args.prognumber = clp->cl_cb_session->se_cb_prog;
@@ -900,6 +899,9 @@ static int setup_callback_client(struct nfs4_client *clp, struct nfs4_cb_conn *c
rpc_shutdown_client(client);
return -ENOMEM;
}
+
+ if (clp->cl_minorversion != 0)
+ clp->cl_cb_conn.cb_xprt = conn->cb_xprt;
clp->cl_cb_client = client;
clp->cl_cb_cred = cred;
return 0;
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index e12409eca7cc..215362144aec 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -322,11 +322,11 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
if (ls->ls_recalled)
goto out_unlock;
- ls->ls_recalled = true;
- atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
if (list_empty(&ls->ls_layouts))
goto out_unlock;
+ ls->ls_recalled = true;
+ atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid);
refcount_inc(&ls->ls_stid.sc_count);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 452ed633a2c7..e38f873f98a7 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -865,8 +865,8 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
rename->rn_tname, rename->rn_tnamelen);
if (status)
return status;
- set_change_info(&rename->rn_sinfo, &cstate->current_fh);
- set_change_info(&rename->rn_tinfo, &cstate->save_fh);
+ set_change_info(&rename->rn_sinfo, &cstate->save_fh);
+ set_change_info(&rename->rn_tinfo, &cstate->current_fh);
return nfs_ok;
}
@@ -1059,8 +1059,10 @@ out:
return status;
out_put_dst:
nfsd_file_put(*dst);
+ *dst = NULL;
out_put_src:
nfsd_file_put(*src);
+ *src = NULL;
goto out;
}
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 7d408957ed62..14463e107918 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -825,8 +825,10 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
princhash.data = memdup_user(
&ci->cc_princhash.cp_data,
princhashlen);
- if (IS_ERR_OR_NULL(princhash.data))
+ if (IS_ERR_OR_NULL(princhash.data)) {
+ kfree(name.data);
return -EFAULT;
+ }
princhash.len = princhashlen;
} else
princhash.len = 0;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 228c2b0753dc..a0aa7e63739d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -508,15 +508,26 @@ find_any_file(struct nfs4_file *f)
return ret;
}
-static struct nfsd_file *find_deleg_file(struct nfs4_file *f)
+static struct nfsd_file *find_any_file_locked(struct nfs4_file *f)
{
- struct nfsd_file *ret = NULL;
+ lockdep_assert_held(&f->fi_lock);
+
+ if (f->fi_fds[O_RDWR])
+ return f->fi_fds[O_RDWR];
+ if (f->fi_fds[O_WRONLY])
+ return f->fi_fds[O_WRONLY];
+ if (f->fi_fds[O_RDONLY])
+ return f->fi_fds[O_RDONLY];
+ return NULL;
+}
+
+static struct nfsd_file *find_deleg_file_locked(struct nfs4_file *f)
+{
+ lockdep_assert_held(&f->fi_lock);
- spin_lock(&f->fi_lock);
if (f->fi_deleg_file)
- ret = nfsd_file_get(f->fi_deleg_file);
- spin_unlock(&f->fi_lock);
- return ret;
+ return f->fi_deleg_file;
+ return NULL;
}
static atomic_long_t num_delegations;
@@ -1085,9 +1096,9 @@ static void revoke_delegation(struct nfs4_delegation *dp)
WARN_ON(!list_empty(&dp->dl_recall_lru));
if (clp->cl_minorversion) {
+ spin_lock(&clp->cl_lock);
dp->dl_stid.sc_type = NFS4_REVOKED_DELEG_STID;
refcount_inc(&dp->dl_stid.sc_count);
- spin_lock(&clp->cl_lock);
list_add(&dp->dl_recall_lru, &clp->cl_revoked);
spin_unlock(&clp->cl_lock);
}
@@ -2402,9 +2413,11 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
ols = openlockstateid(st);
oo = ols->st_stateowner;
nf = st->sc_file;
- file = find_any_file(nf);
+
+ spin_lock(&nf->fi_lock);
+ file = find_any_file_locked(nf);
if (!file)
- return 0;
+ goto out;
seq_printf(s, "- 0x%16phN: { type: open, ", &st->sc_stateid);
@@ -2422,8 +2435,8 @@ static int nfs4_show_open(struct seq_file *s, struct nfs4_stid *st)
seq_printf(s, ", ");
nfs4_show_owner(s, oo);
seq_printf(s, " }\n");
- nfsd_file_put(file);
-
+out:
+ spin_unlock(&nf->fi_lock);
return 0;
}
@@ -2437,9 +2450,10 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
ols = openlockstateid(st);
oo = ols->st_stateowner;
nf = st->sc_file;
- file = find_any_file(nf);
+ spin_lock(&nf->fi_lock);
+ file = find_any_file_locked(nf);
if (!file)
- return 0;
+ goto out;
seq_printf(s, "- 0x%16phN: { type: lock, ", &st->sc_stateid);
@@ -2455,8 +2469,8 @@ static int nfs4_show_lock(struct seq_file *s, struct nfs4_stid *st)
seq_printf(s, ", ");
nfs4_show_owner(s, oo);
seq_printf(s, " }\n");
- nfsd_file_put(file);
-
+out:
+ spin_unlock(&nf->fi_lock);
return 0;
}
@@ -2468,9 +2482,10 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
ds = delegstateid(st);
nf = st->sc_file;
- file = find_deleg_file(nf);
+ spin_lock(&nf->fi_lock);
+ file = find_deleg_file_locked(nf);
if (!file)
- return 0;
+ goto out;
seq_printf(s, "- 0x%16phN: { type: deleg, ", &st->sc_stateid);
@@ -2482,8 +2497,8 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
nfs4_show_superblock(s, file);
seq_printf(s, " }\n");
- nfsd_file_put(file);
-
+out:
+ spin_unlock(&nf->fi_lock);
return 0;
}
@@ -2556,7 +2571,7 @@ static int client_opens_release(struct inode *inode, struct file *file)
/* XXX: alternatively, we could get/drop in seq start/stop */
drop_client(clp);
- return 0;
+ return seq_release(inode, file);
}
static const struct file_operations client_states_fops = {
@@ -5498,15 +5513,6 @@ static __be32 nfsd4_validate_stateid(struct nfs4_client *cl, stateid_t *stateid)
if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) ||
CLOSE_STATEID(stateid))
return status;
- /* Client debugging aid. */
- if (!same_clid(&stateid->si_opaque.so_clid, &cl->cl_clientid)) {
- char addr_str[INET6_ADDRSTRLEN];
- rpc_ntop((struct sockaddr *)&cl->cl_addr, addr_str,
- sizeof(addr_str));
- pr_warn_ratelimited("NFSD: client %s testing state ID "
- "with incorrect client ID\n", addr_str);
- return status;
- }
spin_lock(&cl->cl_lock);
s = find_stateid_locked(cl, stateid);
if (!s)
@@ -7739,14 +7745,9 @@ nfs4_state_start_net(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
int ret;
- ret = get_nfsdfs(net);
- if (ret)
- return ret;
ret = nfs4_state_create_net(net);
- if (ret) {
- mntput(nn->nfsd_mnt);
+ if (ret)
return ret;
- }
locks_start_grace(net, &nn->nfsd4_manager);
nfsd4_client_tracking_init(net);
if (nn->track_reclaim_completes && nn->reclaim_str_hashtbl_size == 0)
@@ -7815,7 +7816,6 @@ nfs4_state_shutdown_net(struct net *net)
nfsd4_client_tracking_exit(net);
nfs4_state_destroy_net(net);
- mntput(nn->nfsd_mnt);
}
void
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index e61d9c435957..1d24fff2709c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2991,18 +2991,9 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
__be32 nfserr;
int ignore_crossmnt = 0;
- dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
+ dentry = lookup_positive_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
if (IS_ERR(dentry))
return nfserrno(PTR_ERR(dentry));
- if (d_really_is_negative(dentry)) {
- /*
- * we're not holding the i_mutex here, so there's
- * a window where this directory entry could have gone
- * away.
- */
- dput(dentry);
- return nfserr_noent;
- }
exp_get(exp);
/*
@@ -3109,6 +3100,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
case nfserr_noent:
xdr_truncate_encode(xdr, start_offset);
goto skip_entry;
+ case nfserr_jukebox:
+ /*
+ * The pseudoroot should only display dentries that lead to
+ * exports. If we get EJUKEBOX here, then we can't tell whether
+ * this entry should be included. Just fail the whole READDIR
+ * with NFS4ERR_DELAY in that case, and hope that the situation
+ * will resolve itself by the client's next attempt.
+ */
+ if (cd->rd_fhp->fh_export->ex_flags & NFSEXP_V4ROOT)
+ goto fail;
+ fallthrough;
default:
/*
* If the client requested the RDATTR_ERROR attribute,
@@ -3398,7 +3400,7 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op
p = xdr_reserve_space(xdr, 32);
if (!p)
return nfserr_resource;
- *p++ = cpu_to_be32(0);
+ *p++ = cpu_to_be32(open->op_recall);
/*
* TODO: space_limit's in delegations
@@ -3600,7 +3602,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
if (resp->xdr.buf->page_len &&
test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
WARN_ON_ONCE(1);
- return nfserr_resource;
+ return nfserr_serverfault;
}
xdr_commit_encode(xdr);
@@ -4124,20 +4126,17 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
*p++ = cpu_to_be32(gdev->gd_layout_type);
- /* If maxcount is 0 then just update notifications */
- if (gdev->gd_maxcount != 0) {
- ops = nfsd4_layout_ops[gdev->gd_layout_type];
- nfserr = ops->encode_getdeviceinfo(xdr, gdev);
- if (nfserr) {
- /*
- * We don't bother to burden the layout drivers with
- * enforcing gd_maxcount, just tell the client to
- * come back with a bigger buffer if it's not enough.
- */
- if (xdr->buf->len + 4 > gdev->gd_maxcount)
- goto toosmall;
- return nfserr;
- }
+ ops = nfsd4_layout_ops[gdev->gd_layout_type];
+ nfserr = ops->encode_getdeviceinfo(xdr, gdev);
+ if (nfserr) {
+ /*
+ * We don't bother to burden the layout drivers with
+ * enforcing gd_maxcount, just tell the client to
+ * come back with a bigger buffer if it's not enough.
+ */
+ if (xdr->buf->len + 4 > gdev->gd_maxcount)
+ goto toosmall;
+ return nfserr;
}
if (gdev->gd_notify_types) {
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index 670e97dd67f0..80c90fc231a5 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -20,8 +20,7 @@
#include "nfsd.h"
#include "cache.h"
-
-#define NFSDDBG_FACILITY NFSDDBG_REPCACHE
+#include "trace.h"
/*
* We use this value to determine the number of hash buckets from the max
@@ -324,8 +323,10 @@ nfsd_cache_key_cmp(const struct svc_cacherep *key,
const struct svc_cacherep *rp, struct nfsd_net *nn)
{
if (key->c_key.k_xid == rp->c_key.k_xid &&
- key->c_key.k_csum != rp->c_key.k_csum)
+ key->c_key.k_csum != rp->c_key.k_csum) {
++nn->payload_misses;
+ trace_nfsd_drc_mismatch(nn, key, rp);
+ }
return memcmp(&key->c_key, &rp->c_key, sizeof(key->c_key));
}
@@ -378,15 +379,22 @@ out:
return ret;
}
-/*
+/**
+ * nfsd_cache_lookup - Find an entry in the duplicate reply cache
+ * @rqstp: Incoming Call to find
+ *
* Try to find an entry matching the current call in the cache. When none
* is found, we try to grab the oldest expired entry off the LRU list. If
* a suitable one isn't there, then drop the cache_lock and allocate a
* new one, then search again in case one got inserted while this thread
* didn't hold the lock.
+ *
+ * Return values:
+ * %RC_DOIT: Process the request normally
+ * %RC_REPLY: Reply from cache
+ * %RC_DROPIT: Do not process the request further
*/
-int
-nfsd_cache_lookup(struct svc_rqst *rqstp)
+int nfsd_cache_lookup(struct svc_rqst *rqstp)
{
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct svc_cacherep *rp, *found;
@@ -400,7 +408,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
rqstp->rq_cacherep = NULL;
if (type == RC_NOCACHE) {
nfsdstats.rcnocache++;
- return rtn;
+ goto out;
}
csum = nfsd_cache_csum(rqstp);
@@ -410,10 +418,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
* preallocate an entry.
*/
rp = nfsd_reply_cache_alloc(rqstp, csum, nn);
- if (!rp) {
- dprintk("nfsd: unable to allocate DRC entry!\n");
- return rtn;
- }
+ if (!rp)
+ goto out;
spin_lock(&b->cache_lock);
found = nfsd_cache_insert(b, rp, nn);
@@ -432,8 +438,10 @@ nfsd_cache_lookup(struct svc_rqst *rqstp)
/* go ahead and prune the cache */
prune_bucket(b, nn);
- out:
+
+out_unlock:
spin_unlock(&b->cache_lock);
+out:
return rtn;
found_entry:
@@ -443,13 +451,13 @@ found_entry:
/* Request being processed */
if (rp->c_state == RC_INPROG)
- goto out;
+ goto out_trace;
/* From the hall of fame of impractical attacks:
* Is this a user who tries to snoop on the cache? */
rtn = RC_DOIT;
if (!test_bit(RQ_SECURE, &rqstp->rq_flags) && rp->c_secure)
- goto out;
+ goto out_trace;
/* Compose RPC reply header */
switch (rp->c_type) {
@@ -461,20 +469,26 @@ found_entry:
break;
case RC_REPLBUFF:
if (!nfsd_cache_append(rqstp, &rp->c_replvec))
- goto out; /* should not happen */
+ goto out_unlock; /* should not happen */
rtn = RC_REPLY;
break;
default:
WARN_ONCE(1, "nfsd: bad repcache type %d\n", rp->c_type);
}
- goto out;
+out_trace:
+ trace_nfsd_drc_found(nn, rqstp, rtn);
+ goto out_unlock;
}
-/*
- * Update a cache entry. This is called from nfsd_dispatch when
- * the procedure has been executed and the complete reply is in
- * rqstp->rq_res.
+/**
+ * nfsd_cache_update - Update an entry in the duplicate reply cache.
+ * @rqstp: svc_rqst with a finished Reply
+ * @cachetype: which cache to update
+ * @statp: Reply's status code
+ *
+ * This is called from nfsd_dispatch when the procedure has been
+ * executed and the complete reply is in rqstp->rq_res.
*
* We're copying around data here rather than swapping buffers because
* the toplevel loop requires max-sized buffers, which would be a waste
@@ -487,8 +501,7 @@ found_entry:
* nfsd failed to encode a reply that otherwise would have been cached.
* In this case, nfsd_cache_update is called with statp == NULL.
*/
-void
-nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
+void nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp)
{
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct svc_cacherep *rp = rqstp->rq_cacherep;
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 055cc0458f27..ce72869315ad 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -351,7 +351,7 @@ static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size)
static ssize_t write_filehandle(struct file *file, char *buf, size_t size)
{
char *dname, *path;
- int uninitialized_var(maxsize);
+ int maxsize;
char *mesg = buf;
int len;
struct auth_domain *dom;
@@ -1417,6 +1417,8 @@ static void nfsd_umount(struct super_block *sb)
{
struct net *net = sb->s_fs_info;
+ nfsd_shutdown_threads(net);
+
kill_litter_super(sb);
put_net(net);
}
@@ -1429,18 +1431,6 @@ static struct file_system_type nfsd_fs_type = {
};
MODULE_ALIAS_FS("nfsd");
-int get_nfsdfs(struct net *net)
-{
- struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- struct vfsmount *mnt;
-
- mnt = vfs_kern_mount(&nfsd_fs_type, SB_KERNMOUNT, "nfsd", NULL);
- if (IS_ERR(mnt))
- return PTR_ERR(mnt);
- nn->nfsd_mnt = mnt;
- return 0;
-}
-
#ifdef CONFIG_PROC_FS
static int create_proc_exports_entry(void)
{
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 4ff0c5318a02..3ae9811c0bb9 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -84,11 +84,10 @@ int nfsd_get_nrthreads(int n, int *, struct net *);
int nfsd_set_nrthreads(int n, int *, struct net *);
int nfsd_pool_stats_open(struct inode *, struct file *);
int nfsd_pool_stats_release(struct inode *, struct file *);
+void nfsd_shutdown_threads(struct net *net);
void nfsd_destroy(struct net *net);
-int get_nfsdfs(struct net *);
-
struct nfsdfs_client {
struct kref cl_ref;
void (*cl_release)(struct kref *kref);
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index d63cdda1782d..969a227186fa 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -420,8 +420,8 @@ static void nfsd_shutdown_net(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
- nfsd_file_cache_shutdown_net(net);
nfs4_state_shutdown_net(net);
+ nfsd_file_cache_shutdown_net(net);
if (nn->lockd_up) {
lockd_down(net);
nn->lockd_up = 0;
@@ -594,6 +594,37 @@ static const struct svc_serv_ops nfsd_thread_sv_ops = {
.svo_module = THIS_MODULE,
};
+static void nfsd_complete_shutdown(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+
+ WARN_ON(!mutex_is_locked(&nfsd_mutex));
+
+ nn->nfsd_serv = NULL;
+ complete(&nn->nfsd_shutdown_complete);
+}
+
+void nfsd_shutdown_threads(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ struct svc_serv *serv;
+
+ mutex_lock(&nfsd_mutex);
+ serv = nn->nfsd_serv;
+ if (serv == NULL) {
+ mutex_unlock(&nfsd_mutex);
+ return;
+ }
+
+ svc_get(serv);
+ /* Kill outstanding nfsd threads */
+ serv->sv_ops->svo_setup(serv, NULL, 0);
+ nfsd_destroy(net);
+ mutex_unlock(&nfsd_mutex);
+ /* Wait for shutdown of nfsd_serv to complete */
+ wait_for_completion(&nn->nfsd_shutdown_complete);
+}
+
int nfsd_create_serv(struct net *net)
{
int error;
@@ -611,11 +642,13 @@ int nfsd_create_serv(struct net *net)
&nfsd_thread_sv_ops);
if (nn->nfsd_serv == NULL)
return -ENOMEM;
+ init_completion(&nn->nfsd_shutdown_complete);
nn->nfsd_serv->sv_maxconn = nn->max_connections;
error = svc_bind(nn->nfsd_serv, net);
if (error < 0) {
svc_destroy(nn->nfsd_serv);
+ nfsd_complete_shutdown(net);
return error;
}
@@ -664,7 +697,7 @@ void nfsd_destroy(struct net *net)
svc_shutdown_net(nn->nfsd_serv, net);
svc_destroy(nn->nfsd_serv);
if (destroy)
- nn->nfsd_serv = NULL;
+ nfsd_complete_shutdown(net);
}
int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 127db5351d01..9d37d09d7ca8 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -166,6 +166,12 @@ DEFINE_STATEID_EVENT(layout_recall_done);
DEFINE_STATEID_EVENT(layout_recall_fail);
DEFINE_STATEID_EVENT(layout_recall_release);
+TRACE_DEFINE_ENUM(NFSD_FILE_HASHED);
+TRACE_DEFINE_ENUM(NFSD_FILE_PENDING);
+TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_READ);
+TRACE_DEFINE_ENUM(NFSD_FILE_BREAK_WRITE);
+TRACE_DEFINE_ENUM(NFSD_FILE_REFERENCED);
+
#define show_nf_flags(val) \
__print_flags(val, "|", \
{ 1 << NFSD_FILE_HASHED, "HASHED" }, \
@@ -304,6 +310,65 @@ TRACE_EVENT(nfsd_file_fsnotify_handle_event,
__entry->nlink, __entry->mode, __entry->mask)
);
+#include "cache.h"
+
+TRACE_DEFINE_ENUM(RC_DROPIT);
+TRACE_DEFINE_ENUM(RC_REPLY);
+TRACE_DEFINE_ENUM(RC_DOIT);
+
+#define show_drc_retval(x) \
+ __print_symbolic(x, \
+ { RC_DROPIT, "DROPIT" }, \
+ { RC_REPLY, "REPLY" }, \
+ { RC_DOIT, "DOIT" })
+
+TRACE_EVENT(nfsd_drc_found,
+ TP_PROTO(
+ const struct nfsd_net *nn,
+ const struct svc_rqst *rqstp,
+ int result
+ ),
+ TP_ARGS(nn, rqstp, result),
+ TP_STRUCT__entry(
+ __field(unsigned long long, boot_time)
+ __field(unsigned long, result)
+ __field(u32, xid)
+ ),
+ TP_fast_assign(
+ __entry->boot_time = nn->boot_time;
+ __entry->result = result;
+ __entry->xid = be32_to_cpu(rqstp->rq_xid);
+ ),
+ TP_printk("boot_time=%16llx xid=0x%08x result=%s",
+ __entry->boot_time, __entry->xid,
+ show_drc_retval(__entry->result))
+
+);
+
+TRACE_EVENT(nfsd_drc_mismatch,
+ TP_PROTO(
+ const struct nfsd_net *nn,
+ const struct svc_cacherep *key,
+ const struct svc_cacherep *rp
+ ),
+ TP_ARGS(nn, key, rp),
+ TP_STRUCT__entry(
+ __field(unsigned long long, boot_time)
+ __field(u32, xid)
+ __field(u32, cached)
+ __field(u32, ingress)
+ ),
+ TP_fast_assign(
+ __entry->boot_time = nn->boot_time;
+ __entry->xid = be32_to_cpu(key->c_key.k_xid);
+ __entry->cached = (__force u32)key->c_key.k_csum;
+ __entry->ingress = (__force u32)rp->c_key.k_csum;
+ ),
+ TP_printk("boot_time=%16llx xid=0x%08x cached-csum=0x%08x ingress-csum=0x%08x",
+ __entry->boot_time, __entry->xid, __entry->cached,
+ __entry->ingress)
+);
+
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index b6f4b552c9af..6aa968bee0ce 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1689,6 +1689,12 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
if (!flen || isdotent(fname, flen) || !tlen || isdotent(tname, tlen))
goto out;
+ err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
+ if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
+ goto out;
+ if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
+ goto out;
+
retry:
host_err = fh_want_write(ffhp);
if (host_err) {
@@ -1723,12 +1729,6 @@ retry:
if (ndentry == trap)
goto out_dput_new;
- host_err = -EXDEV;
- if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
- goto out_dput_new;
- if (ffhp->fh_export->ex_path.dentry != tfhp->fh_export->ex_path.dentry)
- goto out_dput_new;
-
if (nfsd_has_cached_files(ndentry)) {
has_cached = true;
goto out_dput_old;
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
index 235b959fc2b3..279d945d4ebe 100644
--- a/fs/nilfs2/alloc.c
+++ b/fs/nilfs2/alloc.c
@@ -205,7 +205,8 @@ static int nilfs_palloc_get_block(struct inode *inode, unsigned long blkoff,
int ret;
spin_lock(lock);
- if (prev->bh && blkoff == prev->blkoff) {
+ if (prev->bh && blkoff == prev->blkoff &&
+ likely(buffer_uptodate(prev->bh))) {
get_bh(prev->bh);
*bhp = prev->bh;
spin_unlock(lock);
@@ -613,10 +614,10 @@ void nilfs_palloc_commit_free_entry(struct inode *inode,
lock = nilfs_mdt_bgl_lock(inode, group);
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "%s (ino=%lu): entry number %llu already freed",
- __func__, inode->i_ino,
- (unsigned long long)req->pr_entry_nr);
+ nilfs_warn(inode->i_sb,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)req->pr_entry_nr);
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
@@ -654,10 +655,10 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode,
lock = nilfs_mdt_bgl_lock(inode, group);
if (!nilfs_clear_bit_atomic(lock, group_offset, bitmap))
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "%s (ino=%lu): entry number %llu already freed",
- __func__, inode->i_ino,
- (unsigned long long)req->pr_entry_nr);
+ nilfs_warn(inode->i_sb,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)req->pr_entry_nr);
else
nilfs_palloc_group_desc_add_entries(desc, lock, 1);
@@ -763,10 +764,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
do {
if (!nilfs_clear_bit_atomic(lock, group_offset,
bitmap)) {
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "%s (ino=%lu): entry number %llu already freed",
- __func__, inode->i_ino,
- (unsigned long long)entry_nrs[j]);
+ nilfs_warn(inode->i_sb,
+ "%s (ino=%lu): entry number %llu already freed",
+ __func__, inode->i_ino,
+ (unsigned long long)entry_nrs[j]);
} else {
n++;
}
@@ -808,10 +809,10 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
ret = nilfs_palloc_delete_entry_block(inode,
last_nrs[k]);
if (ret && ret != -ENOENT)
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "error %d deleting block that object (entry=%llu, ino=%lu) belongs to",
- ret, (unsigned long long)last_nrs[k],
- inode->i_ino);
+ nilfs_warn(inode->i_sb,
+ "error %d deleting block that object (entry=%llu, ino=%lu) belongs to",
+ ret, (unsigned long long)last_nrs[k],
+ inode->i_ino);
}
desc_kaddr = kmap_atomic(desc_bh->b_page);
@@ -826,9 +827,9 @@ int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
if (nfree == nilfs_palloc_entries_per_group(inode)) {
ret = nilfs_palloc_delete_bitmap_block(inode, group);
if (ret && ret != -ENOENT)
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "error %d deleting bitmap block of group=%lu, ino=%lu",
- ret, group, inode->i_ino);
+ nilfs_warn(inode->i_sb,
+ "error %d deleting bitmap block of group=%lu, ino=%lu",
+ ret, group, inode->i_ino);
}
}
return 0;
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index fb5a9a8a13cf..2ba57e4b4f0a 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -67,20 +67,28 @@ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
down_read(&bmap->b_sem);
ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
- if (ret < 0) {
- ret = nilfs_bmap_convert_error(bmap, __func__, ret);
+ if (ret < 0)
goto out;
- }
+
if (NILFS_BMAP_USE_VBN(bmap)) {
ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), *ptrp,
&blocknr);
if (!ret)
*ptrp = blocknr;
+ else if (ret == -ENOENT) {
+ /*
+ * If there was no valid entry in DAT for the block
+ * address obtained by b_ops->bop_lookup, then pass
+ * internal code -EINVAL to nilfs_bmap_convert_error
+ * to treat it as metadata corruption.
+ */
+ ret = -EINVAL;
+ }
}
out:
up_read(&bmap->b_sem);
- return ret;
+ return nilfs_bmap_convert_error(bmap, __func__, ret);
}
int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp,
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index e00e184b1261..1776121677e2 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -285,6 +285,14 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc,
if (nbh == NULL) { /* blocksize == pagesize */
xa_erase_irq(&btnc->i_pages, newkey);
unlock_page(ctxt->bh->b_page);
- } else
- brelse(nbh);
+ } else {
+ /*
+ * When canceling a buffer that a prepare operation has
+ * allocated to copy a node block to another location, use
+ * nilfs_btnode_delete() to initialize and release the buffer
+ * so that the buffer flags will not be in an inconsistent
+ * state when it is reallocated.
+ */
+ nilfs_btnode_delete(nbh);
+ }
}
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index 919d1238ce45..4905b7cd7bf3 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -352,10 +352,10 @@ static int nilfs_btree_node_broken(const struct nilfs_btree_node *node,
(flags & NILFS_BTREE_NODE_ROOT) ||
nchildren < 0 ||
nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) {
- nilfs_msg(inode->i_sb, KERN_CRIT,
- "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d",
- inode->i_ino, (unsigned long long)blocknr, level,
- flags, nchildren);
+ nilfs_crit(inode->i_sb,
+ "bad btree node (ino=%lu, blocknr=%llu): level = %d, flags = 0x%x, nchildren = %d",
+ inode->i_ino, (unsigned long long)blocknr, level,
+ flags, nchildren);
ret = 1;
}
return ret;
@@ -382,9 +382,9 @@ static int nilfs_btree_root_broken(const struct nilfs_btree_node *node,
level >= NILFS_BTREE_LEVEL_MAX ||
nchildren < 0 ||
nchildren > NILFS_BTREE_ROOT_NCHILDREN_MAX)) {
- nilfs_msg(inode->i_sb, KERN_CRIT,
- "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d",
- inode->i_ino, level, flags, nchildren);
+ nilfs_crit(inode->i_sb,
+ "bad btree root (ino=%lu): level = %d, flags = 0x%x, nchildren = %d",
+ inode->i_ino, level, flags, nchildren);
ret = 1;
}
return ret;
@@ -451,10 +451,10 @@ static int nilfs_btree_bad_node(const struct nilfs_bmap *btree,
{
if (unlikely(nilfs_btree_node_get_level(node) != level)) {
dump_stack();
- nilfs_msg(btree->b_inode->i_sb, KERN_CRIT,
- "btree level mismatch (ino=%lu): %d != %d",
- btree->b_inode->i_ino,
- nilfs_btree_node_get_level(node), level);
+ nilfs_crit(btree->b_inode->i_sb,
+ "btree level mismatch (ino=%lu): %d != %d",
+ btree->b_inode->i_ino,
+ nilfs_btree_node_get_level(node), level);
return 1;
}
return 0;
@@ -480,9 +480,18 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
ret = nilfs_btnode_submit_block(btnc, ptr, 0, REQ_OP_READ, 0, &bh,
&submit_ptr);
if (ret) {
- if (ret != -EEXIST)
- return ret;
- goto out_check;
+ if (likely(ret == -EEXIST))
+ goto out_check;
+ if (ret == -ENOENT) {
+ /*
+ * Block address translation failed due to invalid
+ * value of 'ptr'. In this case, return internal code
+ * -EINVAL (broken bmap) to notify bmap layer of fatal
+ * metadata corruption.
+ */
+ ret = -EINVAL;
+ }
+ return ret;
}
if (ra) {
@@ -510,7 +519,7 @@ static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr,
out_no_wait:
if (!buffer_uptodate(bh)) {
- nilfs_msg(btree->b_inode->i_sb, KERN_ERR,
+ nilfs_err(btree->b_inode->i_sb,
"I/O error reading b-tree node block (ino=%lu, blocknr=%llu)",
btree->b_inode->i_ino, (unsigned long long)ptr);
brelse(bh);
@@ -715,7 +724,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
dat = nilfs_bmap_get_dat(btree);
ret = nilfs_dat_translate(dat, ptr, &blocknr);
if (ret < 0)
- goto out;
+ goto dat_error;
ptr = blocknr;
}
cnt = 1;
@@ -734,7 +743,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
if (dat) {
ret = nilfs_dat_translate(dat, ptr2, &blocknr);
if (ret < 0)
- goto out;
+ goto dat_error;
ptr2 = blocknr;
}
if (ptr2 != ptr + cnt || ++cnt == maxblocks)
@@ -773,6 +782,11 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree,
out:
nilfs_btree_free_path(path);
return ret;
+
+ dat_error:
+ if (ret == -ENOENT)
+ ret = -EINVAL; /* Notify bmap layer of metadata corruption */
+ goto out;
}
static void nilfs_btree_promote_key(struct nilfs_bmap *btree,
@@ -2080,10 +2094,10 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree,
ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0);
if (ret < 0) {
if (unlikely(ret == -ENOENT))
- nilfs_msg(btree->b_inode->i_sb, KERN_CRIT,
- "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d",
- btree->b_inode->i_ino,
- (unsigned long long)key, level);
+ nilfs_crit(btree->b_inode->i_sb,
+ "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d",
+ btree->b_inode->i_ino,
+ (unsigned long long)key, level);
goto out;
}
@@ -2120,11 +2134,11 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree,
if (level < NILFS_BTREE_LEVEL_NODE_MIN ||
level >= NILFS_BTREE_LEVEL_MAX) {
dump_stack();
- nilfs_msg(btree->b_inode->i_sb, KERN_WARNING,
- "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)",
- level, (unsigned long long)key,
- btree->b_inode->i_ino,
- (unsigned long long)bh->b_blocknr);
+ nilfs_warn(btree->b_inode->i_sb,
+ "invalid btree level: %d (key=%llu, ino=%lu, blocknr=%llu)",
+ level, (unsigned long long)key,
+ btree->b_inode->i_ino,
+ (unsigned long long)bh->b_blocknr);
return;
}
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 8d41311b5db4..86d4d850d130 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -322,7 +322,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
int ret, ncps, nicps, nss, count, i;
if (unlikely(start == 0 || start > end)) {
- nilfs_msg(cpfile->i_sb, KERN_ERR,
+ nilfs_err(cpfile->i_sb,
"cannot delete checkpoints: invalid range [%llu, %llu)",
(unsigned long long)start, (unsigned long long)end);
return -EINVAL;
@@ -376,7 +376,7 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
cpfile, cno);
if (ret == 0)
continue;
- nilfs_msg(cpfile->i_sb, KERN_ERR,
+ nilfs_err(cpfile->i_sb,
"error %d deleting checkpoint block",
ret);
break;
@@ -981,12 +981,10 @@ int nilfs_cpfile_read(struct super_block *sb, size_t cpsize,
int err;
if (cpsize > sb->s_blocksize) {
- nilfs_msg(sb, KERN_ERR,
- "too large checkpoint size: %zu bytes", cpsize);
+ nilfs_err(sb, "too large checkpoint size: %zu bytes", cpsize);
return -EINVAL;
} else if (cpsize < NILFS_MIN_CHECKPOINT_SIZE) {
- nilfs_msg(sb, KERN_ERR,
- "too small checkpoint size: %zu bytes", cpsize);
+ nilfs_err(sb, "too small checkpoint size: %zu bytes", cpsize);
return -EINVAL;
}
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index a3523a243e11..c47e1f6f23a8 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -40,8 +40,21 @@ static inline struct nilfs_dat_info *NILFS_DAT_I(struct inode *dat)
static int nilfs_dat_prepare_entry(struct inode *dat,
struct nilfs_palloc_req *req, int create)
{
- return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
- create, &req->pr_entry_bh);
+ int ret;
+
+ ret = nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
+ create, &req->pr_entry_bh);
+ if (unlikely(ret == -ENOENT)) {
+ nilfs_msg(dat->i_sb, KERN_ERR,
+ "DAT doesn't have a block to manage vblocknr = %llu",
+ (unsigned long long)req->pr_entry_nr);
+ /*
+ * Return internal code -EINVAL to notify bmap layer of
+ * metadata corruption.
+ */
+ ret = -EINVAL;
+ }
+ return ret;
}
static void nilfs_dat_commit_entry(struct inode *dat,
@@ -111,16 +124,19 @@ static void nilfs_dat_commit_free(struct inode *dat,
kunmap_atomic(kaddr);
nilfs_dat_commit_entry(dat, req);
+
+ if (unlikely(req->pr_desc_bh == NULL || req->pr_bitmap_bh == NULL)) {
+ nilfs_error(dat->i_sb,
+ "state inconsistency probably due to duplicate use of vblocknr = %llu",
+ (unsigned long long)req->pr_entry_nr);
+ return;
+ }
nilfs_palloc_commit_free_entry(dat, req);
}
int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
{
- int ret;
-
- ret = nilfs_dat_prepare_entry(dat, req, 0);
- WARN_ON(ret == -ENOENT);
- return ret;
+ return nilfs_dat_prepare_entry(dat, req, 0);
}
void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
@@ -147,10 +163,8 @@ int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
int ret;
ret = nilfs_dat_prepare_entry(dat, req, 0);
- if (ret < 0) {
- WARN_ON(ret == -ENOENT);
+ if (ret < 0)
return ret;
- }
kaddr = kmap_atomic(req->pr_entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
@@ -340,11 +354,11 @@ int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
kaddr = kmap_atomic(entry_bh->b_page);
entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
- nilfs_msg(dat->i_sb, KERN_CRIT,
- "%s: invalid vblocknr = %llu, [%llu, %llu)",
- __func__, (unsigned long long)vblocknr,
- (unsigned long long)le64_to_cpu(entry->de_start),
- (unsigned long long)le64_to_cpu(entry->de_end));
+ nilfs_crit(dat->i_sb,
+ "%s: invalid vblocknr = %llu, [%llu, %llu)",
+ __func__, (unsigned long long)vblocknr,
+ (unsigned long long)le64_to_cpu(entry->de_start),
+ (unsigned long long)le64_to_cpu(entry->de_end));
kunmap_atomic(kaddr);
brelse(entry_bh);
return -EINVAL;
@@ -471,11 +485,11 @@ int nilfs_dat_read(struct super_block *sb, size_t entry_size,
int err;
if (entry_size > sb->s_blocksize) {
- nilfs_msg(sb, KERN_ERR, "too large DAT entry size: %zu bytes",
+ nilfs_err(sb, "too large DAT entry size: %zu bytes",
entry_size);
return -EINVAL;
} else if (entry_size < NILFS_MIN_DAT_ENTRY_SIZE) {
- nilfs_msg(sb, KERN_ERR, "too small DAT entry size: %zu bytes",
+ nilfs_err(sb, "too small DAT entry size: %zu bytes",
entry_size);
return -EINVAL;
}
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 81394e22d0a0..eb7de9e2a384 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -243,7 +243,7 @@ nilfs_filetype_table[NILFS_FT_MAX] = {
#define S_SHIFT 12
static unsigned char
-nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+nilfs_type_by_mode[(S_IFMT >> S_SHIFT) + 1] = {
[S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE,
[S_IFDIR >> S_SHIFT] = NILFS_FT_DIR,
[S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV,
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
index 533e24ea3a88..7faf8c285d6c 100644
--- a/fs/nilfs2/direct.c
+++ b/fs/nilfs2/direct.c
@@ -66,7 +66,7 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
dat = nilfs_bmap_get_dat(direct);
ret = nilfs_dat_translate(dat, ptr, &blocknr);
if (ret < 0)
- return ret;
+ goto dat_error;
ptr = blocknr;
}
@@ -79,7 +79,7 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
if (dat) {
ret = nilfs_dat_translate(dat, ptr2, &blocknr);
if (ret < 0)
- return ret;
+ goto dat_error;
ptr2 = blocknr;
}
if (ptr2 != ptr + cnt)
@@ -87,6 +87,11 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct,
}
*ptrp = ptr;
return cnt;
+
+ dat_error:
+ if (ret == -ENOENT)
+ ret = -EINVAL; /* Notify bmap layer of metadata corruption */
+ return ret;
}
static __u64
@@ -328,16 +333,18 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap,
key = nilfs_bmap_data_get_key(bmap, *bh);
if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
- nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT,
- "%s (ino=%lu): invalid key: %llu", __func__,
- bmap->b_inode->i_ino, (unsigned long long)key);
+ nilfs_crit(bmap->b_inode->i_sb,
+ "%s (ino=%lu): invalid key: %llu",
+ __func__,
+ bmap->b_inode->i_ino, (unsigned long long)key);
return -EINVAL;
}
ptr = nilfs_direct_get_ptr(bmap, key);
if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
- nilfs_msg(bmap->b_inode->i_sb, KERN_CRIT,
- "%s (ino=%lu): invalid pointer: %llu", __func__,
- bmap->b_inode->i_ino, (unsigned long long)ptr);
+ nilfs_crit(bmap->b_inode->i_sb,
+ "%s (ino=%lu): invalid pointer: %llu",
+ __func__,
+ bmap->b_inode->i_ino, (unsigned long long)ptr);
return -EINVAL;
}
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 64bc81363c6c..3802b42e1cb4 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -105,7 +105,13 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
nilfs_transaction_commit(inode->i_sb);
mapped:
- wait_for_stable_page(page);
+ /*
+ * Since checksumming including data blocks is performed to determine
+ * the validity of the log to be written and used for recovery, it is
+ * necessary to wait for writeback to finish here, regardless of the
+ * stable write requirement of the backing device.
+ */
+ wait_on_page_writeback(page);
out:
sb_end_pagefault(inode->i_sb);
return block_page_mkwrite_return(ret);
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 114774ac2185..b0077f5f7112 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -73,10 +73,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
err = nilfs_dat_translate(nilfs->ns_dat, vbn, &pbn);
- if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
- brelse(bh);
+ if (unlikely(err)) /* -EIO, -ENOMEM, -ENOENT */
goto failed;
- }
}
lock_buffer(bh);
@@ -102,6 +100,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
failed:
unlock_page(bh->b_page);
put_page(bh->b_page);
+ if (unlikely(err))
+ brelse(bh);
return err;
}
@@ -143,7 +143,7 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
if (!buffer_uptodate(bh)) {
struct inode *inode = bh->b_page->mapping->host;
- nilfs_msg(inode->i_sb, KERN_ERR,
+ nilfs_err(inode->i_sb,
"I/O error reading %s block for GC (ino=%lu, vblocknr=%llu)",
buffer_nilfs_node(bh) ? "node" : "data",
inode->i_ino, (unsigned long long)bh->b_blocknr);
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
index 4140d232cadc..02727ed3a7c6 100644
--- a/fs/nilfs2/ifile.c
+++ b/fs/nilfs2/ifile.c
@@ -142,8 +142,8 @@ int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
if (unlikely(err))
- nilfs_msg(sb, KERN_WARNING, "error %d reading inode: ino=%lu",
- err, (unsigned long)ino);
+ nilfs_warn(sb, "error %d reading inode: ino=%lu",
+ err, (unsigned long)ino);
return err;
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 35b0bfe9324f..530edb813add 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -107,11 +107,11 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
* However, the page having this block must
* be locked in this case.
*/
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "%s (ino=%lu): a race condition while inserting a data block at offset=%llu",
- __func__, inode->i_ino,
- (unsigned long long)blkoff);
- err = 0;
+ nilfs_warn(inode->i_sb,
+ "%s (ino=%lu): a race condition while inserting a data block at offset=%llu",
+ __func__, inode->i_ino,
+ (unsigned long long)blkoff);
+ err = -EAGAIN;
}
nilfs_transaction_abort(inode->i_sb);
goto out;
@@ -340,6 +340,7 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
struct inode *inode;
struct nilfs_inode_info *ii;
struct nilfs_root *root;
+ struct buffer_head *bh;
int err = -ENOMEM;
ino_t ino;
@@ -355,11 +356,26 @@ struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
ii->i_state = BIT(NILFS_I_NEW);
ii->i_root = root;
- err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
+ err = nilfs_ifile_create_inode(root->ifile, &ino, &bh);
if (unlikely(err))
goto failed_ifile_create_inode;
/* reference count of i_bh inherits from nilfs_mdt_read_block() */
+ if (unlikely(ino < NILFS_USER_INO)) {
+ nilfs_msg(sb, KERN_WARNING,
+ "inode bitmap is inconsistent for reserved inodes");
+ do {
+ brelse(bh);
+ err = nilfs_ifile_create_inode(root->ifile, &ino, &bh);
+ if (unlikely(err))
+ goto failed_ifile_create_inode;
+ } while (ino < NILFS_USER_INO);
+
+ nilfs_msg(sb, KERN_INFO,
+ "repaired inode bitmap for reserved inodes");
+ }
+ ii->i_bh = bh;
+
atomic64_inc(&root->inodes_count);
inode_init_owner(inode, dir, mode);
inode->i_ino = ino;
@@ -451,6 +467,8 @@ int nilfs_read_inode_common(struct inode *inode,
inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+ if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode))
+ return -EIO; /* this inode is for metadata and corrupted */
if (inode->i_nlink == 0)
return -ESTALE; /* this inode is deleted */
@@ -844,9 +862,8 @@ repeat:
goto repeat;
failed:
- nilfs_msg(ii->vfs_inode.i_sb, KERN_WARNING,
- "error %d truncating bmap (ino=%lu)", ret,
- ii->vfs_inode.i_ino);
+ nilfs_warn(ii->vfs_inode.i_sb, "error %d truncating bmap (ino=%lu)",
+ ret, ii->vfs_inode.i_ino);
}
void nilfs_truncate(struct inode *inode)
@@ -912,6 +929,7 @@ void nilfs_evict_inode(struct inode *inode)
struct nilfs_transaction_info ti;
struct super_block *sb = inode->i_sb;
struct nilfs_inode_info *ii = NILFS_I(inode);
+ struct the_nilfs *nilfs;
int ret;
if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
@@ -924,6 +942,23 @@ void nilfs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
+ nilfs = sb->s_fs_info;
+ if (unlikely(sb_rdonly(sb) || !nilfs->ns_writer)) {
+ /*
+ * If this inode is about to be disposed after the file system
+ * has been degraded to read-only due to file system corruption
+ * or after the writer has been detached, do not make any
+ * changes that cause writes, just clear it.
+ * Do this check after read-locking ns_segctor_sem by
+ * nilfs_transaction_begin() in order to avoid a race with
+ * the writer detach operation.
+ */
+ clear_inode(inode);
+ nilfs_clear_inode(inode);
+ nilfs_transaction_abort(sb);
+ return;
+ }
+
/* TODO: some of the following operations may fail. */
nilfs_truncate_bmap(ii, 0);
nilfs_mark_inode_dirty(inode);
@@ -1000,7 +1035,7 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
int err;
spin_lock(&nilfs->ns_inode_lock);
- if (ii->i_bh == NULL) {
+ if (ii->i_bh == NULL || unlikely(!buffer_uptodate(ii->i_bh))) {
spin_unlock(&nilfs->ns_inode_lock);
err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
inode->i_ino, pbh);
@@ -1009,7 +1044,10 @@ int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
spin_lock(&nilfs->ns_inode_lock);
if (ii->i_bh == NULL)
ii->i_bh = *pbh;
- else {
+ else if (unlikely(!buffer_uptodate(ii->i_bh))) {
+ __brelse(ii->i_bh);
+ ii->i_bh = *pbh;
+ } else {
brelse(*pbh);
*pbh = ii->i_bh;
}
@@ -1058,9 +1096,9 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty)
* This will happen when somebody is freeing
* this inode.
*/
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "cannot set file dirty (ino=%lu): the file is being freed",
- inode->i_ino);
+ nilfs_warn(inode->i_sb,
+ "cannot set file dirty (ino=%lu): the file is being freed",
+ inode->i_ino);
spin_unlock(&nilfs->ns_inode_lock);
return -EINVAL; /*
* NILFS_I_DIRTY may remain for
@@ -1076,14 +1114,22 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned int nr_dirty)
int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
{
+ struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
struct buffer_head *ibh;
int err;
+ /*
+ * Do not dirty inodes after the log writer has been detached
+ * and its nilfs_root struct has been freed.
+ */
+ if (unlikely(nilfs_purging(nilfs)))
+ return 0;
+
err = nilfs_load_inode_block(inode, &ibh);
if (unlikely(err)) {
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "cannot mark inode dirty (ino=%lu): error %d loading inode block",
- inode->i_ino, err);
+ nilfs_warn(inode->i_sb,
+ "cannot mark inode dirty (ino=%lu): error %d loading inode block",
+ inode->i_ino, err);
return err;
}
nilfs_update_inode(inode, ibh, flags);
@@ -1109,8 +1155,8 @@ void nilfs_dirty_inode(struct inode *inode, int flags)
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
if (is_bad_inode(inode)) {
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "tried to mark bad_inode dirty. ignored.");
+ nilfs_warn(inode->i_sb,
+ "tried to mark bad_inode dirty. ignored.");
dump_stack();
return;
}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 91b9dac6b2cc..1a266a10d4cf 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -70,7 +70,7 @@ static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
if (argv->v_index > ~(__u64)0 - argv->v_nmembs)
return -EINVAL;
- buf = (void *)__get_free_pages(GFP_NOFS, 0);
+ buf = (void *)get_zeroed_page(GFP_NOFS);
if (unlikely(!buf))
return -ENOMEM;
maxmembs = PAGE_SIZE / argv->v_size;
@@ -569,25 +569,25 @@ static int nilfs_ioctl_move_inode_block(struct inode *inode,
if (unlikely(ret < 0)) {
if (ret == -ENOENT)
- nilfs_msg(inode->i_sb, KERN_CRIT,
- "%s: invalid virtual block address (%s): ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
- __func__, vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
+ nilfs_crit(inode->i_sb,
+ "%s: invalid virtual block address (%s): ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
return ret;
}
if (unlikely(!list_empty(&bh->b_assoc_buffers))) {
- nilfs_msg(inode->i_sb, KERN_CRIT,
- "%s: conflicting %s buffer: ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
- __func__, vdesc->vd_flags ? "node" : "data",
- (unsigned long long)vdesc->vd_ino,
- (unsigned long long)vdesc->vd_cno,
- (unsigned long long)vdesc->vd_offset,
- (unsigned long long)vdesc->vd_blocknr,
- (unsigned long long)vdesc->vd_vblocknr);
+ nilfs_crit(inode->i_sb,
+ "%s: conflicting %s buffer: ino=%llu, cno=%llu, offset=%llu, blocknr=%llu, vblocknr=%llu",
+ __func__, vdesc->vd_flags ? "node" : "data",
+ (unsigned long long)vdesc->vd_ino,
+ (unsigned long long)vdesc->vd_cno,
+ (unsigned long long)vdesc->vd_offset,
+ (unsigned long long)vdesc->vd_blocknr,
+ (unsigned long long)vdesc->vd_vblocknr);
brelse(bh);
return -EEXIST;
}
@@ -837,8 +837,7 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
return 0;
failed:
- nilfs_msg(nilfs->ns_sb, KERN_ERR, "error %d preparing GC: %s", ret,
- msg);
+ nilfs_err(nilfs->ns_sb, "error %d preparing GC: %s", ret, msg);
return ret;
}
@@ -947,7 +946,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
ret = nilfs_ioctl_move_blocks(inode->i_sb, &argv[0], kbufs[0]);
if (ret < 0) {
- nilfs_msg(inode->i_sb, KERN_ERR,
+ nilfs_err(inode->i_sb,
"error %d preparing GC: cannot read source blocks",
ret);
} else {
@@ -1130,7 +1129,14 @@ static int nilfs_ioctl_set_alloc_range(struct inode *inode, void __user *argp)
minseg = range[0] + segbytes - 1;
do_div(minseg, segbytes);
+
+ if (range[1] < 4096)
+ goto out;
+
maxseg = NILFS_SB2_OFFSET_BYTES(range[1]);
+ if (maxseg < segbytes)
+ goto out;
+
do_div(maxseg, segbytes);
maxseg--;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index 7c9055d767d1..e80ef2c0a785 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -199,7 +199,7 @@ static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
out_no_wait:
err = -EIO;
if (!buffer_uptodate(first_bh)) {
- nilfs_msg(inode->i_sb, KERN_ERR,
+ nilfs_err(inode->i_sb,
"I/O error reading meta-data file (ino=%lu, block-offset=%lu)",
inode->i_ino, block);
goto failed_bh;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 9fe6d4ab74f0..a6ec7961d4f5 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -272,9 +272,9 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
goto out;
if (!inode->i_nlink) {
- nilfs_msg(inode->i_sb, KERN_WARNING,
- "deleting nonexistent file (ino=%lu), %d",
- inode->i_ino, inode->i_nlink);
+ nilfs_warn(inode->i_sb,
+ "deleting nonexistent file (ino=%lu), %d",
+ inode->i_ino, inode->i_nlink);
set_nlink(inode, 1);
}
err = nilfs_delete_entry(de, page);
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index dc772eaa13cf..6b9383ba0049 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -319,6 +319,15 @@ void __nilfs_error(struct super_block *sb, const char *function,
#endif /* CONFIG_PRINTK */
+#define nilfs_crit(sb, fmt, ...) \
+ nilfs_msg(sb, KERN_CRIT, fmt, ##__VA_ARGS__)
+#define nilfs_err(sb, fmt, ...) \
+ nilfs_msg(sb, KERN_ERR, fmt, ##__VA_ARGS__)
+#define nilfs_warn(sb, fmt, ...) \
+ nilfs_msg(sb, KERN_WARNING, fmt, ##__VA_ARGS__)
+#define nilfs_info(sb, fmt, ...) \
+ nilfs_msg(sb, KERN_INFO, fmt, ##__VA_ARGS__)
+
extern struct nilfs_super_block *
nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
extern int nilfs_store_magic_and_option(struct super_block *,
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 98d21ad9a073..fc86cc7a26d7 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -370,7 +370,15 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
struct page *page = pvec.pages[i];
lock_page(page);
- nilfs_clear_dirty_page(page, silent);
+
+ /*
+ * This page may have been removed from the address
+ * space by truncation or invalidation when the lock
+ * was acquired. Skip processing in that case.
+ */
+ if (likely(page->mapping == mapping))
+ nilfs_clear_dirty_page(page, silent);
+
unlock_page(page);
}
pagevec_release(&pvec);
@@ -391,9 +399,8 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
BUG_ON(!PageLocked(page));
if (!silent)
- nilfs_msg(sb, KERN_WARNING,
- "discard dirty page: offset=%lld, ino=%lu",
- page_offset(page), inode->i_ino);
+ nilfs_warn(sb, "discard dirty page: offset=%lld, ino=%lu",
+ page_offset(page), inode->i_ino);
ClearPageUptodate(page);
ClearPageMappedToDisk(page);
@@ -409,9 +416,9 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
do {
lock_buffer(bh);
if (!silent)
- nilfs_msg(sb, KERN_WARNING,
- "discard dirty block: blocknr=%llu, size=%zu",
- (u64)bh->b_blocknr, bh->b_size);
+ nilfs_warn(sb,
+ "discard dirty block: blocknr=%llu, size=%zu",
+ (u64)bh->b_blocknr, bh->b_size);
set_mask_bits(&bh->b_state, clear_bits, 0);
unlock_buffer(bh);
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 140b663e91c7..0923231e9e60 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -51,7 +51,7 @@ static int nilfs_warn_segment_error(struct super_block *sb, int err)
switch (err) {
case NILFS_SEG_FAIL_IO:
- nilfs_msg(sb, KERN_ERR, "I/O error reading segment");
+ nilfs_err(sb, "I/O error reading segment");
return -EIO;
case NILFS_SEG_FAIL_MAGIC:
msg = "Magic number mismatch";
@@ -72,10 +72,10 @@ static int nilfs_warn_segment_error(struct super_block *sb, int err)
msg = "No super root in the last segment";
break;
default:
- nilfs_msg(sb, KERN_ERR, "unrecognized segment error %d", err);
+ nilfs_err(sb, "unrecognized segment error %d", err);
return -EINVAL;
}
- nilfs_msg(sb, KERN_WARNING, "invalid segment: %s", msg);
+ nilfs_warn(sb, "invalid segment: %s", msg);
return -EINVAL;
}
@@ -472,9 +472,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
struct nilfs_recovery_block *rb,
- struct page *page)
+ loff_t pos, struct page *page)
{
struct buffer_head *bh_org;
+ size_t from = pos & ~PAGE_MASK;
void *kaddr;
bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize);
@@ -482,7 +483,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
return -EIO;
kaddr = kmap_atomic(page);
- memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size);
+ memcpy(kaddr + from, bh_org->b_data, bh_org->b_size);
kunmap_atomic(kaddr);
brelse(bh_org);
return 0;
@@ -521,7 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
goto failed_inode;
}
- err = nilfs_recovery_copy_block(nilfs, rb, page);
+ err = nilfs_recovery_copy_block(nilfs, rb, pos, page);
if (unlikely(err))
goto failed_page;
@@ -543,10 +544,10 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
put_page(page);
failed_inode:
- nilfs_msg(sb, KERN_WARNING,
- "error %d recovering data block (ino=%lu, block-offset=%llu)",
- err, (unsigned long)rb->ino,
- (unsigned long long)rb->blkoff);
+ nilfs_warn(sb,
+ "error %d recovering data block (ino=%lu, block-offset=%llu)",
+ err, (unsigned long)rb->ino,
+ (unsigned long long)rb->blkoff);
if (!err2)
err2 = err;
next:
@@ -669,8 +670,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
}
if (nsalvaged_blocks) {
- nilfs_msg(sb, KERN_INFO, "salvaged %lu blocks",
- nsalvaged_blocks);
+ nilfs_info(sb, "salvaged %lu blocks", nsalvaged_blocks);
ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
}
out:
@@ -681,7 +681,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
confused:
err = -EINVAL;
failed:
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"error %d roll-forwarding partial segment at blocknr = %llu",
err, (unsigned long long)pseg_start);
goto out;
@@ -703,8 +703,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
set_buffer_dirty(bh);
err = sync_dirty_buffer(bh);
if (unlikely(err))
- nilfs_msg(nilfs->ns_sb, KERN_WARNING,
- "buffer sync write failed during post-cleaning of recovery.");
+ nilfs_warn(nilfs->ns_sb,
+ "buffer sync write failed during post-cleaning of recovery.");
brelse(bh);
}
@@ -739,8 +739,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
err = nilfs_attach_checkpoint(sb, ri->ri_cno, true, &root);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR,
- "error %d loading the latest checkpoint", err);
+ nilfs_err(sb, "error %d loading the latest checkpoint", err);
return err;
}
@@ -751,8 +750,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
err = nilfs_prepare_segment_for_recovery(nilfs, sb, ri);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR,
- "error %d preparing segment for recovery",
+ nilfs_err(sb, "error %d preparing segment for recovery",
err);
goto failed;
}
@@ -766,8 +764,7 @@ int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs,
nilfs_detach_log_writer(sb);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR,
- "error %d writing segment for recovery",
+ nilfs_err(sb, "error %d writing segment for recovery",
err);
goto failed;
}
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
index 20c479b5e41b..9f435879a048 100644
--- a/fs/nilfs2/segbuf.c
+++ b/fs/nilfs2/segbuf.c
@@ -101,6 +101,12 @@ int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf)
if (unlikely(!bh))
return -ENOMEM;
+ lock_buffer(bh);
+ if (!buffer_uptodate(bh)) {
+ memset(bh->b_data, 0, bh->b_size);
+ set_buffer_uptodate(bh);
+ }
+ unlock_buffer(bh);
nilfs_segbuf_add_segsum_buffer(segbuf, bh);
return 0;
}
@@ -505,7 +511,7 @@ static int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf)
} while (--segbuf->sb_nbio > 0);
if (unlikely(atomic_read(&segbuf->sb_err) > 0)) {
- nilfs_msg(segbuf->sb_super, KERN_ERR,
+ nilfs_err(segbuf->sb_super,
"I/O error writing log (start-blocknr=%llu, block-count=%lu) in segment %llu",
(unsigned long long)segbuf->sb_pseg_start,
segbuf->sb_sum.nblocks,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index eb3ac7619088..d4610f71d21b 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -158,7 +158,7 @@ static int nilfs_prepare_segment_lock(struct super_block *sb,
* it is saved and will be restored on
* nilfs_transaction_commit().
*/
- nilfs_msg(sb, KERN_WARNING, "journal info from a different FS");
+ nilfs_warn(sb, "journal info from a different FS");
save = current->journal_info;
}
if (!ti) {
@@ -322,7 +322,7 @@ void nilfs_relax_pressure_in_lock(struct super_block *sb)
struct the_nilfs *nilfs = sb->s_fs_info;
struct nilfs_sc_info *sci = nilfs->ns_writer;
- if (!sci || !sci->sc_flush_request)
+ if (sb_rdonly(sb) || unlikely(!sci) || !sci->sc_flush_request)
return;
set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
@@ -435,6 +435,23 @@ static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
return 0;
}
+/**
+ * nilfs_segctor_zeropad_segsum - zero pad the rest of the segment summary area
+ * @sci: segment constructor object
+ *
+ * nilfs_segctor_zeropad_segsum() zero-fills unallocated space at the end of
+ * the current segment summary block.
+ */
+static void nilfs_segctor_zeropad_segsum(struct nilfs_sc_info *sci)
+{
+ struct nilfs_segsum_pointer *ssp;
+
+ ssp = sci->sc_blk_cnt > 0 ? &sci->sc_binfo_ptr : &sci->sc_finfo_ptr;
+ if (ssp->offset < ssp->bh->b_size)
+ memset(ssp->bh->b_data + ssp->offset, 0,
+ ssp->bh->b_size - ssp->offset);
+}
+
static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
{
sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
@@ -443,6 +460,7 @@ static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
* The current segment is filled up
* (internal code)
*/
+ nilfs_segctor_zeropad_segsum(sci);
sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
return nilfs_segctor_reset_segment_buffer(sci);
}
@@ -547,6 +565,7 @@ static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
goto retry;
}
if (unlikely(required)) {
+ nilfs_segctor_zeropad_segsum(sci);
err = nilfs_segbuf_extend_segsum(segbuf);
if (unlikely(err))
goto failed;
@@ -711,6 +730,11 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
struct page *page = pvec.pages[i];
lock_page(page);
+ if (unlikely(page->mapping != mapping)) {
+ /* Exclude pages removed from the address space */
+ unlock_page(page);
+ continue;
+ }
if (!page_has_buffers(page))
create_empty_buffers(page, i_blocksize(inode), 0);
unlock_page(page);
@@ -880,9 +904,11 @@ static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
nilfs_cpfile_put_checkpoint(
nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
- } else
- WARN_ON(err == -EINVAL || err == -ENOENT);
-
+ } else if (err == -EINVAL || err == -ENOENT) {
+ nilfs_error(sci->sc_super,
+ "checkpoint creation failed due to metadata corruption.");
+ err = -EIO;
+ }
return err;
}
@@ -896,7 +922,11 @@ static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
&raw_cp, &bh_cp);
if (unlikely(err)) {
- WARN_ON(err == -EINVAL || err == -ENOENT);
+ if (err == -EINVAL || err == -ENOENT) {
+ nilfs_error(sci->sc_super,
+ "checkpoint finalization failed due to metadata corruption.");
+ err = -EIO;
+ }
goto failed_ibh;
}
raw_cp->cp_snapshot_list.ssl_next = 0;
@@ -959,10 +989,13 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
unsigned int isz, srsz;
bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
+
+ lock_buffer(bh_sr);
raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
isz = nilfs->ns_inode_size;
srsz = NILFS_SR_BYTES(isz);
+ raw_sr->sr_sum = 0; /* Ensure initialization within this update */
raw_sr->sr_bytes = cpu_to_le16(srsz);
raw_sr->sr_nongc_ctime
= cpu_to_le64(nilfs_doing_gc() ?
@@ -976,6 +1009,8 @@ static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
NILFS_SR_SUFILE_OFFSET(isz), 1);
memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz);
+ set_buffer_uptodate(bh_sr);
+ unlock_buffer(bh_sr);
}
static void nilfs_redirty_inodes(struct list_head *head)
@@ -1525,6 +1560,7 @@ static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
sci->sc_stage = prev_stage;
}
+ nilfs_segctor_zeropad_segsum(sci);
nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile);
return 0;
@@ -1666,7 +1702,6 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
if (bh->b_page != bd_page) {
lock_page(bd_page);
@@ -1677,6 +1712,7 @@ static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
}
break;
}
+ set_buffer_async_write(bh);
if (bh->b_page != fs_page) {
nilfs_begin_page_io(fs_page);
fs_page = bh->b_page;
@@ -1752,6 +1788,7 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(segbuf, logs, sb_list) {
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
+ clear_buffer_uptodate(bh);
if (bh->b_page != bd_page) {
if (bd_page)
end_page_writeback(bd_page);
@@ -1761,14 +1798,15 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
+ clear_buffer_uptodate(bh);
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
bd_page = bh->b_page;
}
break;
}
+ clear_buffer_async_write(bh);
if (bh->b_page != fs_page) {
nilfs_end_page_io(fs_page, err);
fs_page = bh->b_page;
@@ -1856,8 +1894,9 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Redirected));
- set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
if (bh->b_page != bd_page) {
end_page_writeback(bd_page);
bd_page = bh->b_page;
@@ -1865,6 +1904,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
update_sr = true;
break;
}
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh->b_page != fs_page) {
nilfs_end_page_io(fs_page, 0);
fs_page = bh->b_page;
@@ -1943,9 +1983,9 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
err = nilfs_ifile_get_inode_block(
ifile, ii->vfs_inode.i_ino, &ibh);
if (unlikely(err)) {
- nilfs_msg(sci->sc_super, KERN_WARNING,
- "log writer: error %d getting inode block (ino=%lu)",
- err, ii->vfs_inode.i_ino);
+ nilfs_warn(sci->sc_super,
+ "log writer: error %d getting inode block (ino=%lu)",
+ err, ii->vfs_inode.i_ino);
return err;
}
spin_lock(&nilfs->ns_inode_lock);
@@ -2013,6 +2053,9 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
int err;
+ if (sb_rdonly(sci->sc_super))
+ return -EROFS;
+
nilfs_sc_cstage_set(sci, NILFS_ST_INIT);
sci->sc_cno = nilfs->ns_cno;
@@ -2237,7 +2280,7 @@ int nilfs_construct_segment(struct super_block *sb)
struct nilfs_transaction_info *ti;
int err;
- if (!sci)
+ if (sb_rdonly(sb) || unlikely(!sci))
return -EROFS;
/* A call inside transactions causes a deadlock. */
@@ -2276,7 +2319,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
struct nilfs_transaction_info ti;
int err = 0;
- if (!sci)
+ if (sb_rdonly(sb) || unlikely(!sci))
return -EROFS;
nilfs_transaction_lock(sb, &ti, 0);
@@ -2452,7 +2495,7 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
if (likely(!err))
break;
- nilfs_msg(sb, KERN_WARNING, "error %d cleaning segments", err);
+ nilfs_warn(sb, "error %d cleaning segments", err);
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(sci->sc_interval);
}
@@ -2460,9 +2503,9 @@ int nilfs_clean_segments(struct super_block *sb, struct nilfs_argv *argv,
int ret = nilfs_discard_segments(nilfs, sci->sc_freesegs,
sci->sc_nfreesegs);
if (ret) {
- nilfs_msg(sb, KERN_WARNING,
- "error %d on discard request, turning discards off for the device",
- ret);
+ nilfs_warn(sb,
+ "error %d on discard request, turning discards off for the device",
+ ret);
nilfs_clear_opt(nilfs, DISCARD);
}
}
@@ -2543,9 +2586,9 @@ static int nilfs_segctor_thread(void *arg)
/* start sync. */
sci->sc_task = current;
wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
- nilfs_msg(sci->sc_super, KERN_INFO,
- "segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds",
- sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
+ nilfs_info(sci->sc_super,
+ "segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds",
+ sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
spin_lock(&sci->sc_state_lock);
loop:
@@ -2603,11 +2646,10 @@ static int nilfs_segctor_thread(void *arg)
goto loop;
end_thread:
- spin_unlock(&sci->sc_state_lock);
-
/* end sync. */
sci->sc_task = NULL;
wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
+ spin_unlock(&sci->sc_state_lock);
return 0;
}
@@ -2619,8 +2661,8 @@ static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
if (IS_ERR(t)) {
int err = PTR_ERR(t);
- nilfs_msg(sci->sc_super, KERN_ERR,
- "error %d creating segctord thread", err);
+ nilfs_err(sci->sc_super, "error %d creating segctord thread",
+ err);
return err;
}
wait_event(sci->sc_wait_task, sci->sc_task != NULL);
@@ -2699,7 +2741,7 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
flush_work(&sci->sc_iput_work);
- } while (ret && retrycount-- > 0);
+ } while (ret && ret != -EROFS && retrycount-- > 0);
}
/**
@@ -2730,14 +2772,14 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
nilfs_segctor_write_out(sci);
if (!list_empty(&sci->sc_dirty_files)) {
- nilfs_msg(sci->sc_super, KERN_WARNING,
- "disposed unprocessed dirty file(s) when stopping log writer");
+ nilfs_warn(sci->sc_super,
+ "disposed unprocessed dirty file(s) when stopping log writer");
nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
}
if (!list_empty(&sci->sc_iput_queue)) {
- nilfs_msg(sci->sc_super, KERN_WARNING,
- "disposed unprocessed inode(s) in iput queue when stopping log writer");
+ nilfs_warn(sci->sc_super,
+ "disposed unprocessed inode(s) in iput queue when stopping log writer");
nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
}
@@ -2772,11 +2814,12 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
if (nilfs->ns_writer) {
/*
- * This happens if the filesystem was remounted
- * read/write after nilfs_error degenerated it into a
- * read-only mount.
+ * This happens if the filesystem is made read-only by
+ * __nilfs_error or nilfs_remount and then remounted
+ * read/write. In these cases, reuse the existing
+ * writer.
*/
- nilfs_detach_log_writer(sb);
+ return 0;
}
nilfs->ns_writer = nilfs_segctor_new(sb, root);
@@ -2786,10 +2829,9 @@ int nilfs_attach_log_writer(struct super_block *sb, struct nilfs_root *root)
inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL);
err = nilfs_segctor_start_thread(nilfs->ns_writer);
- if (err) {
- kfree(nilfs->ns_writer);
- nilfs->ns_writer = NULL;
- }
+ if (unlikely(err))
+ nilfs_detach_log_writer(sb);
+
return err;
}
@@ -2810,16 +2852,18 @@ void nilfs_detach_log_writer(struct super_block *sb)
nilfs_segctor_destroy(nilfs->ns_writer);
nilfs->ns_writer = NULL;
}
+ set_nilfs_purging(nilfs);
/* Force to free the list of dirty files */
spin_lock(&nilfs->ns_inode_lock);
if (!list_empty(&nilfs->ns_dirty_files)) {
list_splice_init(&nilfs->ns_dirty_files, &garbage_list);
- nilfs_msg(sb, KERN_WARNING,
- "disposed unprocessed dirty file(s) when detaching log writer");
+ nilfs_warn(sb,
+ "disposed unprocessed dirty file(s) when detaching log writer");
}
spin_unlock(&nilfs->ns_inode_lock);
up_write(&nilfs->ns_segctor_sem);
nilfs_dispose_list(nilfs, &garbage_list, 1);
+ clear_nilfs_purging(nilfs);
}
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index bf3f8f05c89b..379db0c54227 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -171,9 +171,9 @@ int nilfs_sufile_updatev(struct inode *sufile, __u64 *segnumv, size_t nsegs,
down_write(&NILFS_MDT(sufile)->mi_sem);
for (seg = segnumv; seg < segnumv + nsegs; seg++) {
if (unlikely(*seg >= nilfs_sufile_get_nsegments(sufile))) {
- nilfs_msg(sufile->i_sb, KERN_WARNING,
- "%s: invalid segment number: %llu",
- __func__, (unsigned long long)*seg);
+ nilfs_warn(sufile->i_sb,
+ "%s: invalid segment number: %llu",
+ __func__, (unsigned long long)*seg);
nerr++;
}
}
@@ -230,9 +230,8 @@ int nilfs_sufile_update(struct inode *sufile, __u64 segnum, int create,
int ret;
if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
- nilfs_msg(sufile->i_sb, KERN_WARNING,
- "%s: invalid segment number: %llu",
- __func__, (unsigned long long)segnum);
+ nilfs_warn(sufile->i_sb, "%s: invalid segment number: %llu",
+ __func__, (unsigned long long)segnum);
return -EINVAL;
}
down_write(&NILFS_MDT(sufile)->mi_sem);
@@ -410,9 +409,8 @@ void nilfs_sufile_do_cancel_free(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (unlikely(!nilfs_segment_usage_clean(su))) {
- nilfs_msg(sufile->i_sb, KERN_WARNING,
- "%s: segment %llu must be clean", __func__,
- (unsigned long long)segnum);
+ nilfs_warn(sufile->i_sb, "%s: segment %llu must be clean",
+ __func__, (unsigned long long)segnum);
kunmap_atomic(kaddr);
return;
}
@@ -468,9 +466,8 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(su_bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
if (nilfs_segment_usage_clean(su)) {
- nilfs_msg(sufile->i_sb, KERN_WARNING,
- "%s: segment %llu is already clean",
- __func__, (unsigned long long)segnum);
+ nilfs_warn(sufile->i_sb, "%s: segment %llu is already clean",
+ __func__, (unsigned long long)segnum);
kunmap_atomic(kaddr);
return;
}
@@ -498,14 +495,45 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
{
struct buffer_head *bh;
+ void *kaddr;
+ struct nilfs_segment_usage *su;
int ret;
+ down_write(&NILFS_MDT(sufile)->mi_sem);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
- if (!ret) {
+ if (ret)
+ goto out_sem;
+
+ kaddr = kmap_atomic(bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+ if (unlikely(nilfs_segment_usage_error(su))) {
+ struct the_nilfs *nilfs = sufile->i_sb->s_fs_info;
+
+ kunmap_atomic(kaddr);
+ brelse(bh);
+ if (nilfs_segment_is_active(nilfs, segnum)) {
+ nilfs_error(sufile->i_sb,
+ "active segment %llu is erroneous",
+ (unsigned long long)segnum);
+ } else {
+ /*
+ * Segments marked erroneous are never allocated by
+ * nilfs_sufile_alloc(); only active segments, ie,
+ * the segments indexed by ns_segnum or ns_nextnum,
+ * can be erroneous here.
+ */
+ WARN_ON_ONCE(1);
+ }
+ ret = -EIO;
+ } else {
+ nilfs_segment_usage_set_dirty(su);
+ kunmap_atomic(kaddr);
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(sufile);
brelse(bh);
}
+out_sem:
+ up_write(&NILFS_MDT(sufile)->mi_sem);
return ret;
}
@@ -531,9 +559,14 @@ int nilfs_sufile_set_segment_usage(struct inode *sufile, __u64 segnum,
kaddr = kmap_atomic(bh->b_page);
su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
- WARN_ON(nilfs_segment_usage_error(su));
- if (modtime)
+ if (modtime) {
+ /*
+ * Check segusage error and set su_lastmod only when updating
+ * this entry with a valid timestamp, not for cancellation.
+ */
+ WARN_ON_ONCE(nilfs_segment_usage_error(su));
su->su_lastmod = cpu_to_le64(modtime);
+ }
su->su_nblocks = cpu_to_le32(nblocks);
kunmap_atomic(kaddr);
@@ -774,6 +807,15 @@ int nilfs_sufile_resize(struct inode *sufile, __u64 newnsegs)
goto out_header;
sui->ncleansegs -= nsegs - newnsegs;
+
+ /*
+ * If the sufile is successfully truncated, immediately adjust
+ * the segment allocation space while locking the semaphore
+ * "mi_sem" so that nilfs_sufile_alloc() never allocates
+ * segments in the truncated space.
+ */
+ sui->allocmax = newnsegs - 1;
+ sui->allocmin = 0;
}
kaddr = kmap_atomic(header_bh->b_page);
@@ -1168,12 +1210,12 @@ int nilfs_sufile_read(struct super_block *sb, size_t susize,
int err;
if (susize > sb->s_blocksize) {
- nilfs_msg(sb, KERN_ERR,
- "too large segment usage size: %zu bytes", susize);
+ nilfs_err(sb, "too large segment usage size: %zu bytes",
+ susize);
return -EINVAL;
} else if (susize < NILFS_MIN_SEGMENT_USAGE_SIZE) {
- nilfs_msg(sb, KERN_ERR,
- "too small segment usage size: %zu bytes", susize);
+ nilfs_err(sb, "too small segment usage size: %zu bytes",
+ susize);
return -EINVAL;
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index b5bc9f0c6a40..59e74e764cd2 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -106,7 +106,7 @@ static void nilfs_set_error(struct super_block *sb)
*
* This implements the body of nilfs_error() macro. Normally,
* nilfs_error() should be used. As for sustainable errors such as a
- * single-shot I/O error, nilfs_msg() should be used instead.
+ * single-shot I/O error, nilfs_err() should be used instead.
*
* Callers should not add a trailing newline since this will do it.
*/
@@ -179,8 +179,7 @@ static int nilfs_sync_super(struct super_block *sb, int flag)
}
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR, "unable to write superblock: err=%d",
- err);
+ nilfs_err(sb, "unable to write superblock: err=%d", err);
if (err == -EIO && nilfs->ns_sbh[1]) {
/*
* sbp[0] points to newer log than sbp[1],
@@ -250,7 +249,7 @@ struct nilfs_super_block **nilfs_prepare_super(struct super_block *sb,
sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) {
memcpy(sbp[0], sbp[1], nilfs->ns_sbsize);
} else {
- nilfs_msg(sb, KERN_CRIT, "superblock broke");
+ nilfs_crit(sb, "superblock broke");
return NULL;
}
} else if (sbp[1] &&
@@ -360,17 +359,38 @@ static int nilfs_move_2nd_super(struct super_block *sb, loff_t sb2off)
offset = sb2off & (nilfs->ns_blocksize - 1);
nsbh = sb_getblk(sb, newblocknr);
if (!nsbh) {
- nilfs_msg(sb, KERN_WARNING,
- "unable to move secondary superblock to block %llu",
- (unsigned long long)newblocknr);
+ nilfs_warn(sb,
+ "unable to move secondary superblock to block %llu",
+ (unsigned long long)newblocknr);
ret = -EIO;
goto out;
}
nsbp = (void *)nsbh->b_data + offset;
- memset(nsbp, 0, nilfs->ns_blocksize);
+ lock_buffer(nsbh);
if (sb2i >= 0) {
+ /*
+ * The position of the second superblock only changes by 4KiB,
+ * which is larger than the maximum superblock data size
+ * (= 1KiB), so there is no need to use memmove() to allow
+ * overlap between source and destination.
+ */
memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize);
+
+ /*
+ * Zero fill after copy to avoid overwriting in case of move
+ * within the same block.
+ */
+ memset(nsbh->b_data, 0, offset);
+ memset((void *)nsbp + nilfs->ns_sbsize, 0,
+ nsbh->b_size - offset - nilfs->ns_sbsize);
+ } else {
+ memset(nsbh->b_data, 0, nsbh->b_size);
+ }
+ set_buffer_uptodate(nsbh);
+ unlock_buffer(nsbh);
+
+ if (sb2i >= 0) {
brelse(nilfs->ns_sbh[sb2i]);
nilfs->ns_sbh[sb2i] = nsbh;
nilfs->ns_sbp[sb2i] = nsbp;
@@ -404,6 +424,15 @@ int nilfs_resize_fs(struct super_block *sb, __u64 newsize)
goto out;
/*
+ * Prevent underflow in second superblock position calculation.
+ * The exact minimum size check is done in nilfs_sufile_resize().
+ */
+ if (newsize < 4096) {
+ ret = -ENOSPC;
+ goto out;
+ }
+
+ /*
* Write lock is required to protect some functions depending
* on the number of segments, the number of reserved segments,
* and so forth.
@@ -468,6 +497,7 @@ static void nilfs_put_super(struct super_block *sb)
up_write(&nilfs->ns_sem);
}
+ nilfs_sysfs_delete_device_group(nilfs);
iput(nilfs->ns_sufile);
iput(nilfs->ns_cpfile);
iput(nilfs->ns_dat);
@@ -525,7 +555,7 @@ int nilfs_attach_checkpoint(struct super_block *sb, __u64 cno, int curr_mnt,
up_read(&nilfs->ns_segctor_sem);
if (unlikely(err)) {
if (err == -ENOENT || err == -EINVAL) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"Invalid checkpoint (checkpoint number=%llu)",
(unsigned long long)cno);
err = -EINVAL;
@@ -623,8 +653,7 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
err = nilfs_ifile_count_free_inodes(root->ifile,
&nmaxinodes, &nfreeinodes);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_WARNING,
- "failed to count free inodes: err=%d", err);
+ nilfs_warn(sb, "failed to count free inodes: err=%d", err);
if (err == -ERANGE) {
/*
* If nilfs_palloc_count_max_entries() returns
@@ -756,7 +785,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
break;
case Opt_snapshot:
if (is_remount) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"\"%s\" option is invalid for remount",
p);
return 0;
@@ -772,8 +801,7 @@ static int parse_options(char *options, struct super_block *sb, int is_remount)
nilfs_clear_opt(nilfs, DISCARD);
break;
default:
- nilfs_msg(sb, KERN_ERR,
- "unrecognized mount option \"%s\"", p);
+ nilfs_err(sb, "unrecognized mount option \"%s\"", p);
return 0;
}
}
@@ -809,10 +837,10 @@ static int nilfs_setup_super(struct super_block *sb, int is_mount)
mnt_count = le16_to_cpu(sbp[0]->s_mnt_count);
if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
- nilfs_msg(sb, KERN_WARNING, "mounting fs with errors");
+ nilfs_warn(sb, "mounting fs with errors");
#if 0
} else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
- nilfs_msg(sb, KERN_WARNING, "maximal mount count reached");
+ nilfs_warn(sb, "maximal mount count reached");
#endif
}
if (!max_mnt_count)
@@ -875,7 +903,7 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
features = le64_to_cpu(sbp->s_feature_incompat) &
~NILFS_FEATURE_INCOMPAT_SUPP;
if (features) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"couldn't mount because of unsupported optional features (%llx)",
(unsigned long long)features);
return -EINVAL;
@@ -883,7 +911,7 @@ int nilfs_check_feature_compatibility(struct super_block *sb,
features = le64_to_cpu(sbp->s_feature_compat_ro) &
~NILFS_FEATURE_COMPAT_RO_SUPP;
if (!sb_rdonly(sb) && features) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"couldn't mount RDWR because of unsupported optional features (%llx)",
(unsigned long long)features);
return -EINVAL;
@@ -902,12 +930,12 @@ static int nilfs_get_root_dentry(struct super_block *sb,
inode = nilfs_iget(sb, root, NILFS_ROOT_INO);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
- nilfs_msg(sb, KERN_ERR, "error %d getting root inode", ret);
+ nilfs_err(sb, "error %d getting root inode", ret);
goto out;
}
if (!S_ISDIR(inode->i_mode) || !inode->i_blocks || !inode->i_size) {
iput(inode);
- nilfs_msg(sb, KERN_ERR, "corrupt root inode");
+ nilfs_err(sb, "corrupt root inode");
ret = -EINVAL;
goto out;
}
@@ -935,7 +963,7 @@ static int nilfs_get_root_dentry(struct super_block *sb,
return ret;
failed_dentry:
- nilfs_msg(sb, KERN_ERR, "error %d getting root dentry", ret);
+ nilfs_err(sb, "error %d getting root dentry", ret);
goto out;
}
@@ -955,7 +983,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
ret = (ret == -ENOENT) ? -EINVAL : ret;
goto out;
} else if (!ret) {
- nilfs_msg(s, KERN_ERR,
+ nilfs_err(s,
"The specified checkpoint is not a snapshot (checkpoint number=%llu)",
(unsigned long long)cno);
ret = -EINVAL;
@@ -964,7 +992,7 @@ static int nilfs_attach_snapshot(struct super_block *s, __u64 cno,
ret = nilfs_attach_checkpoint(s, cno, false, &root);
if (ret) {
- nilfs_msg(s, KERN_ERR,
+ nilfs_err(s,
"error %d while loading snapshot (checkpoint number=%llu)",
ret, (unsigned long long)cno);
goto out;
@@ -1061,7 +1089,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
cno = nilfs_last_cno(nilfs);
err = nilfs_attach_checkpoint(sb, cno, true, &fsroot);
if (err) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"error %d while loading last checkpoint (checkpoint number=%llu)",
err, (unsigned long long)cno);
goto failed_unload;
@@ -1094,6 +1122,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
nilfs_put_root(fsroot);
failed_unload:
+ nilfs_sysfs_delete_device_group(nilfs);
iput(nilfs->ns_sufile);
iput(nilfs->ns_cpfile);
iput(nilfs->ns_dat);
@@ -1123,16 +1152,14 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
err = -EINVAL;
if (!nilfs_valid_fs(nilfs)) {
- nilfs_msg(sb, KERN_WARNING,
- "couldn't remount because the filesystem is in an incomplete recovery state");
+ nilfs_warn(sb,
+ "couldn't remount because the filesystem is in an incomplete recovery state");
goto restore_opts;
}
if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
if (*flags & SB_RDONLY) {
- /* Shutting down log writer */
- nilfs_detach_log_writer(sb);
sb->s_flags |= SB_RDONLY;
/*
@@ -1156,9 +1183,9 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
~NILFS_FEATURE_COMPAT_RO_SUPP;
up_read(&nilfs->ns_sem);
if (features) {
- nilfs_msg(sb, KERN_WARNING,
- "couldn't remount RDWR because of unsupported optional features (%llx)",
- (unsigned long long)features);
+ nilfs_warn(sb,
+ "couldn't remount RDWR because of unsupported optional features (%llx)",
+ (unsigned long long)features);
err = -EROFS;
goto restore_opts;
}
@@ -1217,7 +1244,7 @@ static int nilfs_parse_snapshot_option(const char *option,
return 0;
parse_error:
- nilfs_msg(NULL, KERN_ERR, "invalid option \"%s\": %s", option, msg);
+ nilfs_err(NULL, "invalid option \"%s\": %s", option, msg);
return 1;
}
@@ -1320,7 +1347,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
} else if (!sd.cno) {
if (nilfs_tree_is_busy(s->s_root)) {
if ((flags ^ s->s_flags) & SB_RDONLY) {
- nilfs_msg(s, KERN_ERR,
+ nilfs_err(s,
"the device already has a %s mount.",
sb_rdonly(s) ? "read-only" : "read/write");
err = -EBUSY;
diff --git a/fs/nilfs2/sysfs.c b/fs/nilfs2/sysfs.c
index 28a2db3b1787..57afd06db62d 100644
--- a/fs/nilfs2/sysfs.c
+++ b/fs/nilfs2/sysfs.c
@@ -261,8 +261,8 @@ nilfs_checkpoints_checkpoints_number_show(struct nilfs_checkpoints_attr *attr,
err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "unable to get checkpoint stat: err=%d", err);
+ nilfs_err(nilfs->ns_sb, "unable to get checkpoint stat: err=%d",
+ err);
return err;
}
@@ -284,8 +284,8 @@ nilfs_checkpoints_snapshots_number_show(struct nilfs_checkpoints_attr *attr,
err = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "unable to get checkpoint stat: err=%d", err);
+ nilfs_err(nilfs->ns_sb, "unable to get checkpoint stat: err=%d",
+ err);
return err;
}
@@ -403,8 +403,8 @@ nilfs_segments_dirty_segments_show(struct nilfs_segments_attr *attr,
err = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
up_read(&nilfs->ns_segctor_sem);
if (err < 0) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "unable to get segment stat: err=%d", err);
+ nilfs_err(nilfs->ns_sb, "unable to get segment stat: err=%d",
+ err);
return err;
}
@@ -777,15 +777,15 @@ nilfs_superblock_sb_update_frequency_store(struct nilfs_superblock_attr *attr,
err = kstrtouint(skip_spaces(buf), 0, &val);
if (err) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "unable to convert string: err=%d", err);
+ nilfs_err(nilfs->ns_sb, "unable to convert string: err=%d",
+ err);
return err;
}
if (val < NILFS_SB_FREQ) {
val = NILFS_SB_FREQ;
- nilfs_msg(nilfs->ns_sb, KERN_WARNING,
- "superblock update frequency cannot be lesser than 10 seconds");
+ nilfs_warn(nilfs->ns_sb,
+ "superblock update frequency cannot be lesser than 10 seconds");
}
down_write(&nilfs->ns_sem);
@@ -988,8 +988,7 @@ int nilfs_sysfs_create_device_group(struct super_block *sb)
nilfs->ns_dev_subgroups = kzalloc(devgrp_size, GFP_KERNEL);
if (unlikely(!nilfs->ns_dev_subgroups)) {
err = -ENOMEM;
- nilfs_msg(sb, KERN_ERR,
- "unable to allocate memory for device group");
+ nilfs_err(sb, "unable to allocate memory for device group");
goto failed_create_device_group;
}
@@ -1098,15 +1097,13 @@ int __init nilfs_sysfs_init(void)
nilfs_kset = kset_create_and_add(NILFS_ROOT_GROUP_NAME, NULL, fs_kobj);
if (!nilfs_kset) {
err = -ENOMEM;
- nilfs_msg(NULL, KERN_ERR,
- "unable to create sysfs entry: err=%d", err);
+ nilfs_err(NULL, "unable to create sysfs entry: err=%d", err);
goto failed_sysfs_init;
}
err = sysfs_create_group(&nilfs_kset->kobj, &nilfs_feature_attr_group);
if (unlikely(err)) {
- nilfs_msg(NULL, KERN_ERR,
- "unable to create feature group: err=%d", err);
+ nilfs_err(NULL, "unable to create feature group: err=%d", err);
goto cleanup_sysfs_init;
}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 931870768556..0480034644aa 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -13,6 +13,7 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/random.h>
+#include <linux/log2.h>
#include <linux/crc32.h>
#include "nilfs.h"
#include "segment.h"
@@ -86,7 +87,6 @@ void destroy_nilfs(struct the_nilfs *nilfs)
{
might_sleep();
if (nilfs_init(nilfs)) {
- nilfs_sysfs_delete_device_group(nilfs);
brelse(nilfs->ns_sbh[0]);
brelse(nilfs->ns_sbh[1]);
}
@@ -183,7 +183,7 @@ static int nilfs_store_log_cursor(struct the_nilfs *nilfs,
nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
nilfs->ns_cno = nilfs->ns_last_cno + 1;
if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ nilfs_err(nilfs->ns_sb,
"pointed segment number is out of range: segnum=%llu, nsegments=%lu",
(unsigned long long)nilfs->ns_segnum,
nilfs->ns_nsegments);
@@ -210,12 +210,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
int err;
if (!valid_fs) {
- nilfs_msg(sb, KERN_WARNING, "mounting unchecked fs");
+ nilfs_warn(sb, "mounting unchecked fs");
if (s_flags & SB_RDONLY) {
- nilfs_msg(sb, KERN_INFO,
- "recovery required for readonly filesystem");
- nilfs_msg(sb, KERN_INFO,
- "write access will be enabled during recovery");
+ nilfs_info(sb,
+ "recovery required for readonly filesystem");
+ nilfs_info(sb,
+ "write access will be enabled during recovery");
}
}
@@ -230,12 +230,11 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
goto scan_error;
if (!nilfs_valid_sb(sbp[1])) {
- nilfs_msg(sb, KERN_WARNING,
- "unable to fall back to spare super block");
+ nilfs_warn(sb,
+ "unable to fall back to spare super block");
goto scan_error;
}
- nilfs_msg(sb, KERN_INFO,
- "trying rollback from an earlier position");
+ nilfs_info(sb, "trying rollback from an earlier position");
/*
* restore super block with its spare and reconfigure
@@ -248,9 +247,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
/* verify consistency between two super blocks */
blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size);
if (blocksize != nilfs->ns_blocksize) {
- nilfs_msg(sb, KERN_WARNING,
- "blocksize differs between two super blocks (%d != %d)",
- blocksize, nilfs->ns_blocksize);
+ nilfs_warn(sb,
+ "blocksize differs between two super blocks (%d != %d)",
+ blocksize, nilfs->ns_blocksize);
goto scan_error;
}
@@ -269,11 +268,14 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
err = nilfs_load_super_root(nilfs, sb, ri.ri_super_root);
if (unlikely(err)) {
- nilfs_msg(sb, KERN_ERR, "error %d while loading super root",
- err);
+ nilfs_err(sb, "error %d while loading super root", err);
goto failed;
}
+ err = nilfs_sysfs_create_device_group(sb);
+ if (unlikely(err))
+ goto sysfs_error;
+
if (valid_fs)
goto skip_recovery;
@@ -281,28 +283,28 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
__u64 features;
if (nilfs_test_opt(nilfs, NORECOVERY)) {
- nilfs_msg(sb, KERN_INFO,
- "norecovery option specified, skipping roll-forward recovery");
+ nilfs_info(sb,
+ "norecovery option specified, skipping roll-forward recovery");
goto skip_recovery;
}
features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) &
~NILFS_FEATURE_COMPAT_RO_SUPP;
if (features) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"couldn't proceed with recovery because of unsupported optional features (%llx)",
(unsigned long long)features);
err = -EROFS;
goto failed_unload;
}
if (really_read_only) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"write access unavailable, cannot proceed");
err = -EROFS;
goto failed_unload;
}
sb->s_flags &= ~SB_RDONLY;
} else if (nilfs_test_opt(nilfs, NORECOVERY)) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"recovery cancelled because norecovery option was specified for a read/write mount");
err = -EINVAL;
goto failed_unload;
@@ -318,12 +320,12 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
up_write(&nilfs->ns_sem);
if (err) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"error %d updating super block. recovery unfinished.",
err);
goto failed_unload;
}
- nilfs_msg(sb, KERN_INFO, "recovery complete");
+ nilfs_info(sb, "recovery complete");
skip_recovery:
nilfs_clear_recovery_info(&ri);
@@ -331,10 +333,13 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
return 0;
scan_error:
- nilfs_msg(sb, KERN_ERR, "error %d while searching super root", err);
+ nilfs_err(sb, "error %d while searching super root", err);
goto failed;
failed_unload:
+ nilfs_sysfs_delete_device_group(nilfs);
+
+ sysfs_error:
iput(nilfs->ns_cpfile);
iput(nilfs->ns_sufile);
iput(nilfs->ns_dat);
@@ -368,6 +373,18 @@ unsigned long nilfs_nrsvsegs(struct the_nilfs *nilfs, unsigned long nsegs)
100));
}
+/**
+ * nilfs_max_segment_count - calculate the maximum number of segments
+ * @nilfs: nilfs object
+ */
+static u64 nilfs_max_segment_count(struct the_nilfs *nilfs)
+{
+ u64 max_count = U64_MAX;
+
+ do_div(max_count, nilfs->ns_blocks_per_segment);
+ return min_t(u64, max_count, ULONG_MAX);
+}
+
void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs)
{
nilfs->ns_nsegments = nsegs;
@@ -377,8 +394,10 @@ void nilfs_set_nsegments(struct the_nilfs *nilfs, unsigned long nsegs)
static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
struct nilfs_super_block *sbp)
{
+ u64 nsegments, nblocks;
+
if (le32_to_cpu(sbp->s_rev_level) < NILFS_MIN_SUPP_REV) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ nilfs_err(nilfs->ns_sb,
"unsupported revision (superblock rev.=%d.%d, current rev.=%d.%d). Please check the version of mkfs.nilfs(2).",
le32_to_cpu(sbp->s_rev_level),
le16_to_cpu(sbp->s_minor_rev_level),
@@ -391,13 +410,11 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
if (nilfs->ns_inode_size > nilfs->ns_blocksize) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "too large inode size: %d bytes",
+ nilfs_err(nilfs->ns_sb, "too large inode size: %d bytes",
nilfs->ns_inode_size);
return -EINVAL;
} else if (nilfs->ns_inode_size < NILFS_MIN_INODE_SIZE) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "too small inode size: %d bytes",
+ nilfs_err(nilfs->ns_sb, "too small inode size: %d bytes",
nilfs->ns_inode_size);
return -EINVAL;
}
@@ -406,8 +423,7 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
- "too short segment: %lu blocks",
+ nilfs_err(nilfs->ns_sb, "too short segment: %lu blocks",
nilfs->ns_blocks_per_segment);
return -EINVAL;
}
@@ -417,13 +433,41 @@ static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
le32_to_cpu(sbp->s_r_segments_percentage);
if (nilfs->ns_r_segments_percentage < 1 ||
nilfs->ns_r_segments_percentage > 99) {
- nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ nilfs_err(nilfs->ns_sb,
"invalid reserved segments percentage: %lu",
nilfs->ns_r_segments_percentage);
return -EINVAL;
}
- nilfs_set_nsegments(nilfs, le64_to_cpu(sbp->s_nsegments));
+ nsegments = le64_to_cpu(sbp->s_nsegments);
+ if (nsegments > nilfs_max_segment_count(nilfs)) {
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "segment count %llu exceeds upper limit (%llu segments)",
+ (unsigned long long)nsegments,
+ (unsigned long long)nilfs_max_segment_count(nilfs));
+ return -EINVAL;
+ }
+
+ nblocks = (u64)i_size_read(nilfs->ns_sb->s_bdev->bd_inode) >>
+ nilfs->ns_sb->s_blocksize_bits;
+ if (nblocks) {
+ u64 min_block_count = nsegments * nilfs->ns_blocks_per_segment;
+ /*
+ * To avoid failing to mount early device images without a
+ * second superblock, exclude that block count from the
+ * "min_block_count" calculation.
+ */
+
+ if (nblocks < min_block_count) {
+ nilfs_msg(nilfs->ns_sb, KERN_ERR,
+ "total number of segment blocks %llu exceeds device size (%llu blocks)",
+ (unsigned long long)min_block_count,
+ (unsigned long long)nblocks);
+ return -EINVAL;
+ }
+ }
+
+ nilfs_set_nsegments(nilfs, nsegments);
nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
return 0;
}
@@ -448,11 +492,33 @@ static int nilfs_valid_sb(struct nilfs_super_block *sbp)
return crc == le32_to_cpu(sbp->s_sum);
}
-static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
+/**
+ * nilfs_sb2_bad_offset - check the location of the second superblock
+ * @sbp: superblock raw data buffer
+ * @offset: byte offset of second superblock calculated from device size
+ *
+ * nilfs_sb2_bad_offset() checks if the position on the second
+ * superblock is valid or not based on the filesystem parameters
+ * stored in @sbp. If @offset points to a location within the segment
+ * area, or if the parameters themselves are not normal, it is
+ * determined to be invalid.
+ *
+ * Return Value: true if invalid, false if valid.
+ */
+static bool nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
{
- return offset < ((le64_to_cpu(sbp->s_nsegments) *
- le32_to_cpu(sbp->s_blocks_per_segment)) <<
- (le32_to_cpu(sbp->s_log_block_size) + 10));
+ unsigned int shift_bits = le32_to_cpu(sbp->s_log_block_size);
+ u32 blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
+ u64 nsegments = le64_to_cpu(sbp->s_nsegments);
+ u64 index;
+
+ if (blocks_per_segment < NILFS_SEG_MIN_BLOCKS ||
+ shift_bits > ilog2(NILFS_MAX_BLOCK_SIZE) - BLOCK_SIZE_BITS)
+ return true;
+
+ index = offset >> (shift_bits + BLOCK_SIZE_BITS);
+ do_div(index, blocks_per_segment);
+ return index < nsegments;
}
static void nilfs_release_super_block(struct the_nilfs *nilfs)
@@ -494,25 +560,31 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
{
struct nilfs_super_block **sbp = nilfs->ns_sbp;
struct buffer_head **sbh = nilfs->ns_sbh;
- u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
+ u64 sb2off, devsize = nilfs->ns_bdev->bd_inode->i_size;
int valid[2], swp = 0;
+ if (devsize < NILFS_SEG_MIN_BLOCKS * NILFS_MIN_BLOCK_SIZE + 4096) {
+ nilfs_msg(sb, KERN_ERR, "device size too small");
+ return -EINVAL;
+ }
+ sb2off = NILFS_SB2_OFFSET_BYTES(devsize);
+
sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
&sbh[0]);
sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]);
if (!sbp[0]) {
if (!sbp[1]) {
- nilfs_msg(sb, KERN_ERR, "unable to read superblock");
+ nilfs_err(sb, "unable to read superblock");
return -EIO;
}
- nilfs_msg(sb, KERN_WARNING,
- "unable to read primary superblock (blocksize = %d)",
- blocksize);
+ nilfs_warn(sb,
+ "unable to read primary superblock (blocksize = %d)",
+ blocksize);
} else if (!sbp[1]) {
- nilfs_msg(sb, KERN_WARNING,
- "unable to read secondary superblock (blocksize = %d)",
- blocksize);
+ nilfs_warn(sb,
+ "unable to read secondary superblock (blocksize = %d)",
+ blocksize);
}
/*
@@ -534,14 +606,14 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs,
}
if (!valid[swp]) {
nilfs_release_super_block(nilfs);
- nilfs_msg(sb, KERN_ERR, "couldn't find nilfs on the device");
+ nilfs_err(sb, "couldn't find nilfs on the device");
return -EINVAL;
}
if (!valid[!swp])
- nilfs_msg(sb, KERN_WARNING,
- "broken superblock, retrying with spare superblock (blocksize = %d)",
- blocksize);
+ nilfs_warn(sb,
+ "broken superblock, retrying with spare superblock (blocksize = %d)",
+ blocksize);
if (swp)
nilfs_swap_super_block(nilfs);
@@ -575,7 +647,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE);
if (!blocksize) {
- nilfs_msg(sb, KERN_ERR, "unable to set blocksize");
+ nilfs_err(sb, "unable to set blocksize");
err = -EINVAL;
goto out;
}
@@ -594,7 +666,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
if (blocksize < NILFS_MIN_BLOCK_SIZE ||
blocksize > NILFS_MAX_BLOCK_SIZE) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"couldn't mount because of unsupported filesystem blocksize %d",
blocksize);
err = -EINVAL;
@@ -604,14 +676,18 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
if (blocksize < hw_blocksize) {
- nilfs_msg(sb, KERN_ERR,
+ nilfs_err(sb,
"blocksize %d too small for device (sector-size = %d)",
blocksize, hw_blocksize);
err = -EINVAL;
goto failed_sbh;
}
nilfs_release_super_block(nilfs);
- sb_set_blocksize(sb, blocksize);
+ if (!sb_set_blocksize(sb, blocksize)) {
+ nilfs_msg(sb, KERN_ERR, "bad blocksize %d", blocksize);
+ err = -EINVAL;
+ goto out;
+ }
err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
if (err)
@@ -639,10 +715,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
if (err)
goto failed_sbh;
- err = nilfs_sysfs_create_device_group(sb);
- if (err)
- goto failed_sbh;
-
set_nilfs_init(nilfs);
err = 0;
out:
@@ -695,9 +767,7 @@ int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
{
unsigned long ncleansegs;
- down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
- up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
*nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
return 0;
}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 380a543c5b19..de6e24d80eb6 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -29,6 +29,7 @@ enum {
THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */
THE_NILFS_GC_RUNNING, /* gc process is running */
THE_NILFS_SB_DIRTY, /* super block is dirty */
+ THE_NILFS_PURGING, /* disposing dirty files for cleanup */
};
/**
@@ -208,6 +209,7 @@ THE_NILFS_FNS(INIT, init)
THE_NILFS_FNS(DISCONTINUED, discontinued)
THE_NILFS_FNS(GC_RUNNING, gc_running)
THE_NILFS_FNS(SB_DIRTY, sb_dirty)
+THE_NILFS_FNS(PURGING, purging)
/*
* Mount option operations
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c
index 52ccd34b1e79..a026dbd3593f 100644
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -272,7 +272,7 @@ int unregister_nls(struct nls_table * nls)
return -EINVAL;
}
-static struct nls_table *find_nls(char *charset)
+static struct nls_table *find_nls(const char *charset)
{
struct nls_table *nls;
spin_lock(&nls_lock);
@@ -288,7 +288,7 @@ static struct nls_table *find_nls(char *charset)
return nls;
}
-struct nls_table *load_nls(char *charset)
+struct nls_table *load_nls(const char *charset)
{
return try_then_request_module(find_nls(charset), "nls_%s", charset);
}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 8508ab575017..ec4eadf459ae 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -928,8 +928,11 @@ static int fanotify_test_fid(struct path *path, __kernel_fsid_t *fsid)
return 0;
}
-static int fanotify_events_supported(struct path *path, __u64 mask)
+static int fanotify_events_supported(struct path *path, __u64 mask,
+ unsigned int flags)
{
+ unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
+
/*
* Some filesystems such as 'proc' acquire unusual locks when opening
* files. For them fanotify permission events have high chances of
@@ -941,6 +944,21 @@ static int fanotify_events_supported(struct path *path, __u64 mask)
if (mask & FANOTIFY_PERM_EVENTS &&
path->mnt->mnt_sb->s_type->fs_flags & FS_DISALLOW_NOTIFY_PERM)
return -EINVAL;
+
+ /*
+ * mount and sb marks are not allowed on kernel internal pseudo fs,
+ * like pipe_mnt, because that would subscribe to events on all the
+ * anonynous pipes in the system.
+ *
+ * SB_NOUSER covers all of the internal pseudo fs whose objects are not
+ * exposed to user's mount namespace, but there are other SB_KERNMOUNT
+ * fs, like nsfs, debugfs, for which the value of allowing sb and mount
+ * mark is questionable. For now we leave them alone.
+ */
+ if (mark_type != FAN_MARK_INODE &&
+ path->mnt->mnt_sb->s_flags & SB_NOUSER)
+ return -EINVAL;
+
return 0;
}
@@ -1050,7 +1068,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
goto fput_and_out;
if (flags & FAN_MARK_ADD) {
- ret = fanotify_events_supported(&path, mask);
+ ret = fanotify_events_supported(&path, mask, flags);
if (ret)
goto path_put_and_out;
}
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 589dee962993..a3cbfd38f1f6 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -66,7 +66,7 @@ int inotify_handle_event(struct fsnotify_group *group,
struct inotify_event_info *event;
struct fsnotify_event *fsn_event;
int ret;
- int len = 0;
+ int len = 0, wd;
int alloc_len = sizeof(struct inotify_event_info);
if (WARN_ON(fsnotify_iter_vfsmount_mark(iter_info)))
@@ -91,6 +91,13 @@ int inotify_handle_event(struct fsnotify_group *group,
fsn_mark);
/*
+ * We can be racing with mark being detached. Don't report event with
+ * invalid wd.
+ */
+ wd = READ_ONCE(i_mark->wd);
+ if (wd == -1)
+ return 0;
+ /*
* Whoever is interested in the event, pays for the allocation. Do not
* trigger OOM killer in the target monitoring memcg as it may have
* security repercussion.
@@ -120,7 +127,7 @@ int inotify_handle_event(struct fsnotify_group *group,
fsn_event = &event->fse;
fsnotify_init_event(fsn_event, (unsigned long)inode);
event->mask = mask;
- event->wd = i_mark->wd;
+ event->wd = wd;
event->sync_cookie = cookie;
event->name_len = len;
if (len)
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index 914e99173130..c0881d39d36a 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -594,17 +594,37 @@ static int ntfs_attr_find(const ATTR_TYPE type, const ntfschar *name,
for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
u8 *mrec_end = (u8 *)ctx->mrec +
le32_to_cpu(ctx->mrec->bytes_allocated);
- u8 *name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
- a->name_length * sizeof(ntfschar);
- if ((u8*)a < (u8*)ctx->mrec || (u8*)a > mrec_end ||
- name_end > mrec_end)
+ u8 *name_end;
+
+ /* check whether ATTR_RECORD wrap */
+ if ((u8 *)a < (u8 *)ctx->mrec)
+ break;
+
+ /* check whether Attribute Record Header is within bounds */
+ if ((u8 *)a > mrec_end ||
+ (u8 *)a + sizeof(ATTR_RECORD) > mrec_end)
+ break;
+
+ /* check whether ATTR_RECORD's name is within bounds */
+ name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
+ a->name_length * sizeof(ntfschar);
+ if (name_end > mrec_end)
break;
+
ctx->attr = a;
if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
a->type == AT_END))
return -ENOENT;
if (unlikely(!a->length))
break;
+
+ /* check whether ATTR_RECORD's length wrap */
+ if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a)
+ break;
+ /* check whether ATTR_RECORD's length is within bounds */
+ if ((u8 *)a + le32_to_cpu(a->length) > mrec_end)
+ break;
+
if (a->type != type)
continue;
/*
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index cf222c9225d6..645c4b1b23de 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1829,6 +1829,13 @@ int ntfs_read_inode_mount(struct inode *vi)
goto err_out;
}
+ /* Sanity check offset to the first attribute */
+ if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) {
+ ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.",
+ le16_to_cpu(m->attrs_offset));
+ goto err_out;
+ }
+
/* Need this to sanity check attribute list references to $MFT. */
vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 7dc3bc604f78..8f33382a00ea 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2092,7 +2092,8 @@ get_ctx_vol_failed:
// TODO: Initialize security.
/* Get the extended system files' directory inode. */
vol->extend_ino = ntfs_iget(sb, FILE_Extend);
- if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) {
+ if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) ||
+ !S_ISDIR(vol->extend_ino->i_mode)) {
if (!IS_ERR(vol->extend_ino))
iput(vol->extend_ino);
ntfs_error(sb, "Failed to load $Extend.");
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 4db87b26cf7b..52ccd8d4ed82 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -4708,7 +4708,7 @@ int ocfs2_insert_extent(handle_t *handle,
struct ocfs2_alloc_context *meta_ac)
{
int status;
- int uninitialized_var(free_records);
+ int free_records;
struct buffer_head *last_eb_bh = NULL;
struct ocfs2_insert_type insert = {0, };
struct ocfs2_extent_rec rec;
@@ -7052,7 +7052,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
int need_free = 0;
u32 bit_off, num;
handle_t *handle;
- u64 uninitialized_var(block);
+ u64 block;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 7f66e3342475..91702ebffe84 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -1990,11 +1990,25 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
}
if (unlikely(copied < len) && wc->w_target_page) {
+ loff_t new_isize;
+
if (!PageUptodate(wc->w_target_page))
copied = 0;
- ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
- start+len);
+ new_isize = max_t(loff_t, i_size_read(inode), pos + copied);
+ if (new_isize > page_offset(wc->w_target_page))
+ ocfs2_zero_new_buffers(wc->w_target_page, start+copied,
+ start+len);
+ else {
+ /*
+ * When page is fully beyond new isize (data copy
+ * failed), do not bother zeroing the page. Invalidate
+ * it instead so that writeback does not get confused
+ * put page & buffer dirty bits into inconsistent
+ * state.
+ */
+ block_invalidatepage(wc->w_target_page, 0, PAGE_SIZE);
+ }
}
if (wc->w_target_page)
flush_dcache_page(wc->w_target_page);
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index bdef72c0f099..49b9c61459c5 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -848,9 +848,9 @@ static int ocfs2_dx_dir_lookup(struct inode *inode,
u64 *ret_phys_blkno)
{
int ret = 0;
- unsigned int cend, uninitialized_var(clen);
- u32 uninitialized_var(cpos);
- u64 uninitialized_var(blkno);
+ unsigned int cend, clen;
+ u32 cpos;
+ u64 blkno;
u32 name_hash = hinfo->major_hash;
ret = ocfs2_dx_dir_lookup_rec(inode, el, name_hash, &cpos, &blkno,
@@ -894,7 +894,7 @@ static int ocfs2_dx_dir_search(const char *name, int namelen,
struct ocfs2_dir_lookup_result *res)
{
int ret, i, found;
- u64 uninitialized_var(phys);
+ u64 phys;
struct buffer_head *dx_leaf_bh = NULL;
struct ocfs2_dx_leaf *dx_leaf;
struct ocfs2_dx_entry *dx_entry = NULL;
@@ -4393,9 +4393,9 @@ out:
int ocfs2_dx_dir_truncate(struct inode *dir, struct buffer_head *di_bh)
{
int ret;
- unsigned int uninitialized_var(clen);
- u32 major_hash = UINT_MAX, p_cpos, uninitialized_var(cpos);
- u64 uninitialized_var(blkno);
+ unsigned int clen;
+ u32 major_hash = UINT_MAX, p_cpos, cpos;
+ u64 blkno;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct buffer_head *dx_root_bh = NULL;
struct ocfs2_dx_root_block *dx_root;
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index aaf24548b02a..0463dce65bb2 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -688,10 +688,6 @@ struct dlm_begin_reco
__be32 pad2;
};
-
-#define BITS_PER_BYTE 8
-#define BITS_TO_BYTES(bits) (((bits)+BITS_PER_BYTE-1)/BITS_PER_BYTE)
-
struct dlm_query_join_request
{
u8 node_idx;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 207ec61569ea..bcc4b5d3e54e 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3396,10 +3396,12 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
ocfs2_lock_res_free(&osb->osb_nfs_sync_lockres);
ocfs2_lock_res_free(&osb->osb_orphan_scan.os_lockres);
- ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
- osb->cconn = NULL;
+ if (osb->cconn) {
+ ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
+ osb->cconn = NULL;
- ocfs2_dlm_shutdown_debug(osb);
+ ocfs2_dlm_shutdown_debug(osb);
+ }
}
static int ocfs2_drop_lock(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index e3e2d1b2af51..f3926765c3f0 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -403,7 +403,7 @@ static int ocfs2_get_clusters_nocache(struct inode *inode,
{
int i, ret, tree_height, len;
struct ocfs2_dinode *di;
- struct ocfs2_extent_block *uninitialized_var(eb);
+ struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec;
struct buffer_head *eb_bh = NULL;
@@ -599,7 +599,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
unsigned int *extent_flags)
{
int ret;
- unsigned int uninitialized_var(hole_len), flags = 0;
+ unsigned int hole_len, flags = 0;
struct buffer_head *di_bh = NULL;
struct ocfs2_extent_rec rec;
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index c30747bdfb12..6fea3f46a9e3 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2103,14 +2103,20 @@ static long ocfs2_fallocate(struct file *file, int mode, loff_t offset,
struct ocfs2_space_resv sr;
int change_size = 1;
int cmd = OCFS2_IOC_RESVSP64;
+ int ret = 0;
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
if (!ocfs2_writes_unwritten_extents(osb))
return -EOPNOTSUPP;
- if (mode & FALLOC_FL_KEEP_SIZE)
+ if (mode & FALLOC_FL_KEEP_SIZE) {
change_size = 0;
+ } else {
+ ret = inode_newsize_ok(inode, offset + len);
+ if (ret)
+ return ret;
+ }
if (mode & FALLOC_FL_PUNCH_HOLE)
cmd = OCFS2_IOC_UNRESVSP64;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 900e4ef686bf..da95ed79c12e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -159,7 +159,7 @@ static void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
replay_map->rm_state = REPLAY_DONE;
}
-static void ocfs2_free_replay_slots(struct ocfs2_super *osb)
+void ocfs2_free_replay_slots(struct ocfs2_super *osb)
{
struct ocfs2_replay_map *replay_map = osb->replay_map;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index bfe611ed1b1d..eb7a21bac71e 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -152,6 +152,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb);
void ocfs2_recovery_exit(struct ocfs2_super *osb);
int ocfs2_compute_replay_slots(struct ocfs2_super *osb);
+void ocfs2_free_replay_slots(struct ocfs2_super *osb);
/*
* Journal Control:
* Initialize, Load, Shutdown, Wipe a journal.
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 758d9661ef1e..98e77ea957ff 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -107,14 +107,6 @@ static int __ocfs2_move_extent(handle_t *handle,
*/
replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
- ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
- context->et.et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
ret = ocfs2_split_extent(handle, &context->et, path, index,
&replace_rec, context->meta_ac,
&context->dealloc);
@@ -123,8 +115,6 @@ static int __ocfs2_move_extent(handle_t *handle,
goto out;
}
- ocfs2_journal_dirty(handle, context->et.et_root_bh);
-
context->new_phys_cpos = new_p_cpos;
/*
@@ -446,7 +436,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode,
bg = (struct ocfs2_group_desc *)gd_bh->b_data;
if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
- le16_to_cpu(bg->bg_bits))) {
+ (le16_to_cpu(bg->bg_bits) << bits_per_unit))) {
*ret_bh = gd_bh;
*vict_bit = (vict_blkno - blkno) >>
@@ -561,6 +551,7 @@ static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
last_free_bits++;
if (last_free_bits == move_len) {
+ i -= move_len;
*goal_bit = i;
*phys_cpos = base_cpos + i;
break;
@@ -1032,18 +1023,19 @@ int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
context->range = &range;
+ /*
+ * ok, the default theshold for the defragmentation
+ * is 1M, since our maximum clustersize was 1M also.
+ * any thought?
+ */
+ if (!range.me_threshold)
+ range.me_threshold = 1024 * 1024;
+
+ if (range.me_threshold > i_size_read(inode))
+ range.me_threshold = i_size_read(inode);
+
if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
context->auto_defrag = 1;
- /*
- * ok, the default theshold for the defragmentation
- * is 1M, since our maximum clustersize was 1M also.
- * any thought?
- */
- if (!range.me_threshold)
- range.me_threshold = 1024 * 1024;
-
- if (range.me_threshold > i_size_read(inode))
- range.me_threshold = i_size_read(inode);
if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
context->partial = 1;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 8ea51cf27b97..ba574fade6f0 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -198,6 +198,7 @@ static struct inode *ocfs2_get_init_inode(struct inode *dir, umode_t mode)
* callers. */
if (S_ISDIR(mode))
set_nlink(inode, 2);
+ mode = mode_strip_sgid(dir, mode);
inode_init_owner(inode, dir, mode);
status = dquot_initialize(inode);
if (status)
@@ -231,6 +232,7 @@ static int ocfs2_mknod(struct inode *dir,
handle_t *handle = NULL;
struct ocfs2_super *osb;
struct ocfs2_dinode *dirfe;
+ struct ocfs2_dinode *fe = NULL;
struct buffer_head *new_fe_bh = NULL;
struct inode *inode = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
@@ -381,6 +383,7 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
+ fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
if (S_ISDIR(mode)) {
status = ocfs2_fill_new_dir(osb, handle, dir, inode,
new_fe_bh, data_ac, meta_ac);
@@ -446,8 +449,11 @@ static int ocfs2_mknod(struct inode *dir,
leave:
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && fe)
+ ocfs2_set_links_count(fe, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@@ -625,18 +631,9 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
return status;
}
- status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+ return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
parent_fe_bh, handle, inode_ac,
fe_blkno, suballoc_loc, suballoc_bit);
- if (status < 0) {
- u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
- int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
- inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
- if (tmp)
- mlog_errno(tmp);
- }
-
- return status;
}
static int ocfs2_mkdir(struct inode *dir,
@@ -1528,6 +1525,10 @@ static int ocfs2_rename(struct inode *old_dir,
status = ocfs2_add_entry(handle, new_dentry, old_inode,
OCFS2_I(old_inode)->ip_blkno,
new_dir_bh, &target_insert);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
}
old_inode->i_ctime = current_time(old_inode);
@@ -2017,8 +2018,11 @@ bail:
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && fe)
+ ocfs2_set_links_count(fe, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@@ -2492,7 +2496,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
struct buffer_head *new_di_bh = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
- u64 uninitialized_var(di_blkno), suballoc_loc;
+ u64 di_blkno, suballoc_loc;
u16 suballoc_bit;
status = ocfs2_inode_lock(dir, &parent_di_bh, 1);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ee43e51188be..3f812d348d6b 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -1061,7 +1061,7 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
struct buffer_head **ret_bh)
{
int ret = 0, i, found;
- u32 low_cpos, uninitialized_var(cpos_end);
+ u32 low_cpos, cpos_end;
struct ocfs2_extent_list *el;
struct ocfs2_extent_rec *rec = NULL;
struct ocfs2_extent_block *eb = NULL;
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index 188038760136..9f0326672af6 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -707,6 +707,8 @@ static struct ctl_table_header *ocfs2_table_header;
static int __init ocfs2_stack_glue_init(void)
{
+ int ret;
+
strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
@@ -716,7 +718,11 @@ static int __init ocfs2_stack_glue_init(void)
return -ENOMEM; /* or something. */
}
- return ocfs2_sysfs_init();
+ ret = ocfs2_sysfs_init();
+ if (ret)
+ unregister_sysctl_table(ocfs2_table_header);
+
+ return ret;
}
static void __exit ocfs2_stack_glue_exit(void)
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index c1cf67b24c19..052abfa31d38 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -955,8 +955,10 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
for (type = 0; type < OCFS2_MAXQUOTAS; type++) {
if (!sb_has_quota_loaded(sb, type))
continue;
- oinfo = sb_dqinfo(sb, type)->dqi_priv;
- cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+ if (!sb_has_quota_suspended(sb, type)) {
+ oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+ }
inode = igrab(sb->s_dquot.files[type]);
/* Turn off quotas. This will remove all dquot structures from
* memory and so they will be automatically synced to global
@@ -984,28 +986,27 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
if (!ocfs2_parse_options(sb, data, &parsed_options, 0)) {
status = -EINVAL;
- goto read_super_error;
+ goto out;
}
/* probe for superblock */
status = ocfs2_sb_probe(sb, &bh, &sector_size, &stats);
if (status < 0) {
mlog(ML_ERROR, "superblock probe failed!\n");
- goto read_super_error;
+ goto out;
}
status = ocfs2_initialize_super(sb, bh, sector_size, &stats);
- osb = OCFS2_SB(sb);
- if (status < 0) {
- mlog_errno(status);
- goto read_super_error;
- }
brelse(bh);
bh = NULL;
+ if (status < 0)
+ goto out;
+
+ osb = OCFS2_SB(sb);
if (!ocfs2_check_set_options(sb, &parsed_options)) {
status = -EINVAL;
- goto read_super_error;
+ goto out_super;
}
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
@@ -1022,7 +1023,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
if (status)
- goto read_super_error;
+ goto out_super;
sb->s_magic = OCFS2_SUPER_MAGIC;
@@ -1036,7 +1037,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = -EACCES;
mlog(ML_ERROR, "Readonly device detected but readonly "
"mount was not specified.\n");
- goto read_super_error;
+ goto out_super;
}
/* You should not be able to start a local heartbeat
@@ -1045,7 +1046,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = -EROFS;
mlog(ML_ERROR, "Local heartbeat specified on readonly "
"device.\n");
- goto read_super_error;
+ goto out_super;
}
status = ocfs2_check_journals_nolocks(osb);
@@ -1054,9 +1055,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
mlog(ML_ERROR, "Recovery required on readonly "
"file system, but write access is "
"unavailable.\n");
- else
- mlog_errno(status);
- goto read_super_error;
+ goto out_super;
}
ocfs2_set_ro_flag(osb, 1);
@@ -1072,10 +1071,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
}
status = ocfs2_verify_heartbeat(osb);
- if (status < 0) {
- mlog_errno(status);
- goto read_super_error;
- }
+ if (status < 0)
+ goto out_super;
osb->osb_debug_root = debugfs_create_dir(osb->uuid_str,
ocfs2_debugfs_root);
@@ -1089,15 +1086,14 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = ocfs2_mount_volume(sb);
if (status < 0)
- goto read_super_error;
+ goto out_debugfs;
if (osb->root_inode)
inode = igrab(osb->root_inode);
if (!inode) {
status = -EIO;
- mlog_errno(status);
- goto read_super_error;
+ goto out_dismount;
}
osb->osb_dev_kset = kset_create_and_add(sb->s_id, NULL,
@@ -1105,7 +1101,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
if (!osb->osb_dev_kset) {
status = -ENOMEM;
mlog(ML_ERROR, "Unable to create device kset %s.\n", sb->s_id);
- goto read_super_error;
+ goto out_dismount;
}
/* Create filecheck sysfs related directories/files at
@@ -1114,14 +1110,13 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
status = -ENOMEM;
mlog(ML_ERROR, "Unable to create filecheck sysfs directory at "
"/sys/fs/ocfs2/%s/filecheck.\n", sb->s_id);
- goto read_super_error;
+ goto out_dismount;
}
root = d_make_root(inode);
if (!root) {
status = -ENOMEM;
- mlog_errno(status);
- goto read_super_error;
+ goto out_dismount;
}
sb->s_root = root;
@@ -1168,17 +1163,22 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
return status;
-read_super_error:
- brelse(bh);
-
- if (status)
- mlog_errno(status);
+out_dismount:
+ atomic_set(&osb->vol_state, VOLUME_DISABLED);
+ wake_up(&osb->osb_mount_event);
+ ocfs2_free_replay_slots(osb);
+ ocfs2_dismount_volume(sb, 1);
+ goto out;
- if (osb) {
- atomic_set(&osb->vol_state, VOLUME_DISABLED);
- wake_up(&osb->osb_mount_event);
- ocfs2_dismount_volume(sb, 1);
- }
+out_debugfs:
+ debugfs_remove_recursive(osb->osb_debug_root);
+out_super:
+ ocfs2_release_system_inodes(osb);
+ kfree(osb->recovery_map);
+ ocfs2_delete_osb(osb);
+ kfree(osb);
+out:
+ mlog_errno(status);
return status;
}
@@ -1787,11 +1787,10 @@ static int ocfs2_get_sector(struct super_block *sb,
static int ocfs2_mount_volume(struct super_block *sb)
{
int status = 0;
- int unlock_super = 0;
struct ocfs2_super *osb = OCFS2_SB(sb);
if (ocfs2_is_hard_readonly(osb))
- goto leave;
+ goto out;
mutex_init(&osb->obs_trim_fs_mutex);
@@ -1801,44 +1800,58 @@ static int ocfs2_mount_volume(struct super_block *sb)
if (status == -EBADR && ocfs2_userspace_stack(osb))
mlog(ML_ERROR, "couldn't mount because cluster name on"
" disk does not match the running cluster name.\n");
- goto leave;
+ goto out;
}
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_dlm;
}
- unlock_super = 1;
/* This will load up the node map and add ourselves to it. */
status = ocfs2_find_slot(osb);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_super_lock;
}
/* load all node-local system inodes */
status = ocfs2_init_local_system_inodes(osb);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_super_lock;
}
status = ocfs2_check_volume(osb);
if (status < 0) {
mlog_errno(status);
- goto leave;
+ goto out_system_inodes;
}
status = ocfs2_truncate_log_init(osb);
- if (status < 0)
+ if (status < 0) {
mlog_errno(status);
+ goto out_check_volume;
+ }
-leave:
- if (unlock_super)
- ocfs2_super_unlock(osb, 1);
+ ocfs2_super_unlock(osb, 1);
+ return 0;
+out_check_volume:
+ ocfs2_free_replay_slots(osb);
+out_system_inodes:
+ if (osb->local_alloc_state == OCFS2_LA_ENABLED)
+ ocfs2_shutdown_local_alloc(osb);
+ ocfs2_release_system_inodes(osb);
+ /* before journal shutdown, we should release slot_info */
+ ocfs2_free_slot_info(osb);
+ ocfs2_journal_shutdown(osb);
+out_super_lock:
+ ocfs2_super_unlock(osb, 1);
+out_dlm:
+ ocfs2_dlm_shutdown(osb, 0);
+out:
return status;
}
@@ -1911,8 +1924,7 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
!ocfs2_is_hard_readonly(osb))
hangup_needed = 1;
- if (osb->cconn)
- ocfs2_dlm_shutdown(osb, hangup_needed);
+ ocfs2_dlm_shutdown(osb, hangup_needed);
ocfs2_blockcheck_stats_debugfs_remove(&osb->osb_ecc_stats);
debugfs_remove_recursive(osb->osb_debug_root);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 90c830e3758e..9ccd19d8f7b1 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -1211,7 +1211,7 @@ static int ocfs2_xattr_block_get(struct inode *inode,
struct ocfs2_xattr_value_root *xv;
size_t size;
int ret = -ENODATA, name_offset, name_len, i;
- int uninitialized_var(block_off);
+ int block_off;
xs->bucket = ocfs2_xattr_bucket_new(inode);
if (!xs->bucket) {
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index d640b9388238..b08881b35ac3 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -220,7 +220,7 @@ static int omfs_get_block(struct inode *inode, sector_t block,
struct buffer_head *bh;
sector_t next, offset;
int ret;
- u64 uninitialized_var(new_block);
+ u64 new_block;
u32 max_extents;
int extent_count;
struct omfs_extent *oe;
diff --git a/fs/open.c b/fs/open.c
index bfbba52d86d4..df57d8ec38ae 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -571,14 +571,19 @@ out_unlock:
return error;
}
+int vfs_fchmod(struct file *file, umode_t mode)
+{
+ audit_file(file);
+ return chmod_common(&file->f_path, mode);
+}
+
int ksys_fchmod(unsigned int fd, umode_t mode)
{
struct fd f = fdget(fd);
int err = -EBADF;
if (f.file) {
- audit_file(f.file);
- err = chmod_common(&f.file->f_path, mode);
+ err = vfs_fchmod(f.file, mode);
fdput(f);
}
return err;
@@ -709,23 +714,28 @@ SYSCALL_DEFINE3(lchown, const char __user *, filename, uid_t, user, gid_t, group
AT_SYMLINK_NOFOLLOW);
}
+int vfs_fchown(struct file *file, uid_t user, gid_t group)
+{
+ int error;
+
+ error = mnt_want_write_file(file);
+ if (error)
+ return error;
+ audit_file(file);
+ error = chown_common(&file->f_path, user, group);
+ mnt_drop_write_file(file);
+ return error;
+}
+
int ksys_fchown(unsigned int fd, uid_t user, gid_t group)
{
struct fd f = fdget(fd);
int error = -EBADF;
- if (!f.file)
- goto out;
-
- error = mnt_want_write_file(f.file);
- if (error)
- goto out_fput;
- audit_file(f.file);
- error = chown_common(&f.file->f_path, user, group);
- mnt_drop_write_file(f.file);
-out_fput:
- fdput(f);
-out:
+ if (f.file) {
+ error = vfs_fchown(f.file, user, group);
+ fdput(f);
+ }
return error;
}
diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c
index 29eaa4544372..1b508f543384 100644
--- a/fs/orangefs/orangefs-debugfs.c
+++ b/fs/orangefs/orangefs-debugfs.c
@@ -194,15 +194,10 @@ void orangefs_debugfs_init(int debug_mask)
*/
static void orangefs_kernel_debug_init(void)
{
- int rc = -ENOMEM;
- char *k_buffer = NULL;
+ static char k_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { };
gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
- k_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
- if (!k_buffer)
- goto out;
-
if (strlen(kernel_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcpy(k_buffer, kernel_debug_string);
strcat(k_buffer, "\n");
@@ -213,15 +208,14 @@ static void orangefs_kernel_debug_init(void)
debugfs_create_file(ORANGEFS_KMOD_DEBUG_FILE, 0444, debug_dir, k_buffer,
&kernel_debug_fops);
-
-out:
- gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
}
void orangefs_debugfs_cleanup(void)
{
debugfs_remove_recursive(debug_dir);
+ kfree(debug_help_string);
+ debug_help_string = NULL;
}
/* open ORANGEFS_KMOD_DEBUG_HELP_FILE */
@@ -297,18 +291,13 @@ static int help_show(struct seq_file *m, void *v)
/*
* initialize the client-debug file.
*/
-static int orangefs_client_debug_init(void)
+static void orangefs_client_debug_init(void)
{
- int rc = -ENOMEM;
- char *c_buffer = NULL;
+ static char c_buffer[ORANGEFS_MAX_DEBUG_STRING_LEN] = { };
gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: start\n", __func__);
- c_buffer = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL);
- if (!c_buffer)
- goto out;
-
if (strlen(client_debug_string) + 1 < ORANGEFS_MAX_DEBUG_STRING_LEN) {
strcpy(c_buffer, client_debug_string);
strcat(c_buffer, "\n");
@@ -322,13 +311,6 @@ static int orangefs_client_debug_init(void)
debug_dir,
c_buffer,
&kernel_debug_fops);
-
- rc = 0;
-
-out:
-
- gossip_debug(GOSSIP_DEBUGFS_DEBUG, "%s: rc:%d:\n", __func__, rc);
- return rc;
}
/* open ORANGEFS_KMOD_DEBUG_FILE or ORANGEFS_CLIENT_DEBUG_FILE.*/
@@ -671,6 +653,7 @@ int orangefs_prepare_debugfs_help_string(int at_boot)
memset(debug_help_string, 0, DEBUG_HELP_STRING_SIZE);
strlcat(debug_help_string, new, string_size);
mutex_unlock(&orangefs_help_file_lock);
+ kfree(new);
}
rc = 0;
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index c010c1fddafc..6aa7a23b04df 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -141,7 +141,7 @@ static int __init orangefs_init(void)
gossip_err("%s: could not initialize device subsystem %d!\n",
__func__,
ret);
- goto cleanup_device;
+ goto cleanup_sysfs;
}
ret = register_filesystem(&orangefs_fs_type);
@@ -153,11 +153,11 @@ static int __init orangefs_init(void)
goto out;
}
- orangefs_sysfs_exit();
-
-cleanup_device:
orangefs_dev_cleanup();
+cleanup_sysfs:
+ orangefs_sysfs_exit();
+
sysfs_init_failed:
orangefs_debugfs_cleanup();
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 143c52de97ec..9e5223108362 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -192,7 +192,7 @@ static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat)
{
struct iattr attr = {
.ia_valid =
- ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET,
+ ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_CTIME,
.ia_atime = stat->atime,
.ia_mtime = stat->mtime,
};
@@ -741,7 +741,7 @@ static int ovl_copy_up_meta_inode_data(struct ovl_copy_up_ctx *c)
struct path upperpath, datapath;
int err;
char *capability = NULL;
- ssize_t uninitialized_var(cap_size);
+ ssize_t cap_size;
ovl_path_upper(c->dentry, &upperpath);
if (WARN_ON(upperpath.dentry == NULL))
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 8c89eaea1583..fe88ac18ad93 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -557,28 +557,42 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode,
goto out_revert_creds;
}
- err = -ENOMEM;
- override_cred = prepare_creds();
- if (override_cred) {
+ if (!attr->hardlink) {
+ err = -ENOMEM;
+ override_cred = prepare_creds();
+ if (!override_cred)
+ goto out_revert_creds;
+ /*
+ * In the creation cases(create, mkdir, mknod, symlink),
+ * ovl should transfer current's fs{u,g}id to underlying
+ * fs. Because underlying fs want to initialize its new
+ * inode owner using current's fs{u,g}id. And in this
+ * case, the @inode is a new inode that is initialized
+ * in inode_init_owner() to current's fs{u,g}id. So use
+ * the inode's i_{u,g}id to override the cred's fs{u,g}id.
+ *
+ * But in the other hardlink case, ovl_link() does not
+ * create a new inode, so just use the ovl mounter's
+ * fs{u,g}id.
+ */
override_cred->fsuid = inode->i_uid;
override_cred->fsgid = inode->i_gid;
- if (!attr->hardlink) {
- err = security_dentry_create_files_as(dentry,
- attr->mode, &dentry->d_name, old_cred,
- override_cred);
- if (err) {
- put_cred(override_cred);
- goto out_revert_creds;
- }
+ err = security_dentry_create_files_as(dentry,
+ attr->mode, &dentry->d_name, old_cred,
+ override_cred);
+ if (err) {
+ put_cred(override_cred);
+ goto out_revert_creds;
}
put_cred(override_creds(override_cred));
put_cred(override_cred);
-
- if (!ovl_dentry_is_whiteout(dentry))
- err = ovl_create_upper(dentry, inode, attr);
- else
- err = ovl_create_over_whiteout(dentry, inode, attr);
}
+
+ if (!ovl_dentry_is_whiteout(dentry))
+ err = ovl_create_upper(dentry, inode, attr);
+ else
+ err = ovl_create_over_whiteout(dentry, inode, attr);
+
out_revert_creds:
revert_creds(old_cred);
return err;
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 19574ef17470..984df246962e 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -274,7 +274,7 @@ static int ovl_encode_fh(struct inode *inode, u32 *fid, int *max_len,
return FILEID_INVALID;
dentry = d_find_any_alias(inode);
- if (WARN_ON(!dentry))
+ if (!dentry)
return FILEID_INVALID;
type = ovl_dentry_to_fh(dentry, fid, max_len);
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index f47c591402d7..625da4bc8d0f 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -200,7 +200,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
int err;
bool last_element = !post[0];
- this = lookup_one_len_unlocked(name, base, namelen);
+ this = lookup_positive_unlocked(name, base, namelen);
if (IS_ERR(this)) {
err = PTR_ERR(this);
this = NULL;
@@ -208,8 +208,6 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
goto out;
goto out_err;
}
- if (!this->d_inode)
- goto put_and_out;
if (ovl_dentry_weird(this)) {
/* Don't support traversing automounts and other weirdness */
@@ -659,7 +657,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh)
if (err)
return ERR_PTR(err);
- index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len);
+ index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len);
kfree(name.name);
if (IS_ERR(index)) {
if (PTR_ERR(index) == -ENOENT)
@@ -667,9 +665,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh)
return index;
}
- if (d_is_negative(index))
- err = 0;
- else if (ovl_is_whiteout(index))
+ if (ovl_is_whiteout(index))
err = -ESTALE;
else if (ovl_dentry_weird(index))
err = -EIO;
@@ -693,7 +689,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
if (err)
return ERR_PTR(err);
- index = lookup_one_len_unlocked(name.name, ofs->indexdir, name.len);
+ index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
if (err == -ENOENT) {
@@ -708,9 +704,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
}
inode = d_inode(index);
- if (d_is_negative(index)) {
- goto out_dput;
- } else if (ovl_is_whiteout(index) && !verify) {
+ if (ovl_is_whiteout(index) && !verify) {
/*
* When index lookup is called with !verify for decoding an
* overlay file handle, a whiteout index implies that decode
@@ -1139,7 +1133,7 @@ bool ovl_lower_positive(struct dentry *dentry)
struct dentry *this;
struct dentry *lowerdir = poe->lowerstack[i].dentry;
- this = lookup_one_len_unlocked(name->name, lowerdir,
+ this = lookup_positive_unlocked(name->name, lowerdir,
name->len);
if (IS_ERR(this)) {
switch (PTR_ERR(this)) {
@@ -1156,10 +1150,8 @@ bool ovl_lower_positive(struct dentry *dentry)
break;
}
} else {
- if (this->d_inode) {
- positive = !ovl_is_whiteout(this);
- done = true;
- }
+ positive = !ovl_is_whiteout(this);
+ done = true;
dput(this);
}
}
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index 28348c44ea5b..8d81e88f1d1e 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -27,6 +27,7 @@ struct ovl_sb {
};
struct ovl_layer {
+ /* ovl_free_fs() relies on @mnt being the first member! */
struct vfsmount *mnt;
/* Trap in ovl inode cache */
struct inode *trap;
@@ -37,6 +38,14 @@ struct ovl_layer {
int fsid;
};
+/*
+ * ovl_free_fs() relies on @mnt being the first member when unmounting
+ * the private mounts created for each layer. Let's check both the
+ * offset and type.
+ */
+static_assert(offsetof(struct ovl_layer, mnt) == 0);
+static_assert(__same_type(typeof_member(struct ovl_layer, mnt), struct vfsmount *));
+
struct ovl_path {
struct ovl_layer *layer;
struct dentry *dentry;
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index f5cf0938f298..fcf453f7f4ae 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -263,8 +263,8 @@ static int ovl_sync_fs(struct super_block *sb, int wait)
return 0;
/*
- * If this is a sync(2) call or an emergency sync, all the super blocks
- * will be iterated, including upper_sb, so no need to do anything.
+ * Not called for sync(2) call or an emergency sync (SB_I_SKIP_SYNC).
+ * All the super blocks will be iterated, including upper_sb.
*
* If this is a syncfs(2) call, then we do need to call
* sync_filesystem() on upper_sb, but enough if we do it when being
@@ -1710,6 +1710,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
sb->s_xattr = ovl_xattr_handlers;
sb->s_fs_info = ofs;
sb->s_flags |= SB_POSIXACL;
+ sb->s_iflags |= SB_I_SKIP_SYNC;
err = -ENOMEM;
root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0));
diff --git a/fs/pnode.c b/fs/pnode.c
index 1106137c747a..468e4e65a615 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -244,7 +244,7 @@ static int propagate_one(struct mount *m)
}
do {
struct mount *parent = last_source->mnt_parent;
- if (last_source == first_source)
+ if (peers(last_source, first_source))
break;
done = parent->mnt_master == p;
if (done && peers(n, parent))
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cd4f16b0de67..03c3fff5a159 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3412,7 +3412,8 @@ static int proc_tid_comm_permission(struct inode *inode, int mask)
}
static const struct inode_operations proc_tid_comm_inode_operations = {
- .permission = proc_tid_comm_permission,
+ .setattr = proc_setattr,
+ .permission = proc_tid_comm_permission,
};
/*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d80989b6c344..f4264dd4ea31 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -14,6 +14,7 @@
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/bpf-cgroup.h>
+#include <linux/kmemleak.h>
#include "internal.h"
static const struct dentry_operations proc_sys_dentry_operations;
@@ -1397,6 +1398,38 @@ struct ctl_table_header *register_sysctl(const char *path, struct ctl_table *tab
}
EXPORT_SYMBOL(register_sysctl);
+/**
+ * __register_sysctl_init() - register sysctl table to path
+ * @path: path name for sysctl base
+ * @table: This is the sysctl table that needs to be registered to the path
+ * @table_name: The name of sysctl table, only used for log printing when
+ * registration fails
+ *
+ * The sysctl interface is used by userspace to query or modify at runtime
+ * a predefined value set on a variable. These variables however have default
+ * values pre-set. Code which depends on these variables will always work even
+ * if register_sysctl() fails. If register_sysctl() fails you'd just loose the
+ * ability to query or modify the sysctls dynamically at run time. Chances of
+ * register_sysctl() failing on init are extremely low, and so for both reasons
+ * this function does not return any error as it is used by initialization code.
+ *
+ * Context: Can only be called after your respective sysctl base path has been
+ * registered. So for instance, most base directories are registered early on
+ * init before init levels are processed through proc_sys_init() and
+ * sysctl_init().
+ */
+void __init __register_sysctl_init(const char *path, struct ctl_table *table,
+ const char *table_name)
+{
+ struct ctl_table_header *hdr = register_sysctl(path, table);
+
+ if (unlikely(!hdr)) {
+ pr_err("failed when register_sysctl %s to %s\n", table_name, path);
+ return;
+ }
+ kmemleak_not_leak(hdr);
+}
+
static char *append_path(const char *path, char *pos, const char *name)
{
int namelen;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 328e3eff890e..865b4d30741e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -726,9 +726,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
page = device_private_entry_to_page(swpent);
}
if (page) {
- int mapcount = page_mapcount(page);
-
- if (mapcount >= 2)
+ if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte))
mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
else
mss->private_hugetlb += huge_page_size(hstate_vma(vma));
@@ -887,7 +885,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v)
last_vma_end = vma->vm_end;
}
- show_vma_header_prefix(m, priv->mm->mmap->vm_start,
+ show_vma_header_prefix(m, priv->mm->mmap ? priv->mm->mmap->vm_start : 0,
last_vma_end, 0, 0, 0, 0);
seq_pad(m, ' ');
seq_puts(m, "[rollup]\n");
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 8f0369aad22a..9fe46cc26403 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -118,6 +118,7 @@ config PSTORE_CONSOLE
config PSTORE_PMSG
bool "Log user space messages"
depends on PSTORE
+ select RT_MUTEXES
help
When the option is enabled, pstore will export a character
interface /dev/pmsg0 to log user space messages. On reboot
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 013486b5125e..9f83d8eba0e6 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -563,6 +563,7 @@ static int ramoops_init_przs(const char *name,
}
zone_sz = mem_sz / *cnt;
+ zone_sz = ALIGN_DOWN(zone_sz, 2);
if (!zone_sz) {
dev_err(dev, "%s zone size == 0\n", name);
goto fail;
@@ -759,6 +760,7 @@ static int ramoops_probe(struct platform_device *pdev)
/* Make sure we didn't get bogus platform data pointer. */
if (!pdata) {
pr_err("NULL platform data\n");
+ err = -EINVAL;
goto fail_out;
}
@@ -766,6 +768,7 @@ static int ramoops_probe(struct platform_device *pdev)
!pdata->ftrace_size && !pdata->pmsg_size)) {
pr_err("The memory size and the record/console size must be "
"non-zero\n");
+ err = -EINVAL;
goto fail_out;
}
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 1f4d8c06f9be..679b250a3912 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -190,7 +190,7 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz,
{
int numerr;
struct persistent_ram_buffer *buffer = prz->buffer;
- int ecc_blocks;
+ size_t ecc_blocks;
size_t ecc_total;
if (!ecc_info || !ecc_info->ecc_size)
@@ -427,7 +427,11 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size,
phys_addr_t addr = page_start + i * PAGE_SIZE;
pages[i] = pfn_to_page(addr >> PAGE_SHIFT);
}
- vaddr = vmap(pages, page_count, VM_MAP, prot);
+ /*
+ * VM_IOREMAP used here to bypass this region during vread()
+ * and kmap_atomic() (i.e. kcore) to avoid __va() failures.
+ */
+ vaddr = vmap(pages, page_count, VM_MAP | VM_IOREMAP, prot);
kfree(pages);
/*
@@ -502,7 +506,7 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig,
sig ^= PERSISTENT_RAM_SIG;
if (prz->buffer->sig == sig) {
- if (buffer_size(prz) == 0) {
+ if (buffer_size(prz) == 0 && buffer_start(prz) == 0) {
pr_debug("found existing empty buffer\n");
return 0;
}
@@ -575,6 +579,8 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
raw_spin_lock_init(&prz->buffer_lock);
prz->flags = flags;
prz->label = kstrdup(label, GFP_KERNEL);
+ if (!prz->label)
+ goto err;
ret = persistent_ram_buffer_map(start, size, prz, memtype);
if (ret)
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index dc5f8654b277..a7ddb874912d 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -223,18 +223,26 @@ static void put_quota_format(struct quota_format_type *fmt)
/*
* Dquot List Management:
- * The quota code uses four lists for dquot management: the inuse_list,
- * free_dquots, dqi_dirty_list, and dquot_hash[] array. A single dquot
- * structure may be on some of those lists, depending on its current state.
+ * The quota code uses five lists for dquot management: the inuse_list,
+ * releasing_dquots, free_dquots, dqi_dirty_list, and dquot_hash[] array.
+ * A single dquot structure may be on some of those lists, depending on
+ * its current state.
*
* All dquots are placed to the end of inuse_list when first created, and this
* list is used for invalidate operation, which must look at every dquot.
*
- * Unused dquots (dq_count == 0) are added to the free_dquots list when freed,
- * and this list is searched whenever we need an available dquot. Dquots are
- * removed from the list as soon as they are used again, and
- * dqstats.free_dquots gives the number of dquots on the list. When
- * dquot is invalidated it's completely released from memory.
+ * When the last reference of a dquot is dropped, the dquot is added to
+ * releasing_dquots. We'll then queue work item which will call
+ * synchronize_srcu() and after that perform the final cleanup of all the
+ * dquots on the list. Each cleaned up dquot is moved to free_dquots list.
+ * Both releasing_dquots and free_dquots use the dq_free list_head in the dquot
+ * struct.
+ *
+ * Unused and cleaned up dquots are in the free_dquots list and this list is
+ * searched whenever we need an available dquot. Dquots are removed from the
+ * list as soon as they are used again and dqstats.free_dquots gives the number
+ * of dquots on the list. When dquot is invalidated it's completely released
+ * from memory.
*
* Dirty dquots are added to the dqi_dirty_list of quota_info when mark
* dirtied, and this list is searched when writing dirty dquots back to
@@ -248,6 +256,7 @@ static void put_quota_format(struct quota_format_type *fmt)
static LIST_HEAD(inuse_list);
static LIST_HEAD(free_dquots);
+static LIST_HEAD(releasing_dquots);
static unsigned int dq_hash_bits, dq_hash_mask;
static struct hlist_head *dquot_hash;
@@ -258,6 +267,9 @@ static qsize_t inode_get_rsv_space(struct inode *inode);
static qsize_t __inode_get_rsv_space(struct inode *inode);
static int __dquot_initialize(struct inode *inode, int type);
+static void quota_release_workfn(struct work_struct *work);
+static DECLARE_DELAYED_WORK(quota_release_work, quota_release_workfn);
+
static inline unsigned int
hashfn(const struct super_block *sb, struct kqid qid)
{
@@ -305,12 +317,21 @@ static inline void put_dquot_last(struct dquot *dquot)
dqstats_inc(DQST_FREE_DQUOTS);
}
+static inline void put_releasing_dquots(struct dquot *dquot)
+{
+ list_add_tail(&dquot->dq_free, &releasing_dquots);
+ set_bit(DQ_RELEASING_B, &dquot->dq_flags);
+}
+
static inline void remove_free_dquot(struct dquot *dquot)
{
if (list_empty(&dquot->dq_free))
return;
list_del_init(&dquot->dq_free);
- dqstats_dec(DQST_FREE_DQUOTS);
+ if (!test_bit(DQ_RELEASING_B, &dquot->dq_flags))
+ dqstats_dec(DQST_FREE_DQUOTS);
+ else
+ clear_bit(DQ_RELEASING_B, &dquot->dq_flags);
}
static inline void put_inuse(struct dquot *dquot)
@@ -336,6 +357,11 @@ static void wait_on_dquot(struct dquot *dquot)
mutex_unlock(&dquot->dq_lock);
}
+static inline int dquot_active(struct dquot *dquot)
+{
+ return test_bit(DQ_ACTIVE_B, &dquot->dq_flags);
+}
+
static inline int dquot_dirty(struct dquot *dquot)
{
return test_bit(DQ_MOD_B, &dquot->dq_flags);
@@ -351,14 +377,14 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
{
int ret = 1;
- if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+ if (!dquot_active(dquot))
return 0;
if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NOLIST_DIRTY)
return test_and_set_bit(DQ_MOD_B, &dquot->dq_flags);
/* If quota is dirty already, we don't have to acquire dq_list_lock */
- if (test_bit(DQ_MOD_B, &dquot->dq_flags))
+ if (dquot_dirty(dquot))
return 1;
spin_lock(&dq_list_lock);
@@ -373,15 +399,17 @@ int dquot_mark_dquot_dirty(struct dquot *dquot)
EXPORT_SYMBOL(dquot_mark_dquot_dirty);
/* Dirtify all the dquots - this can block when journalling */
-static inline int mark_all_dquot_dirty(struct dquot * const *dquot)
+static inline int mark_all_dquot_dirty(struct dquot __rcu * const *dquots)
{
int ret, err, cnt;
+ struct dquot *dquot;
ret = err = 0;
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (dquot[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (dquot)
/* Even in case of error we have to continue */
- ret = mark_dquot_dirty(dquot[cnt]);
+ ret = mark_dquot_dirty(dquot);
if (!err)
err = ret;
}
@@ -438,7 +466,7 @@ int dquot_acquire(struct dquot *dquot)
smp_mb__before_atomic();
set_bit(DQ_READ_B, &dquot->dq_flags);
/* Instantiate dquot if needed */
- if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags) && !dquot->dq_off) {
+ if (!dquot_active(dquot) && !dquot->dq_off) {
ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
/* Write the info if needed */
if (info_dirty(&dqopt->info[dquot->dq_id.type])) {
@@ -477,7 +505,7 @@ int dquot_commit(struct dquot *dquot)
goto out_lock;
/* Inactive dquot can be only if there was error during read/init
* => we have better not writing it */
- if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+ if (dquot_active(dquot))
ret = dqopt->ops[dquot->dq_id.type]->commit_dqblk(dquot);
else
ret = -EIO;
@@ -538,6 +566,8 @@ static void invalidate_dquots(struct super_block *sb, int type)
struct dquot *dquot, *tmp;
restart:
+ flush_delayed_work(&quota_release_work);
+
spin_lock(&dq_list_lock);
list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) {
if (dquot->dq_sb != sb)
@@ -546,7 +576,7 @@ restart:
continue;
/* Wait for dquot users */
if (atomic_read(&dquot->dq_count)) {
- dqgrab(dquot);
+ atomic_inc(&dquot->dq_count);
spin_unlock(&dq_list_lock);
/*
* Once dqput() wakes us up, we know it's time to free
@@ -565,6 +595,15 @@ restart:
goto restart;
}
/*
+ * The last user already dropped its reference but dquot didn't
+ * get fully cleaned up yet. Restart the scan which flushes the
+ * work cleaning up released dquots.
+ */
+ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
+ spin_unlock(&dq_list_lock);
+ goto restart;
+ }
+ /*
* Quota now has no users and it has been written on last
* dqput()
*/
@@ -588,14 +627,13 @@ int dquot_scan_active(struct super_block *sb,
spin_lock(&dq_list_lock);
list_for_each_entry(dquot, &inuse_list, dq_inuse) {
- if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+ if (!dquot_active(dquot))
continue;
if (dquot->dq_sb != sb)
continue;
/* Now we have active dquot so we can just increase use count */
atomic_inc(&dquot->dq_count);
spin_unlock(&dq_list_lock);
- dqstats_inc(DQST_LOOKUPS);
dqput(old_dquot);
old_dquot = dquot;
/*
@@ -604,7 +642,7 @@ int dquot_scan_active(struct super_block *sb,
* outstanding call and recheck the DQ_ACTIVE_B after that.
*/
wait_on_dquot(dquot);
- if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+ if (dquot_active(dquot)) {
ret = fn(dquot, priv);
if (ret < 0)
goto out;
@@ -620,6 +658,18 @@ out:
}
EXPORT_SYMBOL(dquot_scan_active);
+static inline int dquot_write_dquot(struct dquot *dquot)
+{
+ int ret = dquot->dq_sb->dq_op->write_dquot(dquot);
+ if (ret < 0) {
+ quota_error(dquot->dq_sb, "Can't write quota structure "
+ "(error %d). Quota may get out of sync!", ret);
+ /* Clear dirty bit anyway to avoid infinite loop. */
+ clear_dquot_dirty(dquot);
+ }
+ return ret;
+}
+
/* Write all dquot structures to quota files */
int dquot_writeback_dquots(struct super_block *sb, int type)
{
@@ -643,24 +693,23 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
dquot = list_first_entry(&dirty, struct dquot,
dq_dirty);
- WARN_ON(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags));
+ WARN_ON(!dquot_active(dquot));
+ /* If the dquot is releasing we should not touch it */
+ if (test_bit(DQ_RELEASING_B, &dquot->dq_flags)) {
+ spin_unlock(&dq_list_lock);
+ flush_delayed_work(&quota_release_work);
+ spin_lock(&dq_list_lock);
+ continue;
+ }
/* Now we have active dquot from which someone is
* holding reference so we can safely just increase
* use count */
dqgrab(dquot);
spin_unlock(&dq_list_lock);
- dqstats_inc(DQST_LOOKUPS);
- err = sb->dq_op->write_dquot(dquot);
- if (err) {
- /*
- * Clear dirty bit anyway to avoid infinite
- * loop here.
- */
- clear_dquot_dirty(dquot);
- if (!ret)
- ret = err;
- }
+ err = dquot_write_dquot(dquot);
+ if (err && !ret)
+ ret = err;
dqput(dquot);
spin_lock(&dq_list_lock);
}
@@ -754,12 +803,52 @@ static struct shrinker dqcache_shrinker = {
};
/*
+ * Safely release dquot and put reference to dquot.
+ */
+static void quota_release_workfn(struct work_struct *work)
+{
+ struct dquot *dquot;
+ struct list_head rls_head;
+
+ spin_lock(&dq_list_lock);
+ /* Exchange the list head to avoid livelock. */
+ list_replace_init(&releasing_dquots, &rls_head);
+ spin_unlock(&dq_list_lock);
+ synchronize_srcu(&dquot_srcu);
+
+restart:
+ spin_lock(&dq_list_lock);
+ while (!list_empty(&rls_head)) {
+ dquot = list_first_entry(&rls_head, struct dquot, dq_free);
+ WARN_ON_ONCE(atomic_read(&dquot->dq_count));
+ /*
+ * Note that DQ_RELEASING_B protects us from racing with
+ * invalidate_dquots() calls so we are safe to work with the
+ * dquot even after we drop dq_list_lock.
+ */
+ if (dquot_dirty(dquot)) {
+ spin_unlock(&dq_list_lock);
+ /* Commit dquot before releasing */
+ dquot_write_dquot(dquot);
+ goto restart;
+ }
+ if (dquot_active(dquot)) {
+ spin_unlock(&dq_list_lock);
+ dquot->dq_sb->dq_op->release_dquot(dquot);
+ goto restart;
+ }
+ /* Dquot is inactive and clean, now move it to free list */
+ remove_free_dquot(dquot);
+ put_dquot_last(dquot);
+ }
+ spin_unlock(&dq_list_lock);
+}
+
+/*
* Put reference to dquot
*/
void dqput(struct dquot *dquot)
{
- int ret;
-
if (!dquot)
return;
#ifdef CONFIG_QUOTA_DEBUG
@@ -771,7 +860,7 @@ void dqput(struct dquot *dquot)
}
#endif
dqstats_inc(DQST_DROPS);
-we_slept:
+
spin_lock(&dq_list_lock);
if (atomic_read(&dquot->dq_count) > 1) {
/* We have more than one user... nothing to do */
@@ -783,35 +872,16 @@ we_slept:
spin_unlock(&dq_list_lock);
return;
}
+
/* Need to release dquot? */
- if (dquot_dirty(dquot)) {
- spin_unlock(&dq_list_lock);
- /* Commit dquot before releasing */
- ret = dquot->dq_sb->dq_op->write_dquot(dquot);
- if (ret < 0) {
- quota_error(dquot->dq_sb, "Can't write quota structure"
- " (error %d). Quota may get out of sync!",
- ret);
- /*
- * We clear dirty bit anyway, so that we avoid
- * infinite loop here
- */
- clear_dquot_dirty(dquot);
- }
- goto we_slept;
- }
- if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
- spin_unlock(&dq_list_lock);
- dquot->dq_sb->dq_op->release_dquot(dquot);
- goto we_slept;
- }
- atomic_dec(&dquot->dq_count);
#ifdef CONFIG_QUOTA_DEBUG
/* sanity check */
BUG_ON(!list_empty(&dquot->dq_free));
#endif
- put_dquot_last(dquot);
+ put_releasing_dquots(dquot);
+ atomic_dec(&dquot->dq_count);
spin_unlock(&dq_list_lock);
+ queue_delayed_work(system_unbound_wq, &quota_release_work, 1);
}
EXPORT_SYMBOL(dqput);
@@ -898,10 +968,10 @@ we_slept:
dqstats_inc(DQST_LOOKUPS);
}
/* Wait for dq_lock - after this we know that either dquot_release() is
- * already finished or it will be canceled due to dq_count > 1 test */
+ * already finished or it will be canceled due to dq_count > 0 test */
wait_on_dquot(dquot);
/* Read the dquot / allocate space in quota file */
- if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+ if (!dquot_active(dquot)) {
int err;
err = sb->dq_op->acquire_dquot(dquot);
@@ -927,14 +997,15 @@ out:
}
EXPORT_SYMBOL(dqget);
-static inline struct dquot **i_dquot(struct inode *inode)
+static inline struct dquot __rcu **i_dquot(struct inode *inode)
{
- return inode->i_sb->s_op->get_dquots(inode);
+ /* Force __rcu for now until filesystems are fixed */
+ return (struct dquot __rcu **)inode->i_sb->s_op->get_dquots(inode);
}
static int dqinit_needed(struct inode *inode, int type)
{
- struct dquot * const *dquots;
+ struct dquot __rcu * const *dquots;
int cnt;
if (IS_NOQUOTA(inode))
@@ -1007,59 +1078,7 @@ out:
return err;
}
-/*
- * Remove references to dquots from inode and add dquot to list for freeing
- * if we have the last reference to dquot
- */
-static void remove_inode_dquot_ref(struct inode *inode, int type,
- struct list_head *tofree_head)
-{
- struct dquot **dquots = i_dquot(inode);
- struct dquot *dquot = dquots[type];
-
- if (!dquot)
- return;
-
- dquots[type] = NULL;
- if (list_empty(&dquot->dq_free)) {
- /*
- * The inode still has reference to dquot so it can't be in the
- * free list
- */
- spin_lock(&dq_list_lock);
- list_add(&dquot->dq_free, tofree_head);
- spin_unlock(&dq_list_lock);
- } else {
- /*
- * Dquot is already in a list to put so we won't drop the last
- * reference here.
- */
- dqput(dquot);
- }
-}
-
-/*
- * Free list of dquots
- * Dquots are removed from inodes and no new references can be got so we are
- * the only ones holding reference
- */
-static void put_dquot_list(struct list_head *tofree_head)
-{
- struct list_head *act_head;
- struct dquot *dquot;
-
- act_head = tofree_head->next;
- while (act_head != tofree_head) {
- dquot = list_entry(act_head, struct dquot, dq_free);
- act_head = act_head->next;
- /* Remove dquot from the list so we won't have problems... */
- list_del_init(&dquot->dq_free);
- dqput(dquot);
- }
-}
-
-static void remove_dquot_ref(struct super_block *sb, int type,
- struct list_head *tofree_head)
+static void remove_dquot_ref(struct super_block *sb, int type)
{
struct inode *inode;
#ifdef CONFIG_QUOTA_DEBUG
@@ -1076,11 +1095,18 @@ static void remove_dquot_ref(struct super_block *sb, int type,
*/
spin_lock(&dq_data_lock);
if (!IS_NOQUOTA(inode)) {
+ struct dquot __rcu **dquots = i_dquot(inode);
+ struct dquot *dquot = srcu_dereference_check(
+ dquots[type], &dquot_srcu,
+ lockdep_is_held(&dq_data_lock));
+
#ifdef CONFIG_QUOTA_DEBUG
if (unlikely(inode_get_rsv_space(inode) > 0))
reserved = 1;
#endif
- remove_inode_dquot_ref(inode, type, tofree_head);
+ rcu_assign_pointer(dquots[type], NULL);
+ if (dquot)
+ dqput(dquot);
}
spin_unlock(&dq_data_lock);
}
@@ -1097,13 +1123,8 @@ static void remove_dquot_ref(struct super_block *sb, int type,
/* Gather all references from inodes and drop them */
static void drop_dquot_ref(struct super_block *sb, int type)
{
- LIST_HEAD(tofree_head);
-
- if (sb->dq_op) {
- remove_dquot_ref(sb, type, &tofree_head);
- synchronize_srcu(&dquot_srcu);
- put_dquot_list(&tofree_head);
- }
+ if (sb->dq_op)
+ remove_dquot_ref(sb, type);
}
static inline
@@ -1418,7 +1439,7 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
return QUOTA_NL_NOWARN;
}
-static int dquot_active(const struct inode *inode)
+static int inode_quota_active(const struct inode *inode)
{
struct super_block *sb = inode->i_sb;
@@ -1436,12 +1457,13 @@ static int dquot_active(const struct inode *inode)
static int __dquot_initialize(struct inode *inode, int type)
{
int cnt, init_needed = 0;
- struct dquot **dquots, *got[MAXQUOTAS] = {};
+ struct dquot __rcu **dquots;
+ struct dquot *got[MAXQUOTAS] = {};
struct super_block *sb = inode->i_sb;
qsize_t rsv;
int ret = 0;
- if (!dquot_active(inode))
+ if (!inode_quota_active(inode))
return 0;
dquots = i_dquot(inode);
@@ -1511,7 +1533,7 @@ static int __dquot_initialize(struct inode *inode, int type)
if (!got[cnt])
continue;
if (!dquots[cnt]) {
- dquots[cnt] = got[cnt];
+ rcu_assign_pointer(dquots[cnt], got[cnt]);
got[cnt] = NULL;
/*
* Make quota reservation system happy if someone
@@ -1519,12 +1541,16 @@ static int __dquot_initialize(struct inode *inode, int type)
*/
rsv = inode_get_rsv_space(inode);
if (unlikely(rsv)) {
+ struct dquot *dquot = srcu_dereference_check(
+ dquots[cnt], &dquot_srcu,
+ lockdep_is_held(&dq_data_lock));
+
spin_lock(&inode->i_lock);
/* Get reservation again under proper lock */
rsv = __inode_get_rsv_space(inode);
- spin_lock(&dquots[cnt]->dq_dqb_lock);
- dquots[cnt]->dq_dqb.dqb_rsvspace += rsv;
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ spin_lock(&dquot->dq_dqb_lock);
+ dquot->dq_dqb.dqb_rsvspace += rsv;
+ spin_unlock(&dquot->dq_dqb_lock);
spin_unlock(&inode->i_lock);
}
}
@@ -1546,10 +1572,10 @@ EXPORT_SYMBOL(dquot_initialize);
bool dquot_initialize_needed(struct inode *inode)
{
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
int i;
- if (!dquot_active(inode))
+ if (!inode_quota_active(inode))
return false;
dquots = i_dquot(inode);
@@ -1571,13 +1597,14 @@ EXPORT_SYMBOL(dquot_initialize_needed);
static void __dquot_drop(struct inode *inode)
{
int cnt;
- struct dquot **dquots = i_dquot(inode);
+ struct dquot __rcu **dquots = i_dquot(inode);
struct dquot *put[MAXQUOTAS];
spin_lock(&dq_data_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- put[cnt] = dquots[cnt];
- dquots[cnt] = NULL;
+ put[cnt] = srcu_dereference_check(dquots[cnt], &dquot_srcu,
+ lockdep_is_held(&dq_data_lock));
+ rcu_assign_pointer(dquots[cnt], NULL);
}
spin_unlock(&dq_data_lock);
dqput_all(put);
@@ -1585,7 +1612,7 @@ static void __dquot_drop(struct inode *inode)
void dquot_drop(struct inode *inode)
{
- struct dquot * const *dquots;
+ struct dquot __rcu * const *dquots;
int cnt;
if (IS_NOQUOTA(inode))
@@ -1658,9 +1685,10 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
int cnt, ret = 0, index;
struct dquot_warn warn[MAXQUOTAS];
int reserve = flags & DQUOT_SPACE_RESERVE;
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
+ struct dquot *dquot;
- if (!dquot_active(inode)) {
+ if (!inode_quota_active(inode)) {
if (reserve) {
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) += number;
@@ -1678,27 +1706,26 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
index = srcu_read_lock(&dquot_srcu);
spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
if (reserve) {
- ret = dquot_add_space(dquots[cnt], 0, number, flags,
- &warn[cnt]);
+ ret = dquot_add_space(dquot, 0, number, flags, &warn[cnt]);
} else {
- ret = dquot_add_space(dquots[cnt], number, 0, flags,
- &warn[cnt]);
+ ret = dquot_add_space(dquot, number, 0, flags, &warn[cnt]);
}
if (ret) {
/* Back out changes we already did */
for (cnt--; cnt >= 0; cnt--) {
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
- spin_lock(&dquots[cnt]->dq_dqb_lock);
+ spin_lock(&dquot->dq_dqb_lock);
if (reserve)
- dquot_free_reserved_space(dquots[cnt],
- number);
+ dquot_free_reserved_space(dquot, number);
else
- dquot_decr_space(dquots[cnt], number);
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ dquot_decr_space(dquot, number);
+ spin_unlock(&dquot->dq_dqb_lock);
}
spin_unlock(&inode->i_lock);
goto out_flush_warn;
@@ -1728,9 +1755,10 @@ int dquot_alloc_inode(struct inode *inode)
{
int cnt, ret = 0, index;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot * const *dquots;
+ struct dquot __rcu * const *dquots;
+ struct dquot *dquot;
- if (!dquot_active(inode))
+ if (!inode_quota_active(inode))
return 0;
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
warn[cnt].w_type = QUOTA_NL_NOWARN;
@@ -1739,17 +1767,19 @@ int dquot_alloc_inode(struct inode *inode)
index = srcu_read_lock(&dquot_srcu);
spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
- ret = dquot_add_inodes(dquots[cnt], 1, &warn[cnt]);
+ ret = dquot_add_inodes(dquot, 1, &warn[cnt]);
if (ret) {
for (cnt--; cnt >= 0; cnt--) {
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
/* Back out changes we already did */
- spin_lock(&dquots[cnt]->dq_dqb_lock);
- dquot_decr_inodes(dquots[cnt], 1);
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ spin_lock(&dquot->dq_dqb_lock);
+ dquot_decr_inodes(dquot, 1);
+ spin_unlock(&dquot->dq_dqb_lock);
}
goto warn_put_all;
}
@@ -1770,10 +1800,11 @@ EXPORT_SYMBOL(dquot_alloc_inode);
*/
int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
+ struct dquot *dquot;
int cnt, index;
- if (!dquot_active(inode)) {
+ if (!inode_quota_active(inode)) {
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) -= number;
__inode_add_bytes(inode, number);
@@ -1786,9 +1817,8 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
spin_lock(&inode->i_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (dquots[cnt]) {
- struct dquot *dquot = dquots[cnt];
-
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (dquot) {
spin_lock(&dquot->dq_dqb_lock);
if (WARN_ON_ONCE(dquot->dq_dqb.dqb_rsvspace < number))
number = dquot->dq_dqb.dqb_rsvspace;
@@ -1812,10 +1842,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
*/
void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
{
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
+ struct dquot *dquot;
int cnt, index;
- if (!dquot_active(inode)) {
+ if (!inode_quota_active(inode)) {
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) += number;
__inode_sub_bytes(inode, number);
@@ -1828,9 +1859,8 @@ void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number)
spin_lock(&inode->i_lock);
/* Claim reserved quotas to allocated quotas */
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
- if (dquots[cnt]) {
- struct dquot *dquot = dquots[cnt];
-
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (dquot) {
spin_lock(&dquot->dq_dqb_lock);
if (WARN_ON_ONCE(dquot->dq_dqb.dqb_curspace < number))
number = dquot->dq_dqb.dqb_curspace;
@@ -1856,10 +1886,11 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
{
unsigned int cnt;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot **dquots;
+ struct dquot __rcu **dquots;
+ struct dquot *dquot;
int reserve = flags & DQUOT_SPACE_RESERVE, index;
- if (!dquot_active(inode)) {
+ if (!inode_quota_active(inode)) {
if (reserve) {
spin_lock(&inode->i_lock);
*inode_reserved_space(inode) -= number;
@@ -1877,17 +1908,18 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
int wtype;
warn[cnt].w_type = QUOTA_NL_NOWARN;
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
- spin_lock(&dquots[cnt]->dq_dqb_lock);
- wtype = info_bdq_free(dquots[cnt], number);
+ spin_lock(&dquot->dq_dqb_lock);
+ wtype = info_bdq_free(dquot, number);
if (wtype != QUOTA_NL_NOWARN)
- prepare_warning(&warn[cnt], dquots[cnt], wtype);
+ prepare_warning(&warn[cnt], dquot, wtype);
if (reserve)
- dquot_free_reserved_space(dquots[cnt], number);
+ dquot_free_reserved_space(dquot, number);
else
- dquot_decr_space(dquots[cnt], number);
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ dquot_decr_space(dquot, number);
+ spin_unlock(&dquot->dq_dqb_lock);
}
if (reserve)
*inode_reserved_space(inode) -= number;
@@ -1911,10 +1943,11 @@ void dquot_free_inode(struct inode *inode)
{
unsigned int cnt;
struct dquot_warn warn[MAXQUOTAS];
- struct dquot * const *dquots;
+ struct dquot __rcu * const *dquots;
+ struct dquot *dquot;
int index;
- if (!dquot_active(inode))
+ if (!inode_quota_active(inode))
return;
dquots = i_dquot(inode);
@@ -1922,16 +1955,16 @@ void dquot_free_inode(struct inode *inode)
spin_lock(&inode->i_lock);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
int wtype;
-
warn[cnt].w_type = QUOTA_NL_NOWARN;
- if (!dquots[cnt])
+ dquot = srcu_dereference(dquots[cnt], &dquot_srcu);
+ if (!dquot)
continue;
- spin_lock(&dquots[cnt]->dq_dqb_lock);
- wtype = info_idq_free(dquots[cnt], 1);
+ spin_lock(&dquot->dq_dqb_lock);
+ wtype = info_idq_free(dquot, 1);
if (wtype != QUOTA_NL_NOWARN)
- prepare_warning(&warn[cnt], dquots[cnt], wtype);
- dquot_decr_inodes(dquots[cnt], 1);
- spin_unlock(&dquots[cnt]->dq_dqb_lock);
+ prepare_warning(&warn[cnt], dquot, wtype);
+ dquot_decr_inodes(dquot, 1);
+ spin_unlock(&dquot->dq_dqb_lock);
}
spin_unlock(&inode->i_lock);
mark_all_dquot_dirty(dquots);
@@ -1957,8 +1990,9 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
qsize_t cur_space;
qsize_t rsv_space = 0;
qsize_t inode_usage = 1;
+ struct dquot __rcu **dquots;
struct dquot *transfer_from[MAXQUOTAS] = {};
- int cnt, ret = 0;
+ int cnt, index, ret = 0;
char is_valid[MAXQUOTAS] = {};
struct dquot_warn warn_to[MAXQUOTAS];
struct dquot_warn warn_from_inodes[MAXQUOTAS];
@@ -1989,6 +2023,7 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
}
cur_space = __inode_get_bytes(inode);
rsv_space = __inode_get_rsv_space(inode);
+ dquots = i_dquot(inode);
/*
* Build the transfer_from list, check limits, and update usage in
* the target structures.
@@ -2003,7 +2038,8 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
if (!sb_has_quota_active(inode->i_sb, cnt))
continue;
is_valid[cnt] = 1;
- transfer_from[cnt] = i_dquot(inode)[cnt];
+ transfer_from[cnt] = srcu_dereference_check(dquots[cnt],
+ &dquot_srcu, lockdep_is_held(&dq_data_lock));
ret = dquot_add_inodes(transfer_to[cnt], inode_usage,
&warn_to[cnt]);
if (ret)
@@ -2042,13 +2078,21 @@ int __dquot_transfer(struct inode *inode, struct dquot **transfer_to)
rsv_space);
spin_unlock(&transfer_from[cnt]->dq_dqb_lock);
}
- i_dquot(inode)[cnt] = transfer_to[cnt];
+ rcu_assign_pointer(dquots[cnt], transfer_to[cnt]);
}
spin_unlock(&inode->i_lock);
spin_unlock(&dq_data_lock);
- mark_all_dquot_dirty(transfer_from);
- mark_all_dquot_dirty(transfer_to);
+ /*
+ * These arrays are local and we hold dquot references so we don't need
+ * the srcu protection but still take dquot_srcu to avoid warning in
+ * mark_all_dquot_dirty().
+ */
+ index = srcu_read_lock(&dquot_srcu);
+ mark_all_dquot_dirty((struct dquot __rcu **)transfer_from);
+ mark_all_dquot_dirty((struct dquot __rcu **)transfer_to);
+ srcu_read_unlock(&dquot_srcu, index);
+
flush_warnings(warn_to);
flush_warnings(warn_from_inodes);
flush_warnings(warn_from_space);
@@ -2085,7 +2129,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
struct super_block *sb = inode->i_sb;
int ret;
- if (!dquot_active(inode))
+ if (!inode_quota_active(inode))
return 0;
if (iattr->ia_valid & ATTR_UID && !uid_eq(iattr->ia_uid, inode->i_uid)){
@@ -2306,28 +2350,76 @@ EXPORT_SYMBOL(dquot_quota_off);
* Turn quotas on on a device
*/
-/*
- * Helper function to turn quotas on when we already have the inode of
- * quota file and no quota information is loaded.
- */
-static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
+static int vfs_setup_quota_inode(struct inode *inode, int type)
+{
+ struct super_block *sb = inode->i_sb;
+ struct quota_info *dqopt = sb_dqopt(sb);
+
+ if (is_bad_inode(inode))
+ return -EUCLEAN;
+ if (!S_ISREG(inode->i_mode))
+ return -EACCES;
+ if (IS_RDONLY(inode))
+ return -EROFS;
+ if (sb_has_quota_loaded(sb, type))
+ return -EBUSY;
+
+ /*
+ * Quota files should never be encrypted. They should be thought of as
+ * filesystem metadata, not user data. New-style internal quota files
+ * cannot be encrypted by users anyway, but old-style external quota
+ * files could potentially be incorrectly created in an encrypted
+ * directory, hence this explicit check. Some reasons why encrypted
+ * quota files don't work include: (1) some filesystems that support
+ * encryption don't handle it in their quota_read and quota_write, and
+ * (2) cleaning up encrypted quota files at unmount would need special
+ * consideration, as quota files are cleaned up later than user files.
+ */
+ if (IS_ENCRYPTED(inode))
+ return -EINVAL;
+
+ dqopt->files[type] = igrab(inode);
+ if (!dqopt->files[type])
+ return -EIO;
+ if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+ /* We don't want quota and atime on quota files (deadlocks
+ * possible) Also nobody should write to the file - we use
+ * special IO operations which ignore the immutable bit. */
+ inode_lock(inode);
+ inode->i_flags |= S_NOQUOTA;
+ inode_unlock(inode);
+ /*
+ * When S_NOQUOTA is set, remove dquot references as no more
+ * references can be added
+ */
+ __dquot_drop(inode);
+ }
+ return 0;
+}
+
+static void vfs_cleanup_quota_inode(struct super_block *sb, int type)
+{
+ struct quota_info *dqopt = sb_dqopt(sb);
+ struct inode *inode = dqopt->files[type];
+
+ if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+ inode_lock(inode);
+ inode->i_flags &= ~S_NOQUOTA;
+ inode_unlock(inode);
+ }
+ dqopt->files[type] = NULL;
+ iput(inode);
+}
+
+int dquot_load_quota_sb(struct super_block *sb, int type, int format_id,
unsigned int flags)
{
struct quota_format_type *fmt = find_quota_format(format_id);
- struct super_block *sb = inode->i_sb;
struct quota_info *dqopt = sb_dqopt(sb);
int error;
if (!fmt)
return -ESRCH;
- if (!S_ISREG(inode->i_mode)) {
- error = -EACCES;
- goto out_fmt;
- }
- if (IS_RDONLY(inode)) {
- error = -EROFS;
- goto out_fmt;
- }
if (!sb->s_op->quota_write || !sb->s_op->quota_read ||
(type == PRJQUOTA && sb->dq_op->get_projid == NULL)) {
error = -EINVAL;
@@ -2359,27 +2451,9 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
invalidate_bdev(sb->s_bdev);
}
- if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
- /* We don't want quota and atime on quota files (deadlocks
- * possible) Also nobody should write to the file - we use
- * special IO operations which ignore the immutable bit. */
- inode_lock(inode);
- inode->i_flags |= S_NOQUOTA;
- inode_unlock(inode);
- /*
- * When S_NOQUOTA is set, remove dquot references as no more
- * references can be added
- */
- __dquot_drop(inode);
- }
-
- error = -EIO;
- dqopt->files[type] = igrab(inode);
- if (!dqopt->files[type])
- goto out_file_flags;
error = -EINVAL;
if (!fmt->qf_ops->check_quota_file(sb, type))
- goto out_file_init;
+ goto out_fmt;
dqopt->ops[type] = fmt->qf_ops;
dqopt->info[type].dqi_format = fmt;
@@ -2387,7 +2461,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list);
error = dqopt->ops[type]->read_file_info(sb, type);
if (error < 0)
- goto out_file_init;
+ goto out_fmt;
if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) {
spin_lock(&dq_data_lock);
dqopt->info[type].dqi_flags |= DQF_SYS_FILE;
@@ -2399,21 +2473,34 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
error = add_dquot_ref(sb, type);
if (error)
- dquot_disable(sb, type, flags);
+ dquot_disable(sb, type,
+ DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
return error;
-out_file_init:
- dqopt->files[type] = NULL;
- iput(inode);
-out_file_flags:
- inode_lock(inode);
- inode->i_flags &= ~S_NOQUOTA;
- inode_unlock(inode);
out_fmt:
put_quota_format(fmt);
return error;
}
+EXPORT_SYMBOL(dquot_load_quota_sb);
+
+/*
+ * Helper function to turn quotas on when we already have the inode of
+ * quota file and no quota information is loaded.
+ */
+static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
+ unsigned int flags)
+{
+ int err;
+
+ err = vfs_setup_quota_inode(inode, type);
+ if (err < 0)
+ return err;
+ err = dquot_load_quota_sb(inode->i_sb, type, format_id, flags);
+ if (err < 0)
+ vfs_cleanup_quota_inode(inode->i_sb, type);
+ return err;
+}
/* Reenable quotas on remount RW */
int dquot_resume(struct super_block *sb, int type)
@@ -2514,21 +2601,15 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
struct dentry *dentry;
int error;
- dentry = lookup_one_len_unlocked(qf_name, sb->s_root, strlen(qf_name));
+ dentry = lookup_positive_unlocked(qf_name, sb->s_root, strlen(qf_name));
if (IS_ERR(dentry))
return PTR_ERR(dentry);
- if (d_really_is_negative(dentry)) {
- error = -ENOENT;
- goto out;
- }
-
error = security_quota_on(dentry);
if (!error)
error = vfs_load_quota_inode(d_inode(dentry), type, format_id,
DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
-out:
dput(dentry);
return error;
}
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 1a188fbdf34e..07948f6ac84e 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -80,6 +80,35 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf)
return ret;
}
+static inline int do_check_range(struct super_block *sb, const char *val_name,
+ uint val, uint min_val, uint max_val)
+{
+ if (val < min_val || val > max_val) {
+ quota_error(sb, "Getting %s %u out of range %u-%u",
+ val_name, val, min_val, max_val);
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
+static int check_dquot_block_header(struct qtree_mem_dqinfo *info,
+ struct qt_disk_dqdbheader *dh)
+{
+ int err = 0;
+
+ err = do_check_range(info->dqi_sb, "dqdh_next_free",
+ le32_to_cpu(dh->dqdh_next_free), 0,
+ info->dqi_blocks - 1);
+ if (err)
+ return err;
+ err = do_check_range(info->dqi_sb, "dqdh_prev_free",
+ le32_to_cpu(dh->dqdh_prev_free), 0,
+ info->dqi_blocks - 1);
+
+ return err;
+}
+
/* Remove empty block from list and return it */
static int get_free_dqblk(struct qtree_mem_dqinfo *info)
{
@@ -94,6 +123,9 @@ static int get_free_dqblk(struct qtree_mem_dqinfo *info)
ret = read_blk(info, blk, buf);
if (ret < 0)
goto out_buf;
+ ret = check_dquot_block_header(info, dh);
+ if (ret)
+ goto out_buf;
info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
}
else {
@@ -241,6 +273,9 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
*err = read_blk(info, blk, buf);
if (*err < 0)
goto out_buf;
+ *err = check_dquot_block_header(info, dh);
+ if (*err)
+ goto out_buf;
} else {
blk = get_free_dqblk(info);
if ((int)blk < 0) {
@@ -433,6 +468,9 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
goto out_buf;
}
dh = (struct qt_disk_dqdbheader *)buf;
+ ret = check_dquot_block_header(info, dh);
+ if (ret)
+ goto out_buf;
le16_add_cpu(&dh->dqdh_entries, -1);
if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
ret = remove_free_dqentry(info, buf, blk);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 09ad022a78a5..b8277a5dfe40 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -2327,7 +2327,7 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
int i, j;
bh = __getblk(dev, block, bufsize);
- if (buffer_uptodate(bh))
+ if (!bh || buffer_uptodate(bh))
return (bh);
if (block + BUFNR > max_block) {
@@ -2337,6 +2337,8 @@ static struct buffer_head *reiserfs_breada(struct block_device *dev,
j = 1;
for (i = 1; i < blocks; i++) {
bh = __getblk(dev, block + i, bufsize);
+ if (!bh)
+ break;
if (buffer_uptodate(bh)) {
brelse(bh);
break;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 959a066b7bb0..2843b7cf4d7a 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -695,6 +695,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, umode_t mod
out_failed:
reiserfs_write_unlock(dir->i_sb);
+ reiserfs_security_free(&security);
return retval;
}
@@ -778,6 +779,7 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode
out_failed:
reiserfs_write_unlock(dir->i_sb);
+ reiserfs_security_free(&security);
return retval;
}
@@ -876,6 +878,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
retval = journal_end(&th);
out_failed:
reiserfs_write_unlock(dir->i_sb);
+ reiserfs_security_free(&security);
return retval;
}
@@ -1191,6 +1194,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
retval = journal_end(&th);
out_failed:
reiserfs_write_unlock(parent_dir->i_sb);
+ reiserfs_security_free(&security);
return retval;
}
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 913f5af9bf24..0ebb6e684908 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -1437,7 +1437,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
unsigned long safe_mask = 0;
unsigned int commit_max_age = (unsigned int)-1;
struct reiserfs_journal *journal = SB_JOURNAL(s);
- char *new_opts;
int err;
char *qf_names[REISERFS_MAXQUOTAS];
unsigned int qfmt = 0;
@@ -1445,10 +1444,6 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
int i;
#endif
- new_opts = kstrdup(arg, GFP_KERNEL);
- if (arg && !new_opts)
- return -ENOMEM;
-
sync_filesystem(s);
reiserfs_write_lock(s);
@@ -1599,7 +1594,6 @@ out_ok_unlocked:
out_err_unlock:
reiserfs_write_unlock(s);
out_err:
- kfree(new_opts);
return err;
}
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index 20be9a0e5870..159af6c26f4b 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -49,6 +49,7 @@ int reiserfs_security_init(struct inode *dir, struct inode *inode,
int error;
sec->name = NULL;
+ sec->value = NULL;
/* Don't add selinux attributes on xattrs - they'll never get used */
if (IS_PRIVATE(dir))
@@ -80,11 +81,15 @@ int reiserfs_security_write(struct reiserfs_transaction_handle *th,
struct inode *inode,
struct reiserfs_security_handle *sec)
{
+ char xattr_name[XATTR_NAME_MAX + 1] = XATTR_SECURITY_PREFIX;
int error;
- if (strlen(sec->name) < sizeof(XATTR_SECURITY_PREFIX))
+
+ if (XATTR_SECURITY_PREFIX_LEN + strlen(sec->name) > XATTR_NAME_MAX)
return -EINVAL;
- error = reiserfs_xattr_set_handle(th, inode, sec->name, sec->value,
+ strlcat(xattr_name, sec->name, sizeof(xattr_name));
+
+ error = reiserfs_xattr_set_handle(th, inode, xattr_name, sec->value,
sec->length, XATTR_CREATE);
if (error == -ENODATA || error == -EOPNOTSUPP)
error = 0;
@@ -94,7 +99,6 @@ int reiserfs_security_write(struct reiserfs_transaction_handle *th,
void reiserfs_security_free(struct reiserfs_security_handle *sec)
{
- kfree(sec->name);
kfree(sec->value);
sec->name = NULL;
sec->value = NULL;
diff --git a/fs/select.c b/fs/select.c
index 7716d9d5be1e..f405dc5adf3c 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -475,7 +475,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in,
wait->_key |= POLLOUT_SET;
}
-static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
+static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
ktime_t expire, *to = NULL;
struct poll_wqueues table;
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 3e94d181930f..c3415d969ecf 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -248,6 +248,7 @@ static const struct file_operations signalfd_fops = {
.poll = signalfd_poll,
.read = signalfd_read,
.llseek = noop_llseek,
+ .may_pollfree = true,
};
static int do_signalfd4(int ufd, sigset_t *mask, int flags)
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 236664d69141..ccfaf8c81958 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -183,7 +183,7 @@ static inline int squashfs_block_size(__le32 raw)
#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
sizeof(u64))
/* xattr id lookup table defines */
-#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id))
+#define SQUASHFS_XATTR_BYTES(A) (((u64) (A)) * sizeof(struct squashfs_xattr_id))
#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
SQUASHFS_METADATA_SIZE)
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 166e98806265..8f9445e290e7 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -63,7 +63,7 @@ struct squashfs_sb_info {
long long bytes_used;
unsigned int inodes;
unsigned int fragments;
- int xattr_ids;
+ unsigned int xattr_ids;
unsigned int ids;
};
#endif
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
index d8a270d3ac4c..f1a463d8bfa0 100644
--- a/fs/squashfs/xattr.h
+++ b/fs/squashfs/xattr.h
@@ -10,12 +10,12 @@
#ifdef CONFIG_SQUASHFS_XATTR
extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
- u64 *, int *);
+ u64 *, unsigned int *);
extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
unsigned int *, unsigned long long *);
#else
static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
- u64 start, u64 *xattr_table_start, int *xattr_ids)
+ u64 start, u64 *xattr_table_start, unsigned int *xattr_ids)
{
struct squashfs_xattr_id_table *id_table;
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
index 087cab8c78f4..c8469c656e0d 100644
--- a/fs/squashfs/xattr_id.c
+++ b/fs/squashfs/xattr_id.c
@@ -56,7 +56,7 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
* Read uncompressed xattr id lookup table indexes from disk into memory
*/
__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
- u64 *xattr_table_start, int *xattr_ids)
+ u64 *xattr_table_start, unsigned int *xattr_ids)
{
struct squashfs_sb_info *msblk = sb->s_fs_info;
unsigned int len, indexes;
diff --git a/fs/statfs.c b/fs/statfs.c
index 2616424012ea..4960c69cc47c 100644
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -128,6 +128,7 @@ static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
if (sizeof(buf) == sizeof(*st))
memcpy(&buf, st, sizeof(*st));
else {
+ memset(&buf, 0, sizeof(buf));
if (sizeof buf.f_blocks == 4) {
if ((st->f_blocks | st->f_bfree | st->f_bavail |
st->f_bsize | st->f_frsize) &
@@ -156,7 +157,6 @@ static int do_statfs_native(struct kstatfs *st, struct statfs __user *p)
buf.f_namelen = st->f_namelen;
buf.f_frsize = st->f_frsize;
buf.f_flags = st->f_flags;
- memset(buf.f_spare, 0, sizeof(buf.f_spare));
}
if (copy_to_user(p, &buf, sizeof(buf)))
return -EFAULT;
@@ -169,6 +169,7 @@ static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
if (sizeof(buf) == sizeof(*st))
memcpy(&buf, st, sizeof(*st));
else {
+ memset(&buf, 0, sizeof(buf));
buf.f_type = st->f_type;
buf.f_bsize = st->f_bsize;
buf.f_blocks = st->f_blocks;
@@ -180,7 +181,6 @@ static int do_statfs64(struct kstatfs *st, struct statfs64 __user *p)
buf.f_namelen = st->f_namelen;
buf.f_frsize = st->f_frsize;
buf.f_flags = st->f_flags;
- memset(buf.f_spare, 0, sizeof(buf.f_spare));
}
if (copy_to_user(p, &buf, sizeof(buf)))
return -EFAULT;
diff --git a/fs/super.c b/fs/super.c
index e255c18fa2c8..47ca7dc0e6c3 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -905,6 +905,7 @@ int reconfigure_super(struct fs_context *fc)
struct super_block *sb = fc->root->d_sb;
int retval;
bool remount_ro = false;
+ bool remount_rw = false;
bool force = fc->sb_flags & SB_FORCE;
if (fc->sb_flags_mask & ~MS_RMT_MASK)
@@ -921,7 +922,7 @@ int reconfigure_super(struct fs_context *fc)
if (!(fc->sb_flags & SB_RDONLY) && bdev_read_only(sb->s_bdev))
return -EACCES;
#endif
-
+ remount_rw = !(fc->sb_flags & SB_RDONLY) && sb_rdonly(sb);
remount_ro = (fc->sb_flags & SB_RDONLY) && !sb_rdonly(sb);
}
@@ -951,6 +952,14 @@ int reconfigure_super(struct fs_context *fc)
if (retval)
return retval;
}
+ } else if (remount_rw) {
+ /*
+ * We set s_readonly_remount here to protect filesystem's
+ * reconfigure code from writes from userspace until
+ * reconfigure finishes.
+ */
+ sb->s_readonly_remount = 1;
+ smp_wmb();
}
if (fc->ops->reconfigure) {
diff --git a/fs/sync.c b/fs/sync.c
index 67c66358f3fe..034ecb4ac3e3 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -77,7 +77,8 @@ static void sync_inodes_one_sb(struct super_block *sb, void *arg)
static void sync_fs_one_sb(struct super_block *sb, void *arg)
{
- if (!sb_rdonly(sb) && sb->s_op->sync_fs)
+ if (!sb_rdonly(sb) && !(sb->s_iflags & SB_I_SKIP_SYNC) &&
+ sb->s_op->sync_fs)
sb->s_op->sync_fs(sb, *(int *)arg);
}
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 4f7cf975b27c..b69ce0f68bcb 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -429,6 +429,8 @@ struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj,
kn = kernfs_find_and_get(kobj->sd, attr->name);
if (kn)
kernfs_break_active_protection(kn);
+ else
+ kobject_put(kobj);
return kn;
}
EXPORT_SYMBOL_GPL(sysfs_break_active_protection);
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index bcb67b0cabe7..ef9bcfeec21a 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -82,9 +82,6 @@ static inline sysv_zone_t *block_end(struct buffer_head *bh)
return (sysv_zone_t*)((char*)bh->b_data + bh->b_size);
}
-/*
- * Requires read_lock(&pointers_lock) or write_lock(&pointers_lock)
- */
static Indirect *get_branch(struct inode *inode,
int depth,
int offsets[],
@@ -104,15 +101,18 @@ static Indirect *get_branch(struct inode *inode,
bh = sb_bread(sb, block);
if (!bh)
goto failure;
+ read_lock(&pointers_lock);
if (!verify_chain(chain, p))
goto changed;
add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets);
+ read_unlock(&pointers_lock);
if (!p->key)
goto no_block;
}
return NULL;
changed:
+ read_unlock(&pointers_lock);
brelse(bh);
*err = -EAGAIN;
goto no_block;
@@ -145,6 +145,10 @@ static int alloc_branch(struct inode *inode,
*/
parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key);
bh = sb_getblk(inode->i_sb, parent);
+ if (!bh) {
+ sysv_free_block(inode->i_sb, branch[n].key);
+ break;
+ }
lock_buffer(bh);
memset(bh->b_data, 0, blocksize);
branch[n].bh = bh;
@@ -214,9 +218,7 @@ static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *b
goto out;
reread:
- read_lock(&pointers_lock);
partial = get_branch(inode, depth, offsets, chain, &err);
- read_unlock(&pointers_lock);
/* Simplest case - block found, no allocation needed */
if (!partial) {
@@ -286,9 +288,9 @@ static Indirect *find_shared(struct inode *inode,
*top = 0;
for (k = depth; k > 1 && !offsets[k-1]; k--)
;
+ partial = get_branch(inode, k, offsets, chain, &err);
write_lock(&pointers_lock);
- partial = get_branch(inode, k, offsets, chain, &err);
if (!partial)
partial = chain + k-1;
/*
@@ -438,7 +440,7 @@ static unsigned sysv_nblocks(struct super_block *s, loff_t size)
res += blocks;
direct = 1;
}
- return blocks;
+ return res;
}
int sysv_getattr(const struct path *path, struct kstat *stat,
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 7878f145bf1b..90ce45c8ecac 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -139,6 +139,8 @@ struct tracefs_mount_opts {
kuid_t uid;
kgid_t gid;
umode_t mode;
+ /* Opt_* bitfield. */
+ unsigned int opts;
};
enum {
@@ -239,6 +241,7 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
kgid_t gid;
char *p;
+ opts->opts = 0;
opts->mode = TRACEFS_DEFAULT_MODE;
while ((p = strsep(&data, ",")) != NULL) {
@@ -273,24 +276,36 @@ static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
* but traditionally tracefs has ignored all mount options
*/
}
+
+ opts->opts |= BIT(token);
}
return 0;
}
-static int tracefs_apply_options(struct super_block *sb)
+static int tracefs_apply_options(struct super_block *sb, bool remount)
{
struct tracefs_fs_info *fsi = sb->s_fs_info;
struct inode *inode = sb->s_root->d_inode;
struct tracefs_mount_opts *opts = &fsi->mount_opts;
- inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= opts->mode;
+ /*
+ * On remount, only reset mode/uid/gid if they were provided as mount
+ * options.
+ */
- inode->i_uid = opts->uid;
+ if (!remount || opts->opts & BIT(Opt_mode)) {
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= opts->mode;
+ }
- /* Set all the group ids to the mount option */
- set_gid(sb->s_root, opts->gid);
+ if (!remount || opts->opts & BIT(Opt_uid))
+ inode->i_uid = opts->uid;
+
+ if (!remount || opts->opts & BIT(Opt_gid)) {
+ /* Set all the group ids to the mount option */
+ set_gid(sb->s_root, opts->gid);
+ }
return 0;
}
@@ -305,7 +320,7 @@ static int tracefs_remount(struct super_block *sb, int *flags, char *data)
if (err)
goto fail;
- tracefs_apply_options(sb);
+ tracefs_apply_options(sb, true);
fail:
return err;
@@ -357,7 +372,7 @@ static int trace_fill_super(struct super_block *sb, void *data, int silent)
sb->s_op = &tracefs_super_operations;
- tracefs_apply_options(sb);
+ tracefs_apply_options(sb, false);
return 0;
@@ -536,6 +551,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent,
*/
struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
{
+ if (security_locked_down(LOCKDOWN_TRACEFS))
+ return NULL;
+
return __create_dir(name, parent, &simple_dir_inode_operations);
}
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index c0b84e960b20..9cb05ef9b9dd 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -212,11 +212,10 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs)
subtract_lebs += 1;
/*
- * The GC journal head LEB is not really accessible. And since
- * different write types go to different heads, we may count only on
- * one head's space.
+ * Since different write types go to different heads, we should
+ * reserve one leb for each head.
*/
- subtract_lebs += c->jhead_cnt - 1;
+ subtract_lebs += c->jhead_cnt;
/* We also reserve one LEB for deletions, which bypass budgeting */
subtract_lebs += 1;
@@ -403,7 +402,7 @@ static int calc_dd_growth(const struct ubifs_info *c,
dd_growth = req->dirtied_page ? c->bi.page_budget : 0;
if (req->dirtied_ino)
- dd_growth += c->bi.inode_budget << (req->dirtied_ino - 1);
+ dd_growth += c->bi.inode_budget * req->dirtied_ino;
if (req->mod_dent)
dd_growth += c->bi.dent_budget;
dd_growth += req->dirtied_ino_d;
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index ad292c5a43a9..b5cdac9b0368 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -552,11 +552,11 @@ out:
*/
int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
{
- int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
+ int lnum, offs, len, err = 0, last_level, child_cnt;
int first = 1, iip;
struct ubifs_debug_info *d = c->dbg;
- union ubifs_key uninitialized_var(lower_key), upper_key, l_key, u_key;
- unsigned long long uninitialized_var(last_sqnum);
+ union ubifs_key lower_key, upper_key, l_key, u_key;
+ unsigned long long last_sqnum;
struct ubifs_idx_node *idx;
struct list_head list;
struct idx_node *i;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 83a173feb698..c63baff39ba9 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -433,6 +433,7 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry,
mutex_unlock(&dir_ui->ui_mutex);
ubifs_release_budget(c, &req);
+ fscrypt_free_filename(&nm);
return 0;
@@ -1125,7 +1126,6 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
int err, sz_change, len = strlen(symname);
struct fscrypt_str disk_link;
struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
- .new_ino_d = ALIGN(len, 8),
.dirtied_ino = 1 };
struct fscrypt_name nm;
@@ -1141,6 +1141,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
* Budget request settings: new inode, new direntry and changing parent
* directory inode.
*/
+ req.new_ino_d = ALIGN(disk_link.len - 1, 8);
err = ubifs_budget_space(c, &req);
if (err)
return err;
@@ -1205,6 +1206,8 @@ out_cancel:
dir_ui->ui_size = dir->i_size;
mutex_unlock(&dir_ui->ui_mutex);
out_inode:
+ /* Free inode->i_link before inode is marked as bad. */
+ fscrypt_free_inode(inode);
make_bad_inode(inode);
iput(inode);
out_fname:
@@ -1277,7 +1280,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
struct ubifs_budget_req ino_req = { .dirtied_ino = 1,
.dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) };
struct timespec64 time;
- unsigned int uninitialized_var(saved_nlink);
+ unsigned int saved_nlink;
struct fscrypt_name old_nm, new_nm;
/*
@@ -1296,6 +1299,8 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry,
if (unlink) {
ubifs_assert(c, inode_is_locked(new_inode));
+ /* Budget for old inode's data when its nlink > 1. */
+ req.dirtied_ino_d = ALIGN(ubifs_inode(new_inode)->data_len, 8);
err = ubifs_purge_xattrs(new_inode);
if (err)
return err;
@@ -1538,6 +1543,10 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
return err;
}
+ err = ubifs_budget_space(c, &req);
+ if (err)
+ goto out;
+
lock_4_inodes(old_dir, new_dir, NULL, NULL);
time = current_time(old_dir);
@@ -1563,6 +1572,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry,
unlock_4_inodes(old_dir, new_dir, NULL, NULL);
ubifs_release_budget(c, &req);
+out:
fscrypt_free_filename(&fst_nm);
fscrypt_free_filename(&snd_nm);
return err;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 6069c63d833a..ebad140f8d05 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -222,7 +222,7 @@ static int write_begin_slow(struct address_space *mapping,
struct ubifs_info *c = inode->i_sb->s_fs_info;
pgoff_t index = pos >> PAGE_SHIFT;
struct ubifs_budget_req req = { .new_page = 1 };
- int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+ int err, appending = !!(pos + len > inode->i_size);
struct page *page;
dbg_gen("ino %lu, pos %llu, len %u, i_size %lld",
@@ -262,9 +262,6 @@ static int write_begin_slow(struct address_space *mapping,
return err;
}
}
-
- SetPageUptodate(page);
- ClearPageError(page);
}
if (PagePrivate(page))
@@ -426,7 +423,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_inode *ui = ubifs_inode(inode);
pgoff_t index = pos >> PAGE_SHIFT;
- int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+ int err, appending = !!(pos + len > inode->i_size);
int skipped_read = 0;
struct page *page;
@@ -463,9 +460,6 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
return err;
}
}
-
- SetPageUptodate(page);
- ClearPageError(page);
}
err = allocate_budget(c, page, ui, appending);
@@ -475,10 +469,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
* If we skipped reading the page because we were going to
* write all of it, then it is not up to date.
*/
- if (skipped_read) {
+ if (skipped_read)
ClearPageChecked(page);
- ClearPageUptodate(page);
- }
/*
* Budgeting failed which means it would have to force
* write-back but didn't, because we set the @fast flag in the
@@ -569,6 +561,9 @@ static int ubifs_write_end(struct file *file, struct address_space *mapping,
goto out;
}
+ if (len == PAGE_SIZE)
+ SetPageUptodate(page);
+
if (!PagePrivate(page)) {
SetPagePrivate(page);
atomic_long_inc(&c->dirty_pg_cnt);
@@ -1031,7 +1026,7 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
if (page->index >= synced_i_size >> PAGE_SHIFT) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
- goto out_unlock;
+ goto out_redirty;
/*
* The inode has been written, but the write-buffer has
* not been synchronized, so in case of an unclean
@@ -1059,11 +1054,17 @@ static int ubifs_writepage(struct page *page, struct writeback_control *wbc)
if (i_size > synced_i_size) {
err = inode->i_sb->s_op->write_inode(inode, NULL);
if (err)
- goto out_unlock;
+ goto out_redirty;
}
return do_writepage(page, len);
-
+out_redirty:
+ /*
+ * redirty_page_for_writepage() won't call ubifs_dirty_inode() because
+ * it passes I_DIRTY_PAGES flag while calling __mark_inode_dirty(), so
+ * there is no need to do space budget for dirty inode.
+ */
+ redirty_page_for_writepage(wbc, page);
out_unlock:
unlock_page(page);
return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f78c3e3ef931..99f391293c5e 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -1225,7 +1225,7 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir,
int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ;
int last_reference = !!(new_inode && new_inode->i_nlink == 0);
int move = (old_dir != new_dir);
- struct ubifs_inode *uninitialized_var(new_ui);
+ struct ubifs_inode *new_ui;
u8 hash_old_dir[UBIFS_HASH_ARR_SZ];
u8 hash_new_dir[UBIFS_HASH_ARR_SZ];
u8 hash_new_inode[UBIFS_HASH_ARR_SZ];
@@ -1511,7 +1511,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
union ubifs_key key, to_key;
struct ubifs_ino_node *ino;
struct ubifs_trun_node *trun;
- struct ubifs_data_node *uninitialized_var(dn);
+ struct ubifs_data_node *dn;
int err, dlen, len, lnum, offs, bit, sz, sync = IS_SYNC(inode);
struct ubifs_inode *ui = ubifs_inode(inode);
ino_t inum = inode->i_ino;
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index e21abf250951..6e0a153b7194 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -275,7 +275,7 @@ uint32_t ubifs_unpack_bits(const struct ubifs_info *c, uint8_t **addr, int *pos,
const int k = 32 - nrbits;
uint8_t *p = *addr;
int b = *pos;
- uint32_t uninitialized_var(val);
+ uint32_t val;
const int bytes = (nrbits + b + 7) >> 3;
ubifs_assert(c, nrbits > 0);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index b37c6b1e3325..29526040ad77 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -815,7 +815,7 @@ static int alloc_wbufs(struct ubifs_info *c)
INIT_LIST_HEAD(&c->jheads[i].buds_list);
err = ubifs_wbuf_init(c, &c->jheads[i].wbuf);
if (err)
- return err;
+ goto out_wbuf;
c->jheads[i].wbuf.sync_callback = &bud_wbuf_callback;
c->jheads[i].wbuf.jhead = i;
@@ -823,7 +823,7 @@ static int alloc_wbufs(struct ubifs_info *c)
c->jheads[i].log_hash = ubifs_hash_get_desc(c);
if (IS_ERR(c->jheads[i].log_hash)) {
err = PTR_ERR(c->jheads[i].log_hash);
- goto out;
+ goto out_log_hash;
}
}
@@ -836,9 +836,18 @@ static int alloc_wbufs(struct ubifs_info *c)
return 0;
-out:
- while (i--)
+out_log_hash:
+ kfree(c->jheads[i].wbuf.buf);
+ kfree(c->jheads[i].wbuf.inodes);
+
+out_wbuf:
+ while (i--) {
+ kfree(c->jheads[i].wbuf.buf);
+ kfree(c->jheads[i].wbuf.inodes);
kfree(c->jheads[i].log_hash);
+ }
+ kfree(c->jheads);
+ c->jheads = NULL;
return err;
}
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 33742ee3945b..c8b99a86b63f 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -44,6 +44,33 @@ enum {
NOT_ON_MEDIA = 3,
};
+static void do_insert_old_idx(struct ubifs_info *c,
+ struct ubifs_old_idx *old_idx)
+{
+ struct ubifs_old_idx *o;
+ struct rb_node **p, *parent = NULL;
+
+ p = &c->old_idx.rb_node;
+ while (*p) {
+ parent = *p;
+ o = rb_entry(parent, struct ubifs_old_idx, rb);
+ if (old_idx->lnum < o->lnum)
+ p = &(*p)->rb_left;
+ else if (old_idx->lnum > o->lnum)
+ p = &(*p)->rb_right;
+ else if (old_idx->offs < o->offs)
+ p = &(*p)->rb_left;
+ else if (old_idx->offs > o->offs)
+ p = &(*p)->rb_right;
+ else {
+ ubifs_err(c, "old idx added twice!");
+ kfree(old_idx);
+ }
+ }
+ rb_link_node(&old_idx->rb, parent, p);
+ rb_insert_color(&old_idx->rb, &c->old_idx);
+}
+
/**
* insert_old_idx - record an index node obsoleted since the last commit start.
* @c: UBIFS file-system description object
@@ -69,35 +96,15 @@ enum {
*/
static int insert_old_idx(struct ubifs_info *c, int lnum, int offs)
{
- struct ubifs_old_idx *old_idx, *o;
- struct rb_node **p, *parent = NULL;
+ struct ubifs_old_idx *old_idx;
old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS);
if (unlikely(!old_idx))
return -ENOMEM;
old_idx->lnum = lnum;
old_idx->offs = offs;
+ do_insert_old_idx(c, old_idx);
- p = &c->old_idx.rb_node;
- while (*p) {
- parent = *p;
- o = rb_entry(parent, struct ubifs_old_idx, rb);
- if (lnum < o->lnum)
- p = &(*p)->rb_left;
- else if (lnum > o->lnum)
- p = &(*p)->rb_right;
- else if (offs < o->offs)
- p = &(*p)->rb_left;
- else if (offs > o->offs)
- p = &(*p)->rb_right;
- else {
- ubifs_err(c, "old idx added twice!");
- kfree(old_idx);
- return 0;
- }
- }
- rb_link_node(&old_idx->rb, parent, p);
- rb_insert_color(&old_idx->rb, &c->old_idx);
return 0;
}
@@ -199,23 +206,6 @@ static struct ubifs_znode *copy_znode(struct ubifs_info *c,
__set_bit(DIRTY_ZNODE, &zn->flags);
__clear_bit(COW_ZNODE, &zn->flags);
- ubifs_assert(c, !ubifs_zn_obsolete(znode));
- __set_bit(OBSOLETE_ZNODE, &znode->flags);
-
- if (znode->level != 0) {
- int i;
- const int n = zn->child_cnt;
-
- /* The children now have new parent */
- for (i = 0; i < n; i++) {
- struct ubifs_zbranch *zbr = &zn->zbranch[i];
-
- if (zbr->znode)
- zbr->znode->parent = zn;
- }
- }
-
- atomic_long_inc(&c->dirty_zn_cnt);
return zn;
}
@@ -234,6 +224,42 @@ static int add_idx_dirt(struct ubifs_info *c, int lnum, int dirt)
}
/**
+ * replace_znode - replace old znode with new znode.
+ * @c: UBIFS file-system description object
+ * @new_zn: new znode
+ * @old_zn: old znode
+ * @zbr: the branch of parent znode
+ *
+ * Replace old znode with new znode in TNC.
+ */
+static void replace_znode(struct ubifs_info *c, struct ubifs_znode *new_zn,
+ struct ubifs_znode *old_zn, struct ubifs_zbranch *zbr)
+{
+ ubifs_assert(c, !ubifs_zn_obsolete(old_zn));
+ __set_bit(OBSOLETE_ZNODE, &old_zn->flags);
+
+ if (old_zn->level != 0) {
+ int i;
+ const int n = new_zn->child_cnt;
+
+ /* The children now have new parent */
+ for (i = 0; i < n; i++) {
+ struct ubifs_zbranch *child = &new_zn->zbranch[i];
+
+ if (child->znode)
+ child->znode->parent = new_zn;
+ }
+ }
+
+ zbr->znode = new_zn;
+ zbr->lnum = 0;
+ zbr->offs = 0;
+ zbr->len = 0;
+
+ atomic_long_inc(&c->dirty_zn_cnt);
+}
+
+/**
* dirty_cow_znode - ensure a znode is not being committed.
* @c: UBIFS file-system description object
* @zbr: branch of znode to check
@@ -265,21 +291,32 @@ static struct ubifs_znode *dirty_cow_znode(struct ubifs_info *c,
return zn;
if (zbr->len) {
- err = insert_old_idx(c, zbr->lnum, zbr->offs);
- if (unlikely(err))
- return ERR_PTR(err);
+ struct ubifs_old_idx *old_idx;
+
+ old_idx = kmalloc(sizeof(struct ubifs_old_idx), GFP_NOFS);
+ if (unlikely(!old_idx)) {
+ err = -ENOMEM;
+ goto out;
+ }
+ old_idx->lnum = zbr->lnum;
+ old_idx->offs = zbr->offs;
+
err = add_idx_dirt(c, zbr->lnum, zbr->len);
- } else
- err = 0;
+ if (err) {
+ kfree(old_idx);
+ goto out;
+ }
- zbr->znode = zn;
- zbr->lnum = 0;
- zbr->offs = 0;
- zbr->len = 0;
+ do_insert_old_idx(c, old_idx);
+ }
+
+ replace_znode(c, zn, znode, zbr);
- if (unlikely(err))
- return ERR_PTR(err);
return zn;
+
+out:
+ kfree(zn);
+ return ERR_PTR(err);
}
/**
@@ -892,7 +929,7 @@ static int fallible_resolve_collision(struct ubifs_info *c,
int adding)
{
struct ubifs_znode *o_znode = NULL, *znode = *zn;
- int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n;
+ int o_n, err, cmp, unsure = 0, nn = *n;
cmp = fallible_matches_name(c, &znode->zbranch[nn], nm);
if (unlikely(cmp < 0))
@@ -1514,8 +1551,8 @@ out:
*/
int ubifs_tnc_get_bu_keys(struct ubifs_info *c, struct bu_info *bu)
{
- int n, err = 0, lnum = -1, uninitialized_var(offs);
- int uninitialized_var(len);
+ int n, err = 0, lnum = -1, offs;
+ int len;
unsigned int block = key_block(c, &bu->key);
struct ubifs_znode *znode;
@@ -3054,6 +3091,21 @@ static void tnc_destroy_cnext(struct ubifs_info *c)
cnext = cnext->cnext;
if (ubifs_zn_obsolete(znode))
kfree(znode);
+ else if (!ubifs_zn_cow(znode)) {
+ /*
+ * Don't forget to update clean znode count after
+ * committing failed, because ubifs will check this
+ * count while closing tnc. Non-obsolete znode could
+ * be re-dirtied during committing process, so dirty
+ * flag is untrustable. The flag 'COW_ZNODE' is set
+ * for each dirty znode before committing, and it is
+ * cleared as long as the znode become clean, so we
+ * can statistic clean znode count according to this
+ * flag.
+ */
+ atomic_long_inc(&c->clean_zn_cnt);
+ atomic_long_inc(&ubifs_clean_zn_cnt);
+ }
} while (cnext && cnext != c->cnext);
}
diff --git a/fs/ubifs/tnc_misc.c b/fs/ubifs/tnc_misc.c
index 49cb34c3f324..ccaf94ea5be3 100644
--- a/fs/ubifs/tnc_misc.c
+++ b/fs/ubifs/tnc_misc.c
@@ -126,8 +126,8 @@ int ubifs_search_zbranch(const struct ubifs_info *c,
const struct ubifs_znode *znode,
const union ubifs_key *key, int *n)
{
- int beg = 0, end = znode->child_cnt, uninitialized_var(mid);
- int uninitialized_var(cmp);
+ int beg = 0, end = znode->child_cnt, mid;
+ int cmp;
const struct ubifs_zbranch *zbr = &znode->zbranch[0];
ubifs_assert(c, end > beg);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b3b7e3576e98..85e84138649b 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1593,8 +1593,13 @@ static inline int ubifs_check_hmac(const struct ubifs_info *c,
return crypto_memneq(expected, got, c->hmac_desc_len);
}
+#ifdef CONFIG_UBIFS_FS_AUTHENTICATION
void ubifs_bad_hash(const struct ubifs_info *c, const void *node,
const u8 *hash, int lnum, int offs);
+#else
+static inline void ubifs_bad_hash(const struct ubifs_info *c, const void *node,
+ const u8 *hash, int lnum, int offs) {};
+#endif
int __ubifs_node_check_hash(const struct ubifs_info *c, const void *buf,
const u8 *expected);
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 02f03fadb75b..f416b7fe092f 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -36,18 +36,41 @@ static int read_block_bitmap(struct super_block *sb,
unsigned long bitmap_nr)
{
struct buffer_head *bh = NULL;
- int retval = 0;
+ int i;
+ int max_bits, off, count;
struct kernel_lb_addr loc;
loc.logicalBlockNum = bitmap->s_extPosition;
loc.partitionReferenceNum = UDF_SB(sb)->s_partition;
bh = udf_tread(sb, udf_get_lb_pblock(sb, &loc, block));
+ bitmap->s_block_bitmap[bitmap_nr] = bh;
if (!bh)
- retval = -EIO;
+ return -EIO;
- bitmap->s_block_bitmap[bitmap_nr] = bh;
- return retval;
+ /* Check consistency of Space Bitmap buffer. */
+ max_bits = sb->s_blocksize * 8;
+ if (!bitmap_nr) {
+ off = sizeof(struct spaceBitmapDesc) << 3;
+ count = min(max_bits - off, bitmap->s_nr_groups);
+ } else {
+ /*
+ * Rough check if bitmap number is too big to have any bitmap
+ * blocks reserved.
+ */
+ if (bitmap_nr >
+ (bitmap->s_nr_groups >> (sb->s_blocksize_bits + 3)) + 2)
+ return 0;
+ off = 0;
+ count = bitmap->s_nr_groups - bitmap_nr * max_bits +
+ (sizeof(struct spaceBitmapDesc) << 3);
+ count = min(count, max_bits);
+ }
+
+ for (i = 0; i < count; i++)
+ if (udf_test_bit(i + off, bh->b_data))
+ return -EFSCORRUPTED;
+ return 0;
}
static int __load_block_bitmap(struct super_block *sb,
@@ -564,7 +587,7 @@ static udf_pblk_t udf_table_new_block(struct super_block *sb,
udf_pblk_t newblock = 0;
uint32_t adsize;
uint32_t elen, goal_elen = 0;
- struct kernel_lb_addr eloc, uninitialized_var(goal_eloc);
+ struct kernel_lb_addr eloc, goal_eloc;
struct extent_position epos, goal_epos;
int8_t etype;
struct udf_inode_info *iinfo = UDF_I(table);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 628941a6b79a..3b3729771915 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -148,26 +148,24 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
down_write(&iinfo->i_data_sem);
- if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
- loff_t end = iocb->ki_pos + iov_iter_count(from);
-
- if (inode->i_sb->s_blocksize <
- (udf_file_entry_alloc_offset(inode) + end)) {
- err = udf_expand_file_adinicb(inode);
- if (err) {
- inode_unlock(inode);
- udf_debug("udf_expand_adinicb: err=%d\n", err);
- return err;
- }
- } else {
- iinfo->i_lenAlloc = max(end, inode->i_size);
- up_write(&iinfo->i_data_sem);
+ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB &&
+ inode->i_sb->s_blocksize < (udf_file_entry_alloc_offset(inode) +
+ iocb->ki_pos + iov_iter_count(from))) {
+ err = udf_expand_file_adinicb(inode);
+ if (err) {
+ inode_unlock(inode);
+ udf_debug("udf_expand_adinicb: err=%d\n", err);
+ return err;
}
} else
up_write(&iinfo->i_data_sem);
retval = __generic_file_write_iter(iocb, from);
out:
+ down_write(&iinfo->i_data_sem);
+ if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB && retval > 0)
+ iinfo->i_lenAlloc = inode->i_size;
+ up_write(&iinfo->i_data_sem);
inode_unlock(inode);
if (retval > 0) {
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 639aabf30eaf..fef6e5e06e3f 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -57,15 +57,15 @@ static int udf_update_inode(struct inode *, int);
static int udf_sync_inode(struct inode *inode);
static int udf_alloc_i_data(struct inode *inode, size_t size);
static sector_t inode_getblk(struct inode *, sector_t, int *, int *);
-static int8_t udf_insert_aext(struct inode *, struct extent_position,
- struct kernel_lb_addr, uint32_t);
+static int udf_insert_aext(struct inode *, struct extent_position,
+ struct kernel_lb_addr, uint32_t);
static void udf_split_extents(struct inode *, int *, int, udf_pblk_t,
struct kernel_long_ad *, int *);
static void udf_prealloc_extents(struct inode *, int, int,
struct kernel_long_ad *, int *);
static void udf_merge_extents(struct inode *, struct kernel_long_ad *, int *);
-static void udf_update_extents(struct inode *, struct kernel_long_ad *, int,
- int, struct extent_position *);
+static int udf_update_extents(struct inode *, struct kernel_long_ad *, int,
+ int, struct extent_position *);
static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
static void __udf_clear_extent_cache(struct inode *inode)
@@ -441,6 +441,12 @@ static int udf_get_block(struct inode *inode, sector_t block,
iinfo->i_next_alloc_goal++;
}
+ /*
+ * Block beyond EOF and prealloc extents? Just discard preallocation
+ * as it is not useful and complicates things.
+ */
+ if (((loff_t)block) << inode->i_blkbits > iinfo->i_lenExtents)
+ udf_discard_prealloc(inode);
udf_clear_extent_cache(inode);
phys = inode_getblk(inode, block, &err, &new);
if (!phys)
@@ -490,8 +496,6 @@ static int udf_do_extend_file(struct inode *inode,
uint32_t add;
int count = 0, fake = !(last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
struct super_block *sb = inode->i_sb;
- struct kernel_lb_addr prealloc_loc = {};
- uint32_t prealloc_len = 0;
struct udf_inode_info *iinfo;
int err;
@@ -512,19 +516,6 @@ static int udf_do_extend_file(struct inode *inode,
~(sb->s_blocksize - 1);
}
- /* Last extent are just preallocated blocks? */
- if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) ==
- EXT_NOT_RECORDED_ALLOCATED) {
- /* Save the extent so that we can reattach it to the end */
- prealloc_loc = last_ext->extLocation;
- prealloc_len = last_ext->extLength;
- /* Mark the extent as a hole */
- last_ext->extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
- (last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
- last_ext->extLocation.logicalBlockNum = 0;
- last_ext->extLocation.partitionReferenceNum = 0;
- }
-
/* Can we merge with the previous extent? */
if ((last_ext->extLength & UDF_EXTENT_FLAG_MASK) ==
EXT_NOT_RECORDED_NOT_ALLOCATED) {
@@ -537,8 +528,10 @@ static int udf_do_extend_file(struct inode *inode,
}
if (fake) {
- udf_add_aext(inode, last_pos, &last_ext->extLocation,
- last_ext->extLength, 1);
+ err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
+ last_ext->extLength, 1);
+ if (err < 0)
+ goto out_err;
count++;
} else {
struct kernel_lb_addr tmploc;
@@ -552,7 +545,7 @@ static int udf_do_extend_file(struct inode *inode,
* more extents, we may need to enter possible following
* empty indirect extent.
*/
- if (new_block_bytes || prealloc_len)
+ if (new_block_bytes)
udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
}
@@ -572,7 +565,7 @@ static int udf_do_extend_file(struct inode *inode,
err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
if (err)
- return err;
+ goto out_err;
count++;
}
if (new_block_bytes) {
@@ -581,22 +574,11 @@ static int udf_do_extend_file(struct inode *inode,
err = udf_add_aext(inode, last_pos, &last_ext->extLocation,
last_ext->extLength, 1);
if (err)
- return err;
+ goto out_err;
count++;
}
out:
- /* Do we have some preallocated blocks saved? */
- if (prealloc_len) {
- err = udf_add_aext(inode, last_pos, &prealloc_loc,
- prealloc_len, 1);
- if (err)
- return err;
- last_ext->extLocation = prealloc_loc;
- last_ext->extLength = prealloc_len;
- count++;
- }
-
/* last_pos should point to the last written extent... */
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
last_pos->offset -= sizeof(struct short_ad);
@@ -606,19 +588,28 @@ out:
return -EIO;
return count;
+out_err:
+ /* Remove extents we've created so far */
+ udf_clear_extent_cache(inode);
+ udf_truncate_extents(inode);
+ return err;
}
/* Extend the final block of the file to final_block_len bytes */
static void udf_do_extend_final_block(struct inode *inode,
struct extent_position *last_pos,
struct kernel_long_ad *last_ext,
- uint32_t final_block_len)
+ uint32_t new_elen)
{
- struct super_block *sb = inode->i_sb;
uint32_t added_bytes;
- added_bytes = final_block_len -
- (last_ext->extLength & (sb->s_blocksize - 1));
+ /*
+ * Extent already large enough? It may be already rounded up to block
+ * size...
+ */
+ if (new_elen <= (last_ext->extLength & UDF_EXTENT_LENGTH_MASK))
+ return;
+ added_bytes = new_elen - (last_ext->extLength & UDF_EXTENT_LENGTH_MASK);
last_ext->extLength += added_bytes;
UDF_I(inode)->i_lenExtents += added_bytes;
@@ -635,12 +626,12 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
int8_t etype;
struct super_block *sb = inode->i_sb;
sector_t first_block = newsize >> sb->s_blocksize_bits, offset;
- unsigned long partial_final_block;
+ loff_t new_elen;
int adsize;
struct udf_inode_info *iinfo = UDF_I(inode);
struct kernel_long_ad extent;
int err = 0;
- int within_final_block;
+ bool within_last_ext;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
adsize = sizeof(struct short_ad);
@@ -649,8 +640,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
else
BUG();
+ /*
+ * When creating hole in file, just don't bother with preserving
+ * preallocation. It likely won't be very useful anyway.
+ */
+ udf_discard_prealloc(inode);
+
etype = inode_bmap(inode, first_block, &epos, &eloc, &elen, &offset);
- within_final_block = (etype != -1);
+ within_last_ext = (etype != -1);
+ /* We don't expect extents past EOF... */
+ WARN_ON_ONCE(within_last_ext &&
+ elen > ((loff_t)offset + 1) << inode->i_blkbits);
if ((!epos.bh && epos.offset == udf_file_entry_alloc_offset(inode)) ||
(epos.bh && epos.offset == sizeof(struct allocExtDesc))) {
@@ -666,19 +666,17 @@ static int udf_extend_file(struct inode *inode, loff_t newsize)
extent.extLength |= etype << 30;
}
- partial_final_block = newsize & (sb->s_blocksize - 1);
+ new_elen = ((loff_t)offset << inode->i_blkbits) |
+ (newsize & (sb->s_blocksize - 1));
/* File has extent covering the new size (could happen when extending
* inside a block)?
*/
- if (within_final_block) {
+ if (within_last_ext) {
/* Extending file within the last file block */
- udf_do_extend_final_block(inode, &epos, &extent,
- partial_final_block);
+ udf_do_extend_final_block(inode, &epos, &extent, new_elen);
} else {
- loff_t add = ((loff_t)offset << sb->s_blocksize_bits) |
- partial_final_block;
- err = udf_do_extend_file(inode, &epos, &extent, add);
+ err = udf_do_extend_file(inode, &epos, &extent, new_elen);
}
if (err < 0)
@@ -700,7 +698,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
struct kernel_lb_addr eloc, tmpeloc;
int c = 1;
loff_t lbcount = 0, b_off = 0;
- udf_pblk_t newblocknum, newblock;
+ udf_pblk_t newblocknum, newblock = 0;
sector_t offset = 0;
int8_t etype;
struct udf_inode_info *iinfo = UDF_I(inode);
@@ -779,10 +777,11 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
goto out_free;
}
- /* Are we beyond EOF? */
+ /* Are we beyond EOF and preallocated extent? */
if (etype == -1) {
int ret;
loff_t hole_len;
+
isBeyondEOF = true;
if (count) {
if (c)
@@ -802,25 +801,22 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
ret = udf_do_extend_file(inode, &prev_epos, laarr, hole_len);
if (ret < 0) {
*err = ret;
- newblock = 0;
goto out_free;
}
c = 0;
offset = 0;
count += ret;
- /* We are not covered by a preallocated extent? */
- if ((laarr[0].extLength & UDF_EXTENT_FLAG_MASK) !=
- EXT_NOT_RECORDED_ALLOCATED) {
- /* Is there any real extent? - otherwise we overwrite
- * the fake one... */
- if (count)
- c = !c;
- laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
- inode->i_sb->s_blocksize;
- memset(&laarr[c].extLocation, 0x00,
- sizeof(struct kernel_lb_addr));
- count++;
- }
+ /*
+ * Is there any real extent? - otherwise we overwrite the fake
+ * one...
+ */
+ if (count)
+ c = !c;
+ laarr[c].extLength = EXT_NOT_RECORDED_NOT_ALLOCATED |
+ inode->i_sb->s_blocksize;
+ memset(&laarr[c].extLocation, 0x00,
+ sizeof(struct kernel_lb_addr));
+ count++;
endnum = c + 1;
lastblock = 1;
} else {
@@ -867,7 +863,6 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
goal, err);
if (!newblocknum) {
*err = -ENOSPC;
- newblock = 0;
goto out_free;
}
if (isBeyondEOF)
@@ -893,7 +888,9 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
/* write back the new extents, inserting new extents if the new number
* of extents is greater than the old number, and deleting extents if
* the new number of extents is less than the old number */
- udf_update_extents(inode, laarr, startnum, endnum, &prev_epos);
+ *err = udf_update_extents(inode, laarr, startnum, endnum, &prev_epos);
+ if (*err < 0)
+ goto out_free;
newblock = udf_get_pblock(inode->i_sb, newblocknum,
iinfo->i_location.partitionReferenceNum, 0);
@@ -1097,23 +1094,8 @@ static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr,
blocksize - 1) >> blocksize_bits)))) {
if (((li->extLength & UDF_EXTENT_LENGTH_MASK) +
- (lip1->extLength & UDF_EXTENT_LENGTH_MASK) +
- blocksize - 1) & ~UDF_EXTENT_LENGTH_MASK) {
- lip1->extLength = (lip1->extLength -
- (li->extLength &
- UDF_EXTENT_LENGTH_MASK) +
- UDF_EXTENT_LENGTH_MASK) &
- ~(blocksize - 1);
- li->extLength = (li->extLength &
- UDF_EXTENT_FLAG_MASK) +
- (UDF_EXTENT_LENGTH_MASK + 1) -
- blocksize;
- lip1->extLocation.logicalBlockNum =
- li->extLocation.logicalBlockNum +
- ((li->extLength &
- UDF_EXTENT_LENGTH_MASK) >>
- blocksize_bits);
- } else {
+ (lip1->extLength & UDF_EXTENT_LENGTH_MASK) +
+ blocksize - 1) <= UDF_EXTENT_LENGTH_MASK) {
li->extLength = lip1->extLength +
(((li->extLength &
UDF_EXTENT_LENGTH_MASK) +
@@ -1176,21 +1158,30 @@ static void udf_merge_extents(struct inode *inode, struct kernel_long_ad *laarr,
}
}
-static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr,
- int startnum, int endnum,
- struct extent_position *epos)
+static int udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr,
+ int startnum, int endnum,
+ struct extent_position *epos)
{
int start = 0, i;
struct kernel_lb_addr tmploc;
uint32_t tmplen;
+ int err;
if (startnum > endnum) {
for (i = 0; i < (startnum - endnum); i++)
udf_delete_aext(inode, *epos);
} else if (startnum < endnum) {
for (i = 0; i < (endnum - startnum); i++) {
- udf_insert_aext(inode, *epos, laarr[i].extLocation,
- laarr[i].extLength);
+ err = udf_insert_aext(inode, *epos,
+ laarr[i].extLocation,
+ laarr[i].extLength);
+ /*
+ * If we fail here, we are likely corrupting the extent
+ * list and leaking blocks. At least stop early to
+ * limit the damage.
+ */
+ if (err < 0)
+ return err;
udf_next_aext(inode, epos, &laarr[i].extLocation,
&laarr[i].extLength, 1);
start++;
@@ -1202,6 +1193,7 @@ static void udf_update_extents(struct inode *inode, struct kernel_long_ad *laarr
udf_write_aext(inode, epos, &laarr[i].extLocation,
laarr[i].extLength, 1);
}
+ return 0;
}
struct buffer_head *udf_bread(struct inode *inode, udf_pblk_t block,
@@ -1404,6 +1396,7 @@ reread:
ret = -EIO;
goto out;
}
+ iinfo->i_hidden = hidden_inode;
iinfo->i_unique = 0;
iinfo->i_lenEAttr = 0;
iinfo->i_lenExtents = 0;
@@ -1739,8 +1732,12 @@ static int udf_update_inode(struct inode *inode, int do_sync)
if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0)
fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1);
- else
- fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
+ else {
+ if (iinfo->i_hidden)
+ fe->fileLinkCount = cpu_to_le16(0);
+ else
+ fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
+ }
fe->informationLength = cpu_to_le64(inode->i_size);
@@ -1911,8 +1908,13 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
if (!inode)
return ERR_PTR(-ENOMEM);
- if (!(inode->i_state & I_NEW))
+ if (!(inode->i_state & I_NEW)) {
+ if (UDF_I(inode)->i_hidden != hidden_inode) {
+ iput(inode);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
return inode;
+ }
memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
err = udf_read_inode(inode, hidden_inode);
@@ -2226,12 +2228,13 @@ int8_t udf_current_aext(struct inode *inode, struct extent_position *epos,
return etype;
}
-static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
- struct kernel_lb_addr neloc, uint32_t nelen)
+static int udf_insert_aext(struct inode *inode, struct extent_position epos,
+ struct kernel_lb_addr neloc, uint32_t nelen)
{
struct kernel_lb_addr oeloc;
uint32_t oelen;
int8_t etype;
+ int err;
if (epos.bh)
get_bh(epos.bh);
@@ -2241,10 +2244,10 @@ static int8_t udf_insert_aext(struct inode *inode, struct extent_position epos,
neloc = oeloc;
nelen = (etype << 30) | oelen;
}
- udf_add_aext(inode, &epos, &neloc, nelen, 1);
+ err = udf_add_aext(inode, &epos, &neloc, nelen, 1);
brelse(epos.bh);
- return (nelen >> 30);
+ return err;
}
int8_t udf_delete_aext(struct inode *inode, struct extent_position epos)
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 3c009562375d..c062b41a1e70 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -241,7 +241,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
poffset - lfi);
else {
if (!copy_name) {
- copy_name = kmalloc(UDF_NAME_LEN,
+ copy_name = kmalloc(UDF_NAME_LEN_CS0,
GFP_NOFS);
if (!copy_name) {
fi = ERR_PTR(-ENOMEM);
@@ -1091,8 +1091,9 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
return -EINVAL;
ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
- if (IS_ERR(ofi)) {
- retval = PTR_ERR(ofi);
+ if (!ofi || IS_ERR(ofi)) {
+ if (IS_ERR(ofi))
+ retval = PTR_ERR(ofi);
goto end_rename;
}
@@ -1101,8 +1102,7 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
brelse(ofibh.sbh);
tloc = lelb_to_cpu(ocfi.icb.extLocation);
- if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0)
- != old_inode->i_ino)
+ if (udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino)
goto end_rename;
nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 5193b94c0683..0f8b3cb35585 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -147,6 +147,7 @@ static struct inode *udf_alloc_inode(struct super_block *sb)
ei->i_next_alloc_goal = 0;
ei->i_strat4096 = 0;
ei->i_streamdir = 0;
+ ei->i_hidden = 0;
init_rwsem(&ei->i_data_sem);
ei->cached_extent.lstart = -1;
spin_lock_init(&ei->i_extent_cache_lock);
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index 63a47f1e1d52..fd3e4a8671f0 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -120,60 +120,42 @@ void udf_truncate_tail_extent(struct inode *inode)
void udf_discard_prealloc(struct inode *inode)
{
- struct extent_position epos = { NULL, 0, {0, 0} };
+ struct extent_position epos = {};
+ struct extent_position prev_epos = {};
struct kernel_lb_addr eloc;
uint32_t elen;
uint64_t lbcount = 0;
int8_t etype = -1, netype;
- int adsize;
struct udf_inode_info *iinfo = UDF_I(inode);
+ int bsize = 1 << inode->i_blkbits;
if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB ||
- inode->i_size == iinfo->i_lenExtents)
+ ALIGN(inode->i_size, bsize) == ALIGN(iinfo->i_lenExtents, bsize))
return;
- if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
- adsize = sizeof(struct short_ad);
- else if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
- adsize = sizeof(struct long_ad);
- else
- adsize = 0;
-
epos.block = iinfo->i_location;
/* Find the last extent in the file */
- while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 1)) != -1) {
- etype = netype;
+ while ((netype = udf_next_aext(inode, &epos, &eloc, &elen, 0)) != -1) {
+ brelse(prev_epos.bh);
+ prev_epos = epos;
+ if (prev_epos.bh)
+ get_bh(prev_epos.bh);
+
+ etype = udf_next_aext(inode, &epos, &eloc, &elen, 1);
lbcount += elen;
}
if (etype == (EXT_NOT_RECORDED_ALLOCATED >> 30)) {
- epos.offset -= adsize;
lbcount -= elen;
- extent_trunc(inode, &epos, &eloc, etype, elen, 0);
- if (!epos.bh) {
- iinfo->i_lenAlloc =
- epos.offset -
- udf_file_entry_alloc_offset(inode);
- mark_inode_dirty(inode);
- } else {
- struct allocExtDesc *aed =
- (struct allocExtDesc *)(epos.bh->b_data);
- aed->lengthAllocDescs =
- cpu_to_le32(epos.offset -
- sizeof(struct allocExtDesc));
- if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
- UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
- udf_update_tag(epos.bh->b_data, epos.offset);
- else
- udf_update_tag(epos.bh->b_data,
- sizeof(struct allocExtDesc));
- mark_buffer_dirty_inode(epos.bh, inode);
- }
+ udf_delete_aext(inode, prev_epos);
+ udf_free_blocks(inode->i_sb, inode, &eloc, 0,
+ DIV_ROUND_UP(elen, 1 << inode->i_blkbits));
}
/* This inode entry is in-memory only and thus we don't have to mark
* the inode dirty */
iinfo->i_lenExtents = lbcount;
brelse(epos.bh);
+ brelse(prev_epos.bh);
}
static void udf_update_alloc_ext_desc(struct inode *inode,
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index 4245d1f63258..0372d0d14f72 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -44,7 +44,8 @@ struct udf_inode_info {
unsigned i_use : 1; /* unallocSpaceEntry */
unsigned i_strat4096 : 1;
unsigned i_streamdir : 1;
- unsigned reserved : 25;
+ unsigned i_hidden : 1; /* hidden system inode */
+ unsigned reserved : 24;
union {
struct short_ad *i_sad;
struct long_ad *i_lad;
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 8eace7a633d3..d91cbfb08983 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -51,6 +51,8 @@
#define MF_DUPLICATE_MD 0x01
#define MF_MIRROR_FE_LOADED 0x02
+#define EFSCORRUPTED EUCLEAN
+
struct udf_meta_data {
__u32 s_meta_file_loc;
__u32 s_mirror_file_loc;
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index 622569007b53..2142cbd1dde2 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -247,7 +247,7 @@ static int udf_name_from_CS0(struct super_block *sb,
}
if (translate) {
- if (str_o_len <= 2 && str_o[0] == '.' &&
+ if (str_o_len > 0 && str_o_len <= 2 && str_o[0] == '.' &&
(str_o_len == 1 || str_o[1] == '.'))
needsCRC = 1;
if (needsCRC) {
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index ec57bbb6bb05..740853465356 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1018,7 +1018,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *ctx,
int fd;
fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new,
- O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
+ O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
if (fd < 0)
return fd;
@@ -1969,7 +1969,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
mmgrab(ctx->mm);
fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
- O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
+ O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS));
if (fd < 0) {
mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
diff --git a/fs/verity/enable.c b/fs/verity/enable.c
index 1370bfd17e87..39459b1eff75 100644
--- a/fs/verity/enable.c
+++ b/fs/verity/enable.c
@@ -350,25 +350,27 @@ int fsverity_ioctl_enable(struct file *filp, const void __user *uarg)
goto out_drop_write;
err = enable_verity(filp, &arg);
- if (err)
- goto out_allow_write_access;
/*
- * Some pages of the file may have been evicted from pagecache after
- * being used in the Merkle tree construction, then read into pagecache
- * again by another process reading from the file concurrently. Since
- * these pages didn't undergo verification against the file measurement
- * which fs-verity now claims to be enforcing, we have to wipe the
- * pagecache to ensure that all future reads are verified.
+ * We no longer drop the inode's pagecache after enabling verity. This
+ * used to be done to try to avoid a race condition where pages could be
+ * evicted after being used in the Merkle tree construction, then
+ * re-instantiated by a concurrent read. Such pages are unverified, and
+ * the backing storage could have filled them with different content, so
+ * they shouldn't be used to fulfill reads once verity is enabled.
+ *
+ * But, dropping the pagecache has a big performance impact, and it
+ * doesn't fully solve the race condition anyway. So for those reasons,
+ * and also because this race condition isn't very important relatively
+ * speaking (especially for small-ish files, where the chance of a page
+ * being used, evicted, *and* re-instantiated all while enabling verity
+ * is quite small), we no longer drop the inode's pagecache.
*/
- filemap_write_and_wait(inode->i_mapping);
- invalidate_inode_pages2(inode->i_mapping);
/*
* allow_write_access() is needed to pair with deny_write_access().
* Regardless, the filesystem won't allow writing to verity files.
*/
-out_allow_write_access:
allow_write_access(filp);
out_drop_write:
mnt_drop_write_file(filp);
diff --git a/fs/verity/signature.c b/fs/verity/signature.c
index c8b255232de5..cd8cbc895a4f 100644
--- a/fs/verity/signature.c
+++ b/fs/verity/signature.c
@@ -58,6 +58,22 @@ int fsverity_verify_signature(const struct fsverity_info *vi,
return -EBADMSG;
}
+ if (fsverity_keyring->keys.nr_leaves_on_tree == 0) {
+ /*
+ * The ".fs-verity" keyring is empty, due to builtin signatures
+ * being supported by the kernel but not actually being used.
+ * In this case, verify_pkcs7_signature() would always return an
+ * error, usually ENOKEY. It could also be EBADMSG if the
+ * PKCS#7 is malformed, but that isn't very important to
+ * distinguish. So, just skip to ENOKEY to avoid the attack
+ * surface of the PKCS#7 parser, which would otherwise be
+ * reachable by any task able to execute FS_IOC_ENABLE_VERITY.
+ */
+ fsverity_err(inode,
+ "fs-verity keyring is empty, rejecting signed file!");
+ return -ENOKEY;
+ }
+
d = kzalloc(sizeof(*d) + hash_alg->digest_size, GFP_KERNEL);
if (!d)
return -ENOMEM;
diff --git a/fs/verity/verify.c b/fs/verity/verify.c
index 3e8f2de44667..a7e24f949e3d 100644
--- a/fs/verity/verify.c
+++ b/fs/verity/verify.c
@@ -259,15 +259,15 @@ EXPORT_SYMBOL_GPL(fsverity_enqueue_verify_work);
int __init fsverity_init_workqueue(void)
{
/*
- * Use an unbound workqueue to allow bios to be verified in parallel
- * even when they happen to complete on the same CPU. This sacrifices
- * locality, but it's worthwhile since hashing is CPU-intensive.
+ * Use a high-priority workqueue to prioritize verification work, which
+ * blocks reads from completing, over regular application tasks.
*
- * Also use a high-priority workqueue to prioritize verification work,
- * which blocks reads from completing, over regular application tasks.
+ * For performance reasons, don't use an unbound workqueue. Using an
+ * unbound workqueue for crypto operations causes excessive scheduler
+ * latency on ARM64.
*/
fsverity_read_workqueue = alloc_workqueue("fsverity_read_queue",
- WQ_UNBOUND | WQ_HIGHPRI,
+ WQ_HIGHPRI,
num_online_cpus());
if (!fsverity_read_workqueue)
return -ENOMEM;
diff --git a/fs/xattr.c b/fs/xattr.c
index 494313bce9cc..abd49f462a7d 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -1014,7 +1014,7 @@ static int xattr_list_one(char **buffer, ssize_t *remaining_size,
ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
char *buffer, size_t size)
{
- bool trusted = capable(CAP_SYS_ADMIN);
+ bool trusted = ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN);
struct simple_xattr *xattr;
ssize_t remaining_size = size;
int err = 0;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 436f686a9891..1193fd6e4bad 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -684,8 +684,10 @@ xfs_alloc_update_counters(
xfs_trans_agblocks_delta(tp, len);
if (unlikely(be32_to_cpu(agf->agf_freeblks) >
- be32_to_cpu(agf->agf_length)))
+ be32_to_cpu(agf->agf_length))) {
+ xfs_buf_mark_corrupt(agbp);
return -EFSCORRUPTED;
+ }
xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
return 0;
@@ -751,6 +753,7 @@ xfs_alloc_ag_vextent_small(
bp = xfs_btree_get_bufs(args->mp, args->tp, args->agno, fbno);
if (!bp) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, args->mp);
error = -EFSCORRUPTED;
goto error;
}
@@ -1995,24 +1998,32 @@ xfs_alloc_longest_free_extent(
return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
}
+/*
+ * Compute the minimum length of the AGFL in the given AG. If @pag is NULL,
+ * return the largest possible minimum length.
+ */
unsigned int
xfs_alloc_min_freelist(
struct xfs_mount *mp,
struct xfs_perag *pag)
{
+ /* AG btrees have at least 1 level. */
+ static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
+ const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
unsigned int min_free;
+ ASSERT(mp->m_ag_maxlevels > 0);
+
/* space needed by-bno freespace btree */
- min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1,
+ min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
mp->m_ag_maxlevels);
/* space needed by-size freespace btree */
- min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1,
+ min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
mp->m_ag_maxlevels);
/* space needed reverse mapping used space btree */
if (xfs_sb_version_hasrmapbt(&mp->m_sb))
- min_free += min_t(unsigned int,
- pag->pagf_levels[XFS_BTNUM_RMAPi] + 1,
- mp->m_rmap_maxlevels);
+ min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
+ mp->m_rmap_maxlevels);
return min_free;
}
@@ -2087,8 +2098,10 @@ xfs_free_agfl_block(
return error;
bp = xfs_btree_get_bufs(tp->t_mountp, tp, agno, agbno);
- if (!bp)
+ if (!bp) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, tp->t_mountp);
return -EFSCORRUPTED;
+ }
xfs_trans_binval(tp, bp);
return 0;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 510ca6974604..c83ff610ecb6 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -1007,7 +1007,7 @@ restart:
* The INCOMPLETE flag means that we will find the "old"
* attr, not the "new" one.
*/
- args->flags |= XFS_ATTR_INCOMPLETE;
+ args->op_flags |= XFS_DA_OP_INCOMPLETE;
state = xfs_da_state_alloc();
state->args = args;
state->mp = mp;
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index de33efc9b4f9..2b74b6e9a354 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -251,14 +251,6 @@ xfs_attr3_leaf_verify(
return fa;
/*
- * In recovery there is a transient state where count == 0 is valid
- * because we may have transitioned an empty shortform attr to a leaf
- * if the attr didn't fit in shortform.
- */
- if (!xfs_log_in_recovery(mp) && ichdr.count == 0)
- return __this_address;
-
- /*
* firstused is the block offset of the first name info structure.
* Make sure it doesn't go off the block or crash into the header.
*/
@@ -443,7 +435,7 @@ xfs_attr_copy_value(
*========================================================================*/
/*
- * Query whether the requested number of additional bytes of extended
+ * Query whether the total requested number of attr fork bytes of extended
* attribute space will be able to fit inline.
*
* Returns zero if not, else the di_forkoff fork offset to be used in the
@@ -463,8 +455,14 @@ xfs_attr_shortform_bytesfit(
int maxforkoff;
int offset;
+ /*
+ * Check if the new size could fit at all first:
+ */
+ if (bytes > XFS_LITINO(mp))
+ return 0;
+
/* rounded down */
- offset = (XFS_LITINO(mp, dp->i_d.di_version) - bytes) >> 3;
+ offset = (XFS_LITINO(mp) - bytes) >> 3;
if (dp->i_d.di_format == XFS_DINODE_FMT_DEV) {
minforkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
@@ -531,8 +529,7 @@ xfs_attr_shortform_bytesfit(
minforkoff = roundup(minforkoff, 8) >> 3;
/* attr fork btree root can have at least this many key/ptr pairs */
- maxforkoff = XFS_LITINO(mp, dp->i_d.di_version) -
- XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ maxforkoff = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
maxforkoff = maxforkoff >> 3; /* rounded down */
if (offset >= maxforkoff)
@@ -2287,8 +2284,10 @@ xfs_attr3_leaf_lookup_int(
leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(args->geo, &ichdr, leaf);
entries = xfs_attr3_leaf_entryp(leaf);
- if (ichdr.count >= args->geo->blksize / 8)
+ if (ichdr.count >= args->geo->blksize / 8) {
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
+ }
/*
* Binary search. (note: small blocks will skip this loop)
@@ -2304,10 +2303,14 @@ xfs_attr3_leaf_lookup_int(
else
break;
}
- if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count)))
+ if (!(probe >= 0 && (!ichdr.count || probe < ichdr.count))) {
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
- if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval))
+ }
+ if (!(span <= 4 || be32_to_cpu(entry->hashval) == hashval)) {
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
+ }
/*
* Since we may have duplicate hashval's, find the first matching
@@ -2339,8 +2342,8 @@ xfs_attr3_leaf_lookup_int(
* If we are looking for INCOMPLETE entries, show only those.
* If we are looking for complete entries, show only those.
*/
- if ((args->flags & XFS_ATTR_INCOMPLETE) !=
- (entry->flags & XFS_ATTR_INCOMPLETE)) {
+ if (!!(args->op_flags & XFS_DA_OP_INCOMPLETE) !=
+ !!(entry->flags & XFS_ATTR_INCOMPLETE)) {
continue;
}
if (entry->flags & XFS_ATTR_LOCAL) {
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 7b74e18becff..38c05d6ae2aa 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -17,13 +17,27 @@ struct xfs_inode;
struct xfs_trans;
/*
- * Used to keep a list of "remote value" extents when unlinking an inode.
+ * Incore version of the attribute leaf header.
*/
-typedef struct xfs_attr_inactive_list {
- xfs_dablk_t valueblk; /* block number of value bytes */
- int valuelen; /* number of bytes in value */
-} xfs_attr_inactive_list_t;
-
+struct xfs_attr3_icleaf_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t usedbytes;
+ /*
+ * Firstused is 32-bit here instead of 16-bit like the on-disk variant
+ * to support maximum fsb size of 64k without overflow issues throughout
+ * the attr code. Instead, the overflow condition is handled on
+ * conversion to/from disk.
+ */
+ uint32_t firstused;
+ __u8 holes;
+ struct {
+ uint16_t base;
+ uint16_t size;
+ } freemap[XFS_ATTR_LEAF_MAPSIZE];
+};
/*========================================================================
* Function prototypes for the kernel.
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 3e39b7d40f25..de9096b8a47c 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -25,6 +25,23 @@
#define ATTR_RMTVALUE_MAPSIZE 1 /* # of map entries at once */
/*
+ * Remote Attribute Values
+ * =======================
+ *
+ * Remote extended attribute values are conceptually simple -- they're written
+ * to data blocks mapped by an inode's attribute fork, and they have an upper
+ * size limit of 64k. Setting a value does not involve the XFS log.
+ *
+ * However, on a v5 filesystem, maximally sized remote attr values require one
+ * block more than 64k worth of space to hold both the remote attribute value
+ * header (64 bytes). On a 4k block filesystem this results in a 68k buffer;
+ * on a 64k block filesystem, this would be a 128k buffer. Note that the log
+ * format can only handle a dirty buffer of XFS_MAX_BLOCKSIZE length (64k).
+ * Therefore, we /must/ ensure that remote attribute value buffers never touch
+ * the logging system and therefore never have a log item.
+ */
+
+/*
* Each contiguous block has a header, so it is not just a simple attribute
* length to FSB conversion.
*/
@@ -400,17 +417,25 @@ xfs_attr_rmtval_get(
(map[i].br_startblock != HOLESTARTBLOCK));
dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
dblkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- error = xfs_trans_read_buf(mp, args->trans,
- mp->m_ddev_targp,
- dblkno, dblkcnt, 0, &bp,
- &xfs_attr3_rmt_buf_ops);
- if (error)
+ bp = xfs_buf_read(mp->m_ddev_targp, dblkno, dblkcnt, 0,
+ &xfs_attr3_rmt_buf_ops);
+ if (!bp)
+ return -ENOMEM;
+ error = bp->b_error;
+ if (error) {
+ xfs_buf_ioerror_alert(bp, __func__);
+ xfs_buf_relse(bp);
+
+ /* bad CRC means corrupted metadata */
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
return error;
+ }
error = xfs_attr_rmtval_copyout(mp, bp, args->dp->i_ino,
&offset, &valuelen,
&dst);
- xfs_trans_brelse(args->trans, bp);
+ xfs_buf_relse(bp);
if (error)
return error;
@@ -551,6 +576,32 @@ xfs_attr_rmtval_set(
return 0;
}
+/* Mark stale any incore buffers for the remote value. */
+int
+xfs_attr_rmtval_stale(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *map,
+ xfs_buf_flags_t incore_flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_buf *bp;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ ASSERT((map->br_startblock != DELAYSTARTBLOCK) &&
+ (map->br_startblock != HOLESTARTBLOCK));
+
+ bp = xfs_buf_incore(mp->m_ddev_targp,
+ XFS_FSB_TO_DADDR(mp, map->br_startblock),
+ XFS_FSB_TO_BB(mp, map->br_blockcount), incore_flags);
+ if (bp) {
+ xfs_buf_stale(bp);
+ xfs_buf_relse(bp);
+ }
+
+ return 0;
+}
+
/*
* Remove the value associated with an attribute by deleting the
* out-of-line buffer that it is stored on.
@@ -559,7 +610,6 @@ int
xfs_attr_rmtval_remove(
struct xfs_da_args *args)
{
- struct xfs_mount *mp = args->dp->i_mount;
xfs_dablk_t lblkno;
int blkcnt;
int error;
@@ -574,9 +624,6 @@ xfs_attr_rmtval_remove(
blkcnt = args->rmtblkcnt;
while (blkcnt > 0) {
struct xfs_bmbt_irec map;
- struct xfs_buf *bp;
- xfs_daddr_t dblkno;
- int dblkcnt;
int nmap;
/*
@@ -588,21 +635,9 @@ xfs_attr_rmtval_remove(
if (error)
return error;
ASSERT(nmap == 1);
- ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
- (map.br_startblock != HOLESTARTBLOCK));
-
- dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock),
- dblkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount);
-
- /*
- * If the "remote" value is in the cache, remove it.
- */
- bp = xfs_buf_incore(mp->m_ddev_targp, dblkno, dblkcnt, XBF_TRYLOCK);
- if (bp) {
- xfs_buf_stale(bp);
- xfs_buf_relse(bp);
- bp = NULL;
- }
+ error = xfs_attr_rmtval_stale(args->dp, &map, XBF_TRYLOCK);
+ if (error)
+ return error;
lblkno += map.br_blockcount;
blkcnt -= map.br_blockcount;
diff --git a/fs/xfs/libxfs/xfs_attr_remote.h b/fs/xfs/libxfs/xfs_attr_remote.h
index 9d20b66ad379..6fb4572845ce 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.h
+++ b/fs/xfs/libxfs/xfs_attr_remote.h
@@ -11,5 +11,7 @@ int xfs_attr3_rmt_blocks(struct xfs_mount *mp, int attrlen);
int xfs_attr_rmtval_get(struct xfs_da_args *args);
int xfs_attr_rmtval_set(struct xfs_da_args *args);
int xfs_attr_rmtval_remove(struct xfs_da_args *args);
+int xfs_attr_rmtval_stale(struct xfs_inode *ip, struct xfs_bmbt_irec *map,
+ xfs_buf_flags_t incore_flags);
#endif /* __XFS_ATTR_REMOTE_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index c114d24be619..1e0fab62cd7d 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -192,14 +192,12 @@ xfs_default_attroffset(
struct xfs_mount *mp = ip->i_mount;
uint offset;
- if (mp->m_sb.sb_inodesize == 256) {
- offset = XFS_LITINO(mp, ip->i_d.di_version) -
- XFS_BMDR_SPACE_CALC(MINABTPTRS);
- } else {
+ if (mp->m_sb.sb_inodesize == 256)
+ offset = XFS_LITINO(mp) - XFS_BMDR_SPACE_CALC(MINABTPTRS);
+ else
offset = XFS_BMDR_SPACE_CALC(6 * MINABTPTRS);
- }
- ASSERT(offset < XFS_LITINO(mp, ip->i_d.di_version));
+ ASSERT(offset < XFS_LITINO(mp));
return offset;
}
@@ -729,6 +727,7 @@ xfs_bmap_extents_to_btree(
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
abp = xfs_btree_get_bufl(mp, tp, args.fsbno);
if (!abp) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
error = -EFSCORRUPTED;
goto out_unreserve_dquot;
}
@@ -1084,6 +1083,7 @@ xfs_bmap_add_attrfork(
if (XFS_IFORK_Q(ip))
goto trans_cancel;
if (ip->i_d.di_anextents != 0) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
error = -EFSCORRUPTED;
goto trans_cancel;
}
@@ -1374,7 +1374,8 @@ xfs_bmap_last_before(
case XFS_DINODE_FMT_EXTENTS:
break;
default:
- return -EIO;
+ ASSERT(0);
+ return -EFSCORRUPTED;
}
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
@@ -1474,8 +1475,10 @@ xfs_bmap_last_offset(
return 0;
if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE &&
- XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS)
- return -EIO;
+ XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, &is_empty);
if (error || is_empty)
@@ -5871,8 +5874,9 @@ xfs_bmap_insert_extents(
XFS_WANT_CORRUPTED_GOTO(mp, !isnullstartblock(got.br_startblock),
del_cursor);
- if (stop_fsb >= got.br_startoff + got.br_blockcount) {
- error = -EIO;
+ if (stop_fsb > got.br_startoff) {
+ ASSERT(0);
+ error = -EFSCORRUPTED;
goto del_cursor;
}
@@ -5919,8 +5923,8 @@ del_cursor:
* @split_fsb is a block where the extents is split. If split_fsb lies in a
* hole or the first block of extents, just return 0.
*/
-STATIC int
-xfs_bmap_split_extent_at(
+int
+xfs_bmap_split_extent(
struct xfs_trans *tp,
struct xfs_inode *ip,
xfs_fileoff_t split_fsb)
@@ -6031,34 +6035,6 @@ del_cursor:
return error;
}
-int
-xfs_bmap_split_extent(
- struct xfs_inode *ip,
- xfs_fileoff_t split_fsb)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_trans *tp;
- int error;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
- XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
- if (error)
- return error;
-
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
- error = xfs_bmap_split_extent_at(tp, ip, split_fsb);
- if (error)
- goto out;
-
- return xfs_trans_commit(tp);
-
-out:
- xfs_trans_cancel(tp);
- return error;
-}
-
/* Deferred mapping is only for real extents in the data fork. */
static bool
xfs_bmap_is_update_needed(
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 093716a074fb..a51c2b13a51a 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -158,17 +158,22 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
{ BMAP_ATTRFORK, "ATTR" }, \
{ BMAP_COWFORK, "COW" }
+/* Return true if the extent is an allocated extent, written or not. */
+static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+{
+ return irec->br_startblock != HOLESTARTBLOCK &&
+ irec->br_startblock != DELAYSTARTBLOCK &&
+ !isnullstartblock(irec->br_startblock);
+}
/*
* Return true if the extent is a real, allocated extent, or false if it is a
* delayed allocation, and unwritten extent or a hole.
*/
-static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+static inline bool xfs_bmap_is_written_extent(struct xfs_bmbt_irec *irec)
{
- return irec->br_state != XFS_EXT_UNWRITTEN &&
- irec->br_startblock != HOLESTARTBLOCK &&
- irec->br_startblock != DELAYSTARTBLOCK &&
- !isnullstartblock(irec->br_startblock);
+ return xfs_bmap_is_real_extent(irec) &&
+ irec->br_state != XFS_EXT_UNWRITTEN;
}
/*
@@ -222,7 +227,8 @@ int xfs_bmap_can_insert_extents(struct xfs_inode *ip, xfs_fileoff_t off,
int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb,
bool *done, xfs_fileoff_t stop_fsb);
-int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset);
+int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip,
+ xfs_fileoff_t split_offset);
int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc,
struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur,
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 71de937f9e64..121251651fea 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -354,20 +354,17 @@ xfs_btree_free_block(
*/
void
xfs_btree_del_cursor(
- xfs_btree_cur_t *cur, /* btree cursor */
- int error) /* del because of error */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int error) /* del because of error */
{
- int i; /* btree level */
+ int i; /* btree level */
/*
- * Clear the buffer pointers, and release the buffers.
- * If we're doing this in the face of an error, we
- * need to make sure to inspect all of the entries
- * in the bc_bufs array for buffers to be unlocked.
- * This is because some of the btree code works from
- * level n down to 0, and if we get an error along
- * the way we won't have initialized all the entries
- * down to 0.
+ * Clear the buffer pointers and release the buffers. If we're doing
+ * this because of an error, inspect all of the entries in the bc_bufs
+ * array for buffers to be unlocked. This is because some of the btree
+ * code works from level n down to 0, and if we get an error along the
+ * way we won't have initialized all the entries down to 0.
*/
for (i = 0; i < cur->bc_nlevels; i++) {
if (cur->bc_bufs[i])
@@ -375,15 +372,10 @@ xfs_btree_del_cursor(
else if (!error)
break;
}
- /*
- * Can't free a bmap cursor without having dealt with the
- * allocated indirect blocks' accounting.
- */
+
ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
- cur->bc_private.b.allocated == 0);
- /*
- * Free the cursor.
- */
+ cur->bc_private.b.allocated == 0 ||
+ XFS_FORCED_SHUTDOWN(cur->bc_mp));
kmem_zone_free(xfs_btree_cur_zone, cur);
}
@@ -1820,6 +1812,7 @@ xfs_btree_lookup_get_block(
out_bad:
*blkp = NULL;
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(cur->bc_tp, bp);
return -EFSCORRUPTED;
}
@@ -1867,8 +1860,10 @@ xfs_btree_lookup(
XFS_BTREE_STATS_INC(cur, lookup);
/* No such thing as a zero-level tree. */
- if (cur->bc_nlevels == 0)
+ if (cur->bc_nlevels == 0) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, cur->bc_mp);
return -EFSCORRUPTED;
+ }
block = NULL;
keyno = 0;
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index 4fd1223c1bd5..12ef16c157dc 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -504,6 +504,7 @@ xfs_da3_split(
node = oldblk->bp->b_addr;
if (node->hdr.info.forw) {
if (be32_to_cpu(node->hdr.info.forw) != addblk->blkno) {
+ xfs_buf_mark_corrupt(oldblk->bp);
error = -EFSCORRUPTED;
goto out;
}
@@ -516,6 +517,7 @@ xfs_da3_split(
node = oldblk->bp->b_addr;
if (node->hdr.info.back) {
if (be32_to_cpu(node->hdr.info.back) != addblk->blkno) {
+ xfs_buf_mark_corrupt(oldblk->bp);
error = -EFSCORRUPTED;
goto out;
}
@@ -1541,8 +1543,10 @@ xfs_da3_node_lookup_int(
break;
}
- if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC)
+ if (magic != XFS_DA_NODE_MAGIC && magic != XFS_DA3_NODE_MAGIC) {
+ xfs_buf_mark_corrupt(blk->bp);
return -EFSCORRUPTED;
+ }
blk->magic = XFS_DA_NODE_MAGIC;
@@ -1554,15 +1558,18 @@ xfs_da3_node_lookup_int(
btree = dp->d_ops->node_tree_p(node);
/* Tree taller than we can handle; bail out! */
- if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH)
+ if (nodehdr.level >= XFS_DA_NODE_MAXDEPTH) {
+ xfs_buf_mark_corrupt(blk->bp);
return -EFSCORRUPTED;
+ }
/* Check the level from the root. */
if (blkno == args->geo->leafblk)
expected_level = nodehdr.level - 1;
- else if (expected_level != nodehdr.level)
+ else if (expected_level != nodehdr.level) {
+ xfs_buf_mark_corrupt(blk->bp);
return -EFSCORRUPTED;
- else
+ } else
expected_level--;
max = nodehdr.count;
@@ -1612,12 +1619,17 @@ xfs_da3_node_lookup_int(
}
/* We can't point back to the root. */
- if (blkno == args->geo->leafblk)
+ if (blkno == args->geo->leafblk) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW,
+ dp->i_mount);
return -EFSCORRUPTED;
+ }
}
- if (expected_level != 0)
+ if (expected_level != 0) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, dp->i_mount);
return -EFSCORRUPTED;
+ }
/*
* A leaf block that ends in the hashval that we are interested in
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index ae0bbd20d9ca..588e4674e931 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -82,6 +82,7 @@ typedef struct xfs_da_args {
#define XFS_DA_OP_OKNOENT 0x0008 /* lookup/add op, ENOENT ok, else die */
#define XFS_DA_OP_CILOOKUP 0x0010 /* lookup to return CI name if found */
#define XFS_DA_OP_ALLOCVAL 0x0020 /* lookup to alloc buffer if found */
+#define XFS_DA_OP_INCOMPLETE 0x0040 /* lookup INCOMPLETE attr keys */
#define XFS_DA_OP_FLAGS \
{ XFS_DA_OP_JUSTCHECK, "JUSTCHECK" }, \
@@ -89,7 +90,8 @@ typedef struct xfs_da_args {
{ XFS_DA_OP_ADDNAME, "ADDNAME" }, \
{ XFS_DA_OP_OKNOENT, "OKNOENT" }, \
{ XFS_DA_OP_CILOOKUP, "CILOOKUP" }, \
- { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }
+ { XFS_DA_OP_ALLOCVAL, "ALLOCVAL" }, \
+ { XFS_DA_OP_INCOMPLETE, "INCOMPLETE" }
/*
* Storage for holding state during Btree searches and split/join ops.
@@ -125,6 +127,19 @@ typedef struct xfs_da_state {
} xfs_da_state_t;
/*
+ * In-core version of the node header to abstract the differences in the v2 and
+ * v3 disk format of the headers. Callers need to convert to/from disk format as
+ * appropriate.
+ */
+struct xfs_da3_icnode_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t level;
+};
+
+/*
* Utility macros to aid in logging changed structure fields.
*/
#define XFS_DA_LOGOFF(BASE, ADDR) ((char *)(ADDR) - (char *)(BASE))
diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c
index b1ae572496b6..31bb250c1899 100644
--- a/fs/xfs/libxfs/xfs_da_format.c
+++ b/fs/xfs/libxfs/xfs_da_format.c
@@ -13,6 +13,7 @@
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
/*
* Shortform directory ops
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index ae654e06b2fb..222ee48da5e8 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -94,19 +94,6 @@ struct xfs_da3_intnode {
};
/*
- * In-core version of the node header to abstract the differences in the v2 and
- * v3 disk format of the headers. Callers need to convert to/from disk format as
- * appropriate.
- */
-struct xfs_da3_icnode_hdr {
- uint32_t forw;
- uint32_t back;
- uint16_t magic;
- uint16_t count;
- uint16_t level;
-};
-
-/*
* Directory version 2.
*
* There are 4 possible formats:
@@ -434,14 +421,6 @@ struct xfs_dir3_leaf_hdr {
__be32 pad; /* 64 bit alignment */
};
-struct xfs_dir3_icleaf_hdr {
- uint32_t forw;
- uint32_t back;
- uint16_t magic;
- uint16_t count;
- uint16_t stale;
-};
-
/*
* Leaf block entry.
*/
@@ -521,19 +500,6 @@ struct xfs_dir3_free {
#define XFS_DIR3_FREE_CRC_OFF offsetof(struct xfs_dir3_free, hdr.hdr.crc)
/*
- * In core version of the free block header, abstracted away from on-disk format
- * differences. Use this in the code, and convert to/from the disk version using
- * xfs_dir3_free_hdr_from_disk/xfs_dir3_free_hdr_to_disk.
- */
-struct xfs_dir3_icfree_hdr {
- uint32_t magic;
- uint32_t firstdb;
- uint32_t nvalid;
- uint32_t nused;
-
-};
-
-/*
* Single block format.
*
* The single block format looks like the following drawing on disk:
@@ -710,29 +676,6 @@ struct xfs_attr3_leafblock {
};
/*
- * incore, neutral version of the attribute leaf header
- */
-struct xfs_attr3_icleaf_hdr {
- uint32_t forw;
- uint32_t back;
- uint16_t magic;
- uint16_t count;
- uint16_t usedbytes;
- /*
- * firstused is 32-bit here instead of 16-bit like the on-disk variant
- * to support maximum fsb size of 64k without overflow issues throughout
- * the attr code. Instead, the overflow condition is handled on
- * conversion to/from disk.
- */
- uint32_t firstused;
- __u8 holes;
- struct {
- uint16_t base;
- uint16_t size;
- } freemap[XFS_ATTR_LEAF_MAPSIZE];
-};
-
-/*
* Special value to represent fs block size in the leaf header firstused field.
* Only used when block size overflows the 2-bytes available on disk.
*/
@@ -740,8 +683,6 @@ struct xfs_attr3_icleaf_hdr {
/*
* Flags used in the leaf_entry[i].flags field.
- * NOTE: the INCOMPLETE bit must not collide with the flags bits specified
- * on the system call, they are "or"ed together for various operations.
*/
#define XFS_ATTR_LOCAL_BIT 0 /* attr is stored locally */
#define XFS_ATTR_ROOT_BIT 1 /* limit access to trusted attrs */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index 22557527cfdb..3a78a189ea01 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -16,6 +16,8 @@
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_log.h"
/*
* Deferred Operations in XFS
@@ -178,6 +180,19 @@ static const struct xfs_defer_op_type *defer_op_types[] = {
[XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
};
+static void
+xfs_defer_create_intent(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp,
+ bool sort)
+{
+ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+
+ if (!dfp->dfp_intent)
+ dfp->dfp_intent = ops->create_intent(tp, &dfp->dfp_work,
+ dfp->dfp_count, sort);
+}
+
/*
* For each pending item in the intake list, log its intent item and the
* associated extents, then add the entire intake list to the end of
@@ -187,17 +202,11 @@ STATIC void
xfs_defer_create_intents(
struct xfs_trans *tp)
{
- struct list_head *li;
struct xfs_defer_pending *dfp;
- const struct xfs_defer_op_type *ops;
list_for_each_entry(dfp, &tp->t_dfops, dfp_list) {
- ops = defer_op_types[dfp->dfp_type];
- dfp->dfp_intent = ops->create_intent(tp, dfp->dfp_count);
trace_xfs_defer_create_intent(tp->t_mountp, dfp);
- list_sort(tp->t_mountp, &dfp->dfp_work, ops->diff_items);
- list_for_each(li, &dfp->dfp_work)
- ops->log_item(tp, dfp->dfp_intent, li);
+ xfs_defer_create_intent(tp, dfp, true);
}
}
@@ -234,10 +243,13 @@ xfs_defer_trans_roll(
struct xfs_log_item *lip;
struct xfs_buf *bplist[XFS_DEFER_OPS_NR_BUFS];
struct xfs_inode *iplist[XFS_DEFER_OPS_NR_INODES];
+ unsigned int ordered = 0; /* bitmap */
int bpcount = 0, ipcount = 0;
int i;
int error;
+ BUILD_BUG_ON(NBBY * sizeof(ordered) < XFS_DEFER_OPS_NR_BUFS);
+
list_for_each_entry(lip, &tp->t_items, li_trans) {
switch (lip->li_type) {
case XFS_LI_BUF:
@@ -248,7 +260,10 @@ xfs_defer_trans_roll(
ASSERT(0);
return -EFSCORRUPTED;
}
- xfs_trans_dirty_buf(tp, bli->bli_buf);
+ if (bli->bli_flags & XFS_BLI_ORDERED)
+ ordered |= (1U << bpcount);
+ else
+ xfs_trans_dirty_buf(tp, bli->bli_buf);
bplist[bpcount++] = bli->bli_buf;
}
break;
@@ -289,6 +304,8 @@ xfs_defer_trans_roll(
/* Rejoin the buffers and dirty them so the log moves forward. */
for (i = 0; i < bpcount; i++) {
xfs_trans_bjoin(tp, bplist[i]);
+ if (ordered & (1U << i))
+ xfs_trans_ordered_buf(tp, bplist[i]);
xfs_trans_bhold(tp, bplist[i]);
}
@@ -346,6 +363,106 @@ xfs_defer_cancel_list(
}
/*
+ * Prevent a log intent item from pinning the tail of the log by logging a
+ * done item to release the intent item; and then log a new intent item.
+ * The caller should provide a fresh transaction and roll it after we're done.
+ */
+static int
+xfs_defer_relog(
+ struct xfs_trans **tpp,
+ struct list_head *dfops)
+{
+ struct xlog *log = (*tpp)->t_mountp->m_log;
+ struct xfs_defer_pending *dfp;
+ xfs_lsn_t threshold_lsn = NULLCOMMITLSN;
+
+
+ ASSERT((*tpp)->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ list_for_each_entry(dfp, dfops, dfp_list) {
+ /*
+ * If the log intent item for this deferred op is not a part of
+ * the current log checkpoint, relog the intent item to keep
+ * the log tail moving forward. We're ok with this being racy
+ * because an incorrect decision means we'll be a little slower
+ * at pushing the tail.
+ */
+ if (dfp->dfp_intent == NULL ||
+ xfs_log_item_in_current_chkpt(dfp->dfp_intent))
+ continue;
+
+ /*
+ * Figure out where we need the tail to be in order to maintain
+ * the minimum required free space in the log. Only sample
+ * the log threshold once per call.
+ */
+ if (threshold_lsn == NULLCOMMITLSN) {
+ threshold_lsn = xlog_grant_push_threshold(log, 0);
+ if (threshold_lsn == NULLCOMMITLSN)
+ break;
+ }
+ if (XFS_LSN_CMP(dfp->dfp_intent->li_lsn, threshold_lsn) >= 0)
+ continue;
+
+ trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
+ XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
+ dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
+ }
+
+ if ((*tpp)->t_flags & XFS_TRANS_DIRTY)
+ return xfs_defer_trans_roll(tpp);
+ return 0;
+}
+
+/*
+ * Log an intent-done item for the first pending intent, and finish the work
+ * items.
+ */
+static int
+xfs_defer_finish_one(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+ void *state = NULL;
+ struct list_head *li, *n;
+ int error;
+
+ trace_xfs_defer_pending_finish(tp->t_mountp, dfp);
+
+ dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
+ list_for_each_safe(li, n, &dfp->dfp_work) {
+ list_del(li);
+ dfp->dfp_count--;
+ error = ops->finish_item(tp, li, dfp->dfp_done, &state);
+ if (error == -EAGAIN) {
+ /*
+ * Caller wants a fresh transaction; put the work item
+ * back on the list and log a new log intent item to
+ * replace the old one. See "Requesting a Fresh
+ * Transaction while Finishing Deferred Work" above.
+ */
+ list_add(li, &dfp->dfp_work);
+ dfp->dfp_count++;
+ dfp->dfp_done = NULL;
+ dfp->dfp_intent = NULL;
+ xfs_defer_create_intent(tp, dfp, false);
+ }
+
+ if (error)
+ goto out;
+ }
+
+ /* Done with the dfp, free it. */
+ list_del(&dfp->dfp_list);
+ kmem_free(dfp);
+out:
+ if (ops->finish_cleanup)
+ ops->finish_cleanup(tp, state, error);
+ return error;
+}
+
+/*
* Finish all the pending work. This involves logging intent items for
* any work items that wandered in since the last transaction roll (if
* one has even happened), rolling the transaction, and finishing the
@@ -358,11 +475,7 @@ xfs_defer_finish_noroll(
struct xfs_trans **tp)
{
struct xfs_defer_pending *dfp;
- struct list_head *li;
- struct list_head *n;
- void *state;
int error = 0;
- const struct xfs_defer_op_type *ops;
LIST_HEAD(dop_pending);
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -371,87 +484,44 @@ xfs_defer_finish_noroll(
/* Until we run out of pending work to finish... */
while (!list_empty(&dop_pending) || !list_empty(&(*tp)->t_dfops)) {
- /* log intents and pull in intake items */
- xfs_defer_create_intents(*tp);
- list_splice_tail_init(&(*tp)->t_dfops, &dop_pending);
-
/*
- * Roll the transaction.
+ * Deferred items that are created in the process of finishing
+ * other deferred work items should be queued at the head of
+ * the pending list, which puts them ahead of the deferred work
+ * that was created by the caller. This keeps the number of
+ * pending work items to a minimum, which decreases the amount
+ * of time that any one intent item can stick around in memory,
+ * pinning the log tail.
*/
+ xfs_defer_create_intents(*tp);
+ list_splice_init(&(*tp)->t_dfops, &dop_pending);
+
error = xfs_defer_trans_roll(tp);
if (error)
- goto out;
+ goto out_shutdown;
+
+ /* Possibly relog intent items to keep the log moving. */
+ error = xfs_defer_relog(tp, &dop_pending);
+ if (error)
+ goto out_shutdown;
- /* Log an intent-done item for the first pending item. */
dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
dfp_list);
- ops = defer_op_types[dfp->dfp_type];
- trace_xfs_defer_pending_finish((*tp)->t_mountp, dfp);
- dfp->dfp_done = ops->create_done(*tp, dfp->dfp_intent,
- dfp->dfp_count);
-
- /* Finish the work items. */
- state = NULL;
- list_for_each_safe(li, n, &dfp->dfp_work) {
- list_del(li);
- dfp->dfp_count--;
- error = ops->finish_item(*tp, li, dfp->dfp_done,
- &state);
- if (error == -EAGAIN) {
- /*
- * Caller wants a fresh transaction;
- * put the work item back on the list
- * and jump out.
- */
- list_add(li, &dfp->dfp_work);
- dfp->dfp_count++;
- break;
- } else if (error) {
- /*
- * Clean up after ourselves and jump out.
- * xfs_defer_cancel will take care of freeing
- * all these lists and stuff.
- */
- if (ops->finish_cleanup)
- ops->finish_cleanup(*tp, state, error);
- goto out;
- }
- }
- if (error == -EAGAIN) {
- /*
- * Caller wants a fresh transaction, so log a
- * new log intent item to replace the old one
- * and roll the transaction. See "Requesting
- * a Fresh Transaction while Finishing
- * Deferred Work" above.
- */
- dfp->dfp_intent = ops->create_intent(*tp,
- dfp->dfp_count);
- dfp->dfp_done = NULL;
- list_for_each(li, &dfp->dfp_work)
- ops->log_item(*tp, dfp->dfp_intent, li);
- } else {
- /* Done with the dfp, free it. */
- list_del(&dfp->dfp_list);
- kmem_free(dfp);
- }
-
- if (ops->finish_cleanup)
- ops->finish_cleanup(*tp, state, error);
- }
-
-out:
- if (error) {
- xfs_defer_trans_abort(*tp, &dop_pending);
- xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
- trace_xfs_defer_finish_error(*tp, error);
- xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
- xfs_defer_cancel(*tp);
- return error;
+ error = xfs_defer_finish_one(*tp, dfp);
+ if (error && error != -EAGAIN)
+ goto out_shutdown;
}
trace_xfs_defer_finish_done(*tp, _RET_IP_);
return 0;
+
+out_shutdown:
+ xfs_defer_trans_abort(*tp, &dop_pending);
+ xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
+ trace_xfs_defer_finish_error(*tp, error);
+ xfs_defer_cancel_list((*tp)->t_mountp, &dop_pending);
+ xfs_defer_cancel(*tp);
+ return error;
}
int
@@ -552,3 +622,137 @@ xfs_defer_move(
xfs_defer_reset(stp);
}
+
+/*
+ * Prepare a chain of fresh deferred ops work items to be completed later. Log
+ * recovery requires the ability to put off until later the actual finishing
+ * work so that it can process unfinished items recovered from the log in
+ * correct order.
+ *
+ * Create and log intent items for all the work that we're capturing so that we
+ * can be assured that the items will get replayed if the system goes down
+ * before log recovery gets a chance to finish the work it put off. The entire
+ * deferred ops state is transferred to the capture structure and the
+ * transaction is then ready for the caller to commit it. If there are no
+ * intent items to capture, this function returns NULL.
+ *
+ * If capture_ip is not NULL, the capture structure will obtain an extra
+ * reference to the inode.
+ */
+static struct xfs_defer_capture *
+xfs_defer_ops_capture(
+ struct xfs_trans *tp,
+ struct xfs_inode *capture_ip)
+{
+ struct xfs_defer_capture *dfc;
+
+ if (list_empty(&tp->t_dfops))
+ return NULL;
+
+ /* Create an object to capture the defer ops. */
+ dfc = kmem_zalloc(sizeof(*dfc), KM_NOFS);
+ INIT_LIST_HEAD(&dfc->dfc_list);
+ INIT_LIST_HEAD(&dfc->dfc_dfops);
+
+ xfs_defer_create_intents(tp);
+
+ /* Move the dfops chain and transaction state to the capture struct. */
+ list_splice_init(&tp->t_dfops, &dfc->dfc_dfops);
+ dfc->dfc_tpflags = tp->t_flags & XFS_TRANS_LOWMODE;
+ tp->t_flags &= ~XFS_TRANS_LOWMODE;
+
+ /* Capture the remaining block reservations along with the dfops. */
+ dfc->dfc_blkres = tp->t_blk_res - tp->t_blk_res_used;
+ dfc->dfc_rtxres = tp->t_rtx_res - tp->t_rtx_res_used;
+
+ /* Preserve the log reservation size. */
+ dfc->dfc_logres = tp->t_log_res;
+
+ /*
+ * Grab an extra reference to this inode and attach it to the capture
+ * structure.
+ */
+ if (capture_ip) {
+ ihold(VFS_I(capture_ip));
+ dfc->dfc_capture_ip = capture_ip;
+ }
+
+ return dfc;
+}
+
+/* Release all resources that we used to capture deferred ops. */
+void
+xfs_defer_ops_release(
+ struct xfs_mount *mp,
+ struct xfs_defer_capture *dfc)
+{
+ xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
+ if (dfc->dfc_capture_ip)
+ xfs_irele(dfc->dfc_capture_ip);
+ kmem_free(dfc);
+}
+
+/*
+ * Capture any deferred ops and commit the transaction. This is the last step
+ * needed to finish a log intent item that we recovered from the log. If any
+ * of the deferred ops operate on an inode, the caller must pass in that inode
+ * so that the reference can be transferred to the capture structure. The
+ * caller must hold ILOCK_EXCL on the inode, and must unlock it before calling
+ * xfs_defer_ops_continue.
+ */
+int
+xfs_defer_ops_capture_and_commit(
+ struct xfs_trans *tp,
+ struct xfs_inode *capture_ip,
+ struct list_head *capture_list)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_defer_capture *dfc;
+ int error;
+
+ ASSERT(!capture_ip || xfs_isilocked(capture_ip, XFS_ILOCK_EXCL));
+
+ /* If we don't capture anything, commit transaction and exit. */
+ dfc = xfs_defer_ops_capture(tp, capture_ip);
+ if (!dfc)
+ return xfs_trans_commit(tp);
+
+ /* Commit the transaction and add the capture structure to the list. */
+ error = xfs_trans_commit(tp);
+ if (error) {
+ xfs_defer_ops_release(mp, dfc);
+ return error;
+ }
+
+ list_add_tail(&dfc->dfc_list, capture_list);
+ return 0;
+}
+
+/*
+ * Attach a chain of captured deferred ops to a new transaction and free the
+ * capture structure. If an inode was captured, it will be passed back to the
+ * caller with ILOCK_EXCL held and joined to the transaction with lockflags==0.
+ * The caller now owns the inode reference.
+ */
+void
+xfs_defer_ops_continue(
+ struct xfs_defer_capture *dfc,
+ struct xfs_trans *tp,
+ struct xfs_inode **captured_ipp)
+{
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+ ASSERT(!(tp->t_flags & XFS_TRANS_DIRTY));
+
+ /* Lock and join the captured inode to the new transaction. */
+ if (dfc->dfc_capture_ip) {
+ xfs_ilock(dfc->dfc_capture_ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, dfc->dfc_capture_ip, 0);
+ }
+ *captured_ipp = dfc->dfc_capture_ip;
+
+ /* Move captured dfops chain and state to the transaction. */
+ list_splice_init(&dfc->dfc_dfops, &tp->t_dfops);
+ tp->t_flags |= dfc->dfc_tpflags;
+
+ kmem_free(dfc);
+}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 7c28d7608ac6..4c3248d47a35 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -7,6 +7,7 @@
#define __XFS_DEFER_H__
struct xfs_defer_op_type;
+struct xfs_defer_capture;
/*
* Header for deferred operation list.
@@ -28,7 +29,7 @@ enum xfs_defer_ops_type {
struct xfs_defer_pending {
struct list_head dfp_list; /* pending items */
struct list_head dfp_work; /* work items */
- void *dfp_intent; /* log intent item */
+ struct xfs_log_item *dfp_intent; /* log intent item */
void *dfp_done; /* log done item */
unsigned int dfp_count; /* # extent items */
enum xfs_defer_ops_type dfp_type;
@@ -43,15 +44,15 @@ void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp);
/* Description of a deferred type. */
struct xfs_defer_op_type {
- void (*abort_intent)(void *);
- void *(*create_done)(struct xfs_trans *, void *, unsigned int);
+ struct xfs_log_item *(*create_intent)(struct xfs_trans *tp,
+ struct list_head *items, unsigned int count, bool sort);
+ void (*abort_intent)(struct xfs_log_item *intent);
+ void *(*create_done)(struct xfs_trans *tp, struct xfs_log_item *intent,
+ unsigned int count);
int (*finish_item)(struct xfs_trans *, struct list_head *, void *,
void **);
void (*finish_cleanup)(struct xfs_trans *, void *, int);
void (*cancel_item)(struct list_head *);
- int (*diff_items)(void *, struct list_head *, struct list_head *);
- void *(*create_intent)(struct xfs_trans *, uint);
- void (*log_item)(struct xfs_trans *, void *, struct list_head *);
unsigned int max_items;
};
@@ -61,4 +62,40 @@ extern const struct xfs_defer_op_type xfs_rmap_update_defer_type;
extern const struct xfs_defer_op_type xfs_extent_free_defer_type;
extern const struct xfs_defer_op_type xfs_agfl_free_defer_type;
+/*
+ * This structure enables a dfops user to detach the chain of deferred
+ * operations from a transaction so that they can be continued later.
+ */
+struct xfs_defer_capture {
+ /* List of other capture structures. */
+ struct list_head dfc_list;
+
+ /* Deferred ops state saved from the transaction. */
+ struct list_head dfc_dfops;
+ unsigned int dfc_tpflags;
+
+ /* Block reservations for the data and rt devices. */
+ unsigned int dfc_blkres;
+ unsigned int dfc_rtxres;
+
+ /* Log reservation saved from the transaction. */
+ unsigned int dfc_logres;
+
+ /*
+ * An inode reference that must be maintained to complete the deferred
+ * work.
+ */
+ struct xfs_inode *dfc_capture_ip;
+};
+
+/*
+ * Functions to capture a chain of deferred operations and continue them later.
+ * This doesn't normally happen except log recovery.
+ */
+int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp,
+ struct xfs_inode *capture_ip, struct list_head *capture_list);
+void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp,
+ struct xfs_inode **captured_ipp);
+void xfs_defer_ops_release(struct xfs_mount *mp, struct xfs_defer_capture *d);
+
#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index 867c5dee0751..452d04ae10ce 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -600,8 +600,10 @@ xfs_dir2_isblock(
if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK)))
return rval;
rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize;
- if (rval != 0 && args->dp->i_d.di_size != args->geo->blksize)
+ if (rval != 0 && args->dp->i_d.di_size != args->geo->blksize) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, args->dp->i_mount);
return -EFSCORRUPTED;
+ }
*vp = rval;
return 0;
}
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index f54244779492..e170792c0acc 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -18,6 +18,8 @@ struct xfs_dir2_sf_entry;
struct xfs_dir2_data_hdr;
struct xfs_dir2_data_entry;
struct xfs_dir2_data_unused;
+struct xfs_dir3_icfree_hdr;
+struct xfs_dir3_icleaf_hdr;
extern struct xfs_name xfs_name_dotdot;
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 49e4bc39e7bb..d034d661957c 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -114,6 +114,23 @@ const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
.verify_struct = xfs_dir3_block_verify,
};
+static xfs_failaddr_t
+xfs_dir3_block_header_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_blk_hdr *hdr3 = bp->b_addr;
+
+ if (be64_to_cpu(hdr3->owner) != dp->i_ino)
+ return __this_address;
+ }
+
+ return NULL;
+}
+
int
xfs_dir3_block_read(
struct xfs_trans *tp,
@@ -121,12 +138,24 @@ xfs_dir3_block_read(
struct xfs_buf **bpp)
{
struct xfs_mount *mp = dp->i_mount;
+ xfs_failaddr_t fa;
int err;
err = xfs_da_read_buf(tp, dp, mp->m_dir_geo->datablk, -1, bpp,
XFS_DATA_FORK, &xfs_dir3_block_buf_ops);
- if (!err && tp && *bpp)
- xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_block_header_check(dp, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_BLOCK_BUF);
return err;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index 2c79be4c3153..2d92bcd8c801 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -348,6 +348,22 @@ static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
.verify_write = xfs_dir3_data_write_verify,
};
+static xfs_failaddr_t
+xfs_dir3_data_header_check(
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dir3_data_hdr *hdr3 = bp->b_addr;
+
+ if (be64_to_cpu(hdr3->hdr.owner) != dp->i_ino)
+ return __this_address;
+ }
+
+ return NULL;
+}
int
xfs_dir3_data_read(
@@ -357,12 +373,24 @@ xfs_dir3_data_read(
xfs_daddr_t mapped_bno,
struct xfs_buf **bpp)
{
+ xfs_failaddr_t fa;
int err;
err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
XFS_DATA_FORK, &xfs_dir3_data_buf_ops);
- if (!err && tp && *bpp)
- xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
+ if (err || !*bpp)
+ return err;
+
+ /* Check things that we can't do in the verifier. */
+ fa = xfs_dir3_data_header_check(dp, *bpp);
+ if (fa) {
+ __xfs_buf_mark_corrupt(*bpp, fa);
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF);
return err;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index a53e4585a2f3..c8ee3250b749 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -1343,8 +1343,10 @@ xfs_dir2_leaf_removename(
oldbest = be16_to_cpu(bf[0].length);
ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
bestsp = xfs_dir2_leaf_bests_p(ltp);
- if (be16_to_cpu(bestsp[db]) != oldbest)
+ if (be16_to_cpu(bestsp[db]) != oldbest) {
+ xfs_buf_mark_corrupt(lbp);
return -EFSCORRUPTED;
+ }
/*
* Mark the former data entry unused.
*/
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 99d5b2ed67f2..c8c3c3af539f 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -208,7 +208,7 @@ __xfs_dir3_free_read(
/* Check things that we can't do in the verifier. */
fa = xfs_dir3_free_header_check(dp, fbno, *bpp);
if (fa) {
- xfs_verifier_error(*bpp, -EFSCORRUPTED, fa);
+ __xfs_buf_mark_corrupt(*bpp, fa);
xfs_trans_brelse(tp, *bpp);
*bpp = NULL;
return -EFSCORRUPTED;
@@ -374,8 +374,10 @@ xfs_dir2_leaf_to_node(
leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(args->geo, leaf);
if (be32_to_cpu(ltp->bestcount) >
- (uint)dp->i_d.di_size / args->geo->blksize)
+ (uint)dp->i_d.di_size / args->geo->blksize) {
+ xfs_buf_mark_corrupt(lbp);
return -EFSCORRUPTED;
+ }
/*
* Copy freespace entries from the leaf block to the new block.
@@ -446,8 +448,10 @@ xfs_dir2_leafn_add(
* Quick check just to make sure we are not going to index
* into other peoples memory
*/
- if (index < 0)
+ if (index < 0) {
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
+ }
/*
* If there are already the maximum number of leaf entries in
@@ -740,8 +744,10 @@ xfs_dir2_leafn_lookup_for_entry(
ents = dp->d_ops->leaf_ents_p(leaf);
xfs_dir3_leaf_check(dp, bp);
- if (leafhdr.count <= 0)
+ if (leafhdr.count <= 0) {
+ xfs_buf_mark_corrupt(bp);
return -EFSCORRUPTED;
+ }
/*
* Look up the hash value in the leaf entries.
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 59f9fb2241a5..d2eaea663e7f 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -8,6 +8,25 @@
struct dir_context;
+/*
+ * In-core version of the leaf and free block headers to abstract the
+ * differences in the v2 and v3 disk format of the headers.
+ */
+struct xfs_dir3_icleaf_hdr {
+ uint32_t forw;
+ uint32_t back;
+ uint16_t magic;
+ uint16_t count;
+ uint16_t stale;
+};
+
+struct xfs_dir3_icfree_hdr {
+ uint32_t magic;
+ uint32_t firstdb;
+ uint32_t nvalid;
+ uint32_t nused;
+};
+
/* xfs_dir2.c */
extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space,
xfs_dir2_db_t *dbp);
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index ae16ca7c422a..f980c3f3d2f6 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -945,6 +945,27 @@ xfs_dir2_sf_removename(
}
/*
+ * Check whether the sf dir replace operation need more blocks.
+ */
+static bool
+xfs_dir2_sf_replace_needblock(
+ struct xfs_inode *dp,
+ xfs_ino_t inum)
+{
+ int newsize;
+ struct xfs_dir2_sf_hdr *sfp;
+
+ if (dp->i_d.di_format != XFS_DINODE_FMT_LOCAL)
+ return false;
+
+ sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
+ newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
+
+ return inum > XFS_DIR2_MAX_SHORT_INUM &&
+ sfp->i8count == 0 && newsize > XFS_IFORK_DSIZE(dp);
+}
+
+/*
* Replace the inode number of an entry in a shortform directory.
*/
int /* error */
@@ -980,17 +1001,14 @@ xfs_dir2_sf_replace(
*/
if (args->inumber > XFS_DIR2_MAX_SHORT_INUM && sfp->i8count == 0) {
int error; /* error return value */
- int newsize; /* new inode size */
- newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
/*
* Won't fit as shortform, convert to block then do replace.
*/
- if (newsize > XFS_IFORK_DSIZE(dp)) {
+ if (xfs_dir2_sf_replace_needblock(dp, args->inumber)) {
error = xfs_dir2_sf_to_block(args);
- if (error) {
+ if (error)
return error;
- }
return xfs_dir2_block_replace(args);
}
/*
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index e8bd688a4073..bedc1e752b60 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -35,10 +35,10 @@ xfs_calc_dquots_per_chunk(
xfs_failaddr_t
xfs_dquot_verify(
- struct xfs_mount *mp,
- xfs_disk_dquot_t *ddq,
- xfs_dqid_t id,
- uint type) /* used only during quotacheck */
+ struct xfs_mount *mp,
+ struct xfs_disk_dquot *ddq,
+ xfs_dqid_t id,
+ uint type) /* used only during quotacheck */
{
/*
* We can encounter an uninitialized dquot buffer for 2 reasons:
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index c968b60cee15..31fa9ab2ab61 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -497,6 +497,23 @@ static inline bool xfs_sb_version_hascrc(struct xfs_sb *sbp)
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
}
+/*
+ * v5 file systems support V3 inodes only, earlier file systems support
+ * v2 and v1 inodes.
+ */
+static inline bool xfs_sb_version_has_v3inode(struct xfs_sb *sbp)
+{
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
+}
+
+static inline bool xfs_dinode_good_version(struct xfs_sb *sbp,
+ uint8_t version)
+{
+ if (xfs_sb_version_has_v3inode(sbp))
+ return version == 3;
+ return version == 1 || version == 2;
+}
+
static inline bool xfs_sb_version_has_pquotino(struct xfs_sb *sbp)
{
return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5;
@@ -946,8 +963,12 @@ typedef enum xfs_dinode_fmt {
/*
* Inode size for given fs.
*/
-#define XFS_LITINO(mp, version) \
- ((int)(((mp)->m_sb.sb_inodesize) - xfs_dinode_size(version)))
+#define XFS_DINODE_SIZE(sbp) \
+ (xfs_sb_version_has_v3inode(sbp) ? \
+ sizeof(struct xfs_dinode) : \
+ offsetof(struct xfs_dinode, di_crc))
+#define XFS_LITINO(mp) \
+ ((mp)->m_sb.sb_inodesize - XFS_DINODE_SIZE(&(mp)->m_sb))
/*
* Inode data & attribute fork sizes, per inode.
@@ -956,13 +977,9 @@ typedef enum xfs_dinode_fmt {
#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
#define XFS_DFORK_DSIZE(dip,mp) \
- (XFS_DFORK_Q(dip) ? \
- XFS_DFORK_BOFF(dip) : \
- XFS_LITINO(mp, (dip)->di_version))
+ (XFS_DFORK_Q(dip) ? XFS_DFORK_BOFF(dip) : XFS_LITINO(mp))
#define XFS_DFORK_ASIZE(dip,mp) \
- (XFS_DFORK_Q(dip) ? \
- XFS_LITINO(mp, (dip)->di_version) - XFS_DFORK_BOFF(dip) : \
- 0)
+ (XFS_DFORK_Q(dip) ? XFS_LITINO(mp) - XFS_DFORK_BOFF(dip) : 0)
#define XFS_DFORK_SIZE(dip,mp,w) \
((w) == XFS_DATA_FORK ? \
XFS_DFORK_DSIZE(dip, mp) : \
@@ -1144,11 +1161,11 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
/*
* This is the main portion of the on-disk representation of quota
- * information for a user. This is the q_core of the xfs_dquot_t that
+ * information for a user. This is the q_core of the struct xfs_dquot that
* is kept in kernel memory. We pad this with some more expansion room
* to construct the on disk structure.
*/
-typedef struct xfs_disk_dquot {
+struct xfs_disk_dquot {
__be16 d_magic; /* dquot magic = XFS_DQUOT_MAGIC */
__u8 d_version; /* dquot version */
__u8 d_flags; /* XFS_DQ_USER/PROJ/GROUP */
@@ -1171,15 +1188,15 @@ typedef struct xfs_disk_dquot {
__be32 d_rtbtimer; /* similar to above; for RT disk blocks */
__be16 d_rtbwarns; /* warnings issued wrt RT disk blocks */
__be16 d_pad;
-} xfs_disk_dquot_t;
+};
/*
* This is what goes on disk. This is separated from the xfs_disk_dquot because
* carrying the unnecessary padding would be a waste of memory.
*/
typedef struct xfs_dqblk {
- xfs_disk_dquot_t dd_diskdq; /* portion that lives incore as well */
- char dd_fill[4]; /* filling for posterity */
+ struct xfs_disk_dquot dd_diskdq; /* portion living incore as well */
+ char dd_fill[4];/* filling for posterity */
/*
* These two are only present on filesystems with the CRC bits set.
@@ -1540,6 +1557,13 @@ typedef struct xfs_bmdr_block {
#define BMBT_BLOCKCOUNT_BITLEN 21
#define BMBT_STARTOFF_MASK ((1ULL << BMBT_STARTOFF_BITLEN) - 1)
+#define BMBT_BLOCKCOUNT_MASK ((1ULL << BMBT_BLOCKCOUNT_BITLEN) - 1)
+
+/*
+ * bmbt records have a file offset (block) field that is 54 bits wide, so this
+ * is the largest xfs_fileoff_t that we ever expect to see.
+ */
+#define XFS_MAX_FILEOFF (BMBT_STARTOFF_MASK + BMBT_BLOCKCOUNT_MASK)
typedef struct xfs_bmbt_rec {
__be64 l0, l1;
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 443cf33f6666..391e441d43a0 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -303,7 +303,7 @@ xfs_ialloc_inode_init(
* That means for v3 inode we log the entire buffer rather than just the
* inode cores.
*/
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
version = 3;
ino = XFS_AGINO_TO_INO(mp, agno, XFS_AGB_TO_AGINO(mp, agbno));
@@ -339,7 +339,7 @@ xfs_ialloc_inode_init(
xfs_buf_zero(fbuf, 0, BBTOB(fbuf->b_length));
for (i = 0; i < M_IGEO(mp)->inodes_per_cluster; i++) {
int ioffset = i << mp->m_sb.sb_inodelog;
- uint isize = xfs_dinode_size(version);
+ uint isize = XFS_DINODE_SIZE(&mp->m_sb);
free = xfs_make_iptr(mp, fbuf, i);
free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
@@ -2818,7 +2818,7 @@ xfs_ialloc_setup_geometry(
* cannot change the behavior.
*/
igeo->inode_cluster_size_raw = XFS_INODE_BIG_CLUSTER_SIZE;
- if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
int new_size = igeo->inode_cluster_size_raw;
new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
@@ -2854,3 +2854,67 @@ xfs_ialloc_setup_geometry(
else
igeo->ialloc_align = 0;
}
+
+/* Compute the location of the root directory inode that is laid out by mkfs. */
+xfs_ino_t
+xfs_ialloc_calc_rootino(
+ struct xfs_mount *mp,
+ int sunit)
+{
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agblock_t first_bno;
+
+ /*
+ * Pre-calculate the geometry of AG 0. We know what it looks like
+ * because libxfs knows how to create allocation groups now.
+ *
+ * first_bno is the first block in which mkfs could possibly have
+ * allocated the root directory inode, once we factor in the metadata
+ * that mkfs formats before it. Namely, the four AG headers...
+ */
+ first_bno = howmany(4 * mp->m_sb.sb_sectsize, mp->m_sb.sb_blocksize);
+
+ /* ...the two free space btree roots... */
+ first_bno += 2;
+
+ /* ...the inode btree root... */
+ first_bno += 1;
+
+ /* ...the initial AGFL... */
+ first_bno += xfs_alloc_min_freelist(mp, NULL);
+
+ /* ...the free inode btree root... */
+ if (xfs_sb_version_hasfinobt(&mp->m_sb))
+ first_bno++;
+
+ /* ...the reverse mapping btree root... */
+ if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+ first_bno++;
+
+ /* ...the reference count btree... */
+ if (xfs_sb_version_hasreflink(&mp->m_sb))
+ first_bno++;
+
+ /*
+ * ...and the log, if it is allocated in the first allocation group.
+ *
+ * This can happen with filesystems that only have a single
+ * allocation group, or very odd geometries created by old mkfs
+ * versions on very small filesystems.
+ */
+ if (mp->m_sb.sb_logstart &&
+ XFS_FSB_TO_AGNO(mp, mp->m_sb.sb_logstart) == 0)
+ first_bno += mp->m_sb.sb_logblocks;
+
+ /*
+ * Now round first_bno up to whatever allocation alignment is given
+ * by the filesystem or was passed in.
+ */
+ if (xfs_sb_version_hasdalign(&mp->m_sb) && igeo->ialloc_align > 0)
+ first_bno = roundup(first_bno, sunit);
+ else if (xfs_sb_version_hasalign(&mp->m_sb) &&
+ mp->m_sb.sb_inoalignmt > 1)
+ first_bno = roundup(first_bno, mp->m_sb.sb_inoalignmt);
+
+ return XFS_AGINO_TO_INO(mp, 0, XFS_AGB_TO_AGINO(mp, first_bno));
+}
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index 323592d563d5..72b3468b97b1 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -152,5 +152,6 @@ int xfs_inobt_insert_rec(struct xfs_btree_cur *cur, uint16_t holemask,
int xfs_ialloc_cluster_alignment(struct xfs_mount *mp);
void xfs_ialloc_setup_geometry(struct xfs_mount *mp);
+xfs_ino_t xfs_ialloc_calc_rootino(struct xfs_mount *mp, int sunit);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 28ab3c5255e1..962e95dcdbff 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -44,17 +44,6 @@ xfs_inobp_check(
}
#endif
-bool
-xfs_dinode_good_version(
- struct xfs_mount *mp,
- __u8 version)
-{
- if (xfs_sb_version_hascrc(&mp->m_sb))
- return version == 3;
-
- return version == 1 || version == 2;
-}
-
/*
* If we are doing readahead on an inode buffer, we might be in log recovery
* reading an inode allocation buffer that hasn't yet been replayed, and hence
@@ -93,7 +82,7 @@ xfs_inode_buf_verify(
dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog));
unlinked_ino = be32_to_cpu(dip->di_next_unlinked);
di_ok = xfs_verify_magic16(bp, dip->di_magic) &&
- xfs_dinode_good_version(mp, dip->di_version) &&
+ xfs_dinode_good_version(&mp->m_sb, dip->di_version) &&
xfs_verify_agino_or_null(mp, agno, unlinked_ino);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP))) {
@@ -205,26 +194,23 @@ xfs_inode_from_disk(
struct xfs_icdinode *to = &ip->i_d;
struct inode *inode = VFS_I(ip);
-
/*
* Convert v1 inodes immediately to v2 inode format as this is the
* minimum inode version format we support in the rest of the code.
+ * They will also be unconditionally written back to disk as v2 inodes.
*/
- to->di_version = from->di_version;
- if (to->di_version == 1) {
+ if (unlikely(from->di_version == 1)) {
set_nlink(inode, be16_to_cpu(from->di_onlink));
- to->di_projid_lo = 0;
- to->di_projid_hi = 0;
- to->di_version = 2;
+ to->di_projid = 0;
} else {
set_nlink(inode, be32_to_cpu(from->di_nlink));
- to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
- to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
+ to->di_projid = (prid_t)be16_to_cpu(from->di_projid_hi) << 16 |
+ be16_to_cpu(from->di_projid_lo);
}
to->di_format = from->di_format;
- to->di_uid = be32_to_cpu(from->di_uid);
- to->di_gid = be32_to_cpu(from->di_gid);
+ i_uid_write(inode, be32_to_cpu(from->di_uid));
+ i_gid_write(inode, be32_to_cpu(from->di_gid));
to->di_flushiter = be16_to_cpu(from->di_flushiter);
/*
@@ -253,7 +239,7 @@ xfs_inode_from_disk(
to->di_dmstate = be16_to_cpu(from->di_dmstate);
to->di_flags = be16_to_cpu(from->di_flags);
- if (to->di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
inode_set_iversion_queried(inode,
be64_to_cpu(from->di_changecount));
to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
@@ -275,12 +261,11 @@ xfs_inode_to_disk(
to->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
to->di_onlink = 0;
- to->di_version = from->di_version;
to->di_format = from->di_format;
- to->di_uid = cpu_to_be32(from->di_uid);
- to->di_gid = cpu_to_be32(from->di_gid);
- to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
- to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
+ to->di_uid = cpu_to_be32(i_uid_read(inode));
+ to->di_gid = cpu_to_be32(i_gid_read(inode));
+ to->di_projid_lo = cpu_to_be16(from->di_projid & 0xffff);
+ to->di_projid_hi = cpu_to_be16(from->di_projid >> 16);
memset(to->di_pad, 0, sizeof(to->di_pad));
to->di_atime.t_sec = cpu_to_be32(inode->i_atime.tv_sec);
@@ -304,7 +289,8 @@ xfs_inode_to_disk(
to->di_dmstate = cpu_to_be16(from->di_dmstate);
to->di_flags = cpu_to_be16(from->di_flags);
- if (from->di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+ to->di_version = 3;
to->di_changecount = cpu_to_be64(inode_peek_iversion(inode));
to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
@@ -316,6 +302,7 @@ xfs_inode_to_disk(
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_flushiter = 0;
} else {
+ to->di_version = 2;
to->di_flushiter = cpu_to_be16(from->di_flushiter);
}
}
@@ -429,7 +416,7 @@ xfs_dinode_verify_forkoff(
case XFS_DINODE_FMT_LOCAL: /* fall through ... */
case XFS_DINODE_FMT_EXTENTS: /* fall through ... */
case XFS_DINODE_FMT_BTREE:
- if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3))
+ if (dip->di_forkoff >= (XFS_LITINO(mp) >> 3))
return __this_address;
break;
default:
@@ -455,7 +442,7 @@ xfs_dinode_verify(
/* Verify v3 integrity information first */
if (dip->di_version >= 3) {
- if (!xfs_sb_version_hascrc(&mp->m_sb))
+ if (!xfs_sb_version_has_v3inode(&mp->m_sb))
return __this_address;
if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
XFS_DINODE_CRC_OFF))
@@ -630,12 +617,11 @@ xfs_iread(
/* shortcut IO on inode allocation if possible */
if ((iget_flags & XFS_IGET_CREATE) &&
- xfs_sb_version_hascrc(&mp->m_sb) &&
+ xfs_sb_version_has_v3inode(&mp->m_sb) &&
!(mp->m_flags & XFS_MOUNT_IKEEP)) {
/* initialise the on-disk inode core */
memset(&ip->i_d, 0, sizeof(ip->i_d));
VFS_I(ip)->i_generation = prandom_u32();
- ip->i_d.di_version = 3;
return 0;
}
@@ -677,7 +663,6 @@ xfs_iread(
* Partial initialisation of the in-core inode. Just the bits
* that xfs_ialloc won't overwrite or relies on being correct.
*/
- ip->i_d.di_version = dip->di_version;
VFS_I(ip)->i_generation = be32_to_cpu(dip->di_gen);
ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
@@ -691,7 +676,6 @@ xfs_iread(
VFS_I(ip)->i_mode = 0;
}
- ASSERT(ip->i_d.di_version >= 2);
ip->i_delayed_blks = 0;
/*
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index ab0f84165317..80b574579a21 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -16,13 +16,9 @@ struct xfs_dinode;
* format specific structures at the appropriate time.
*/
struct xfs_icdinode {
- int8_t di_version; /* inode version */
int8_t di_format; /* format of di_c data */
uint16_t di_flushiter; /* incremented on flush */
- uint32_t di_uid; /* owner's user id */
- uint32_t di_gid; /* owner's group id */
- uint16_t di_projid_lo; /* lower part of owner's project id */
- uint16_t di_projid_hi; /* higher part of owner's project id */
+ uint32_t di_projid; /* owner's project id */
xfs_fsize_t di_size; /* number of bytes in file */
xfs_rfsblock_t di_nblocks; /* # of direct & btree blocks used */
xfs_extlen_t di_extsize; /* basic/minimum extent size for file */
@@ -62,8 +58,6 @@ void xfs_inode_from_disk(struct xfs_inode *ip, struct xfs_dinode *from);
void xfs_log_dinode_to_disk(struct xfs_log_dinode *from,
struct xfs_dinode *to);
-bool xfs_dinode_good_version(struct xfs_mount *mp, __u8 version);
-
#if defined(DEBUG)
void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
#else
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 8fdd0424070e..e758d74b2b62 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -75,11 +75,15 @@ xfs_iformat_fork(
error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
break;
default:
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
+ dip, sizeof(*dip), __this_address);
return -EFSCORRUPTED;
}
break;
default:
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), __this_address);
return -EFSCORRUPTED;
}
if (error)
@@ -110,6 +114,8 @@ xfs_iformat_fork(
error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
break;
default:
+ xfs_inode_verifier_error(ip, error, __func__, dip,
+ sizeof(*dip), __this_address);
error = -EFSCORRUPTED;
break;
}
@@ -177,7 +183,7 @@ xfs_iformat_local(
*/
if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
xfs_warn(ip->i_mount,
- "corrupt inode %Lu (bad size %d for local fork, size = %d).",
+ "corrupt inode %Lu (bad size %d for local fork, size = %zd).",
(unsigned long long) ip->i_ino, size,
XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
xfs_inode_verifier_error(ip, -EFSCORRUPTED,
@@ -586,7 +592,7 @@ void
xfs_iflush_fork(
xfs_inode_t *ip,
xfs_dinode_t *dip,
- xfs_inode_log_item_t *iip,
+ struct xfs_inode_log_item *iip,
int whichfork)
{
char *cp;
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 7b845c052fb4..a84a1557d11c 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -46,14 +46,9 @@ struct xfs_ifork {
(ip)->i_afp : \
(ip)->i_cowfp))
#define XFS_IFORK_DSIZE(ip) \
- (XFS_IFORK_Q(ip) ? \
- XFS_IFORK_BOFF(ip) : \
- XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version))
+ (XFS_IFORK_Q(ip) ? XFS_IFORK_BOFF(ip) : XFS_LITINO((ip)->i_mount))
#define XFS_IFORK_ASIZE(ip) \
- (XFS_IFORK_Q(ip) ? \
- XFS_LITINO((ip)->i_mount, (ip)->i_d.di_version) - \
- XFS_IFORK_BOFF(ip) : \
- 0)
+ (XFS_IFORK_Q(ip) ? XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : 0)
#define XFS_IFORK_SIZE(ip,w) \
((w) == XFS_DATA_FORK ? \
XFS_IFORK_DSIZE(ip) : \
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index e5f97c69b320..d3b255f42789 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -424,12 +424,10 @@ struct xfs_log_dinode {
/* structure must be padded to 64 bit alignment */
};
-static inline uint xfs_log_dinode_size(int version)
-{
- if (version == 3)
- return sizeof(struct xfs_log_dinode);
- return offsetof(struct xfs_log_dinode, di_next_unlinked);
-}
+#define xfs_log_dinode_size(mp) \
+ (xfs_sb_version_has_v3inode(&(mp)->m_sb) ? \
+ sizeof(struct xfs_log_dinode) : \
+ offsetof(struct xfs_log_dinode, di_next_unlinked))
/*
* Buffer Log Format defintions
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 9a7fadb1361c..78236bd6c64f 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -1591,8 +1591,10 @@ xfs_refcount_recover_extent(
struct list_head *debris = priv;
struct xfs_refcount_recovery *rr;
- if (be32_to_cpu(rec->refc.rc_refcount) != 1)
+ if (be32_to_cpu(rec->refc.rc_refcount) != 1) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, cur->bc_mp);
return -EFSCORRUPTED;
+ }
rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), 0);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index 42085e70c01a..cf99e4cab627 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -15,7 +15,7 @@
#include "xfs_bmap.h"
#include "xfs_trans.h"
#include "xfs_rtalloc.h"
-
+#include "xfs_error.h"
/*
* Realtime allocator bitmap functions shared with userspace.
@@ -70,8 +70,10 @@ xfs_rtbuf_get(
if (error)
return error;
- if (nmap == 0 || !xfs_bmap_is_real_extent(&map))
+ if (nmap == 0 || !xfs_bmap_is_written_extent(&map)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
+ }
ASSERT(map.br_startblock != NULLFSBLOCK);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index c45acbd3add9..708feb8eac76 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -65,6 +65,7 @@ void xfs_log_get_max_trans_res(struct xfs_mount *mp,
#define XFS_TRANS_DQ_DIRTY 0x10 /* at least one dquot in trx dirty */
#define XFS_TRANS_RESERVE 0x20 /* OK to use reserved data blocks */
#define XFS_TRANS_NO_WRITECOUNT 0x40 /* do not elevate SB writecount */
+#define XFS_TRANS_RES_FDBLKS 0x80 /* reserve newly freed blocks */
/*
* LOWMODE is used by the allocator to activate the lowspace algorithm - when
* free space is running low the extent allocator may choose to allocate an
diff --git a/fs/xfs/libxfs/xfs_trans_inode.c b/fs/xfs/libxfs/xfs_trans_inode.c
index 0ba7368b9a5f..1d0e78e0099d 100644
--- a/fs/xfs/libxfs/xfs_trans_inode.c
+++ b/fs/xfs/libxfs/xfs_trans_inode.c
@@ -27,7 +27,7 @@ xfs_trans_ijoin(
struct xfs_inode *ip,
uint lock_flags)
{
- xfs_inode_log_item_t *iip;
+ struct xfs_inode_log_item *iip;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (ip->i_itemp == NULL)
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index b3584cd2cc16..8ece346def97 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -187,7 +187,7 @@ xfs_calc_inode_chunk_res(
XFS_FSB_TO_B(mp, 1));
if (alloc) {
/* icreate tx uses ordered buffers */
- if (xfs_sb_version_hascrc(&mp->m_sb))
+ if (xfs_sb_version_has_v3inode(&mp->m_sb))
return res;
size = XFS_FSB_TO_B(mp, 1);
}
@@ -776,7 +776,7 @@ xfs_calc_clear_agi_bucket_reservation(
/*
* Adjusting quota limits.
- * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
+ * the disk quota buffer: sizeof(struct xfs_disk_dquot)
*/
STATIC uint
xfs_calc_qm_setqlim_reservation(void)
@@ -800,7 +800,7 @@ xfs_calc_qm_dqalloc_reservation(
/*
* Turning off quotas.
- * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2
* the superblock for the quota flags: sector size
*/
STATIC uint
@@ -813,7 +813,7 @@ xfs_calc_qm_quotaoff_reservation(
/*
* End of turning off quotas.
- * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2
+ * the quota off logitems: sizeof(struct xfs_qoff_logitem) * 2
*/
STATIC uint
xfs_calc_qm_quotaoff_end_reservation(void)
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 96d7071cfa46..6788b0ca85eb 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -12,6 +12,7 @@
#include "xfs_inode.h"
#include "xfs_attr.h"
#include "xfs_trace.h"
+#include "xfs_error.h"
#include <linux/posix_acl_xattr.h>
@@ -23,6 +24,7 @@
STATIC struct posix_acl *
xfs_acl_from_disk(
+ struct xfs_mount *mp,
const struct xfs_acl *aclp,
int len,
int max_entries)
@@ -32,11 +34,18 @@ xfs_acl_from_disk(
const struct xfs_acl_entry *ace;
unsigned int count, i;
- if (len < sizeof(*aclp))
+ if (len < sizeof(*aclp)) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp,
+ len);
return ERR_PTR(-EFSCORRUPTED);
+ }
+
count = be32_to_cpu(aclp->acl_cnt);
- if (count > max_entries || XFS_ACL_SIZE(count) != len)
+ if (count > max_entries || XFS_ACL_SIZE(count) != len) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, aclp,
+ len);
return ERR_PTR(-EFSCORRUPTED);
+ }
acl = posix_acl_alloc(count, GFP_KERNEL);
if (!acl)
@@ -57,10 +66,12 @@ xfs_acl_from_disk(
switch (acl_e->e_tag) {
case ACL_USER:
- acl_e->e_uid = xfs_uid_to_kuid(be32_to_cpu(ace->ae_id));
+ acl_e->e_uid = make_kuid(&init_user_ns,
+ be32_to_cpu(ace->ae_id));
break;
case ACL_GROUP:
- acl_e->e_gid = xfs_gid_to_kgid(be32_to_cpu(ace->ae_id));
+ acl_e->e_gid = make_kgid(&init_user_ns,
+ be32_to_cpu(ace->ae_id));
break;
case ACL_USER_OBJ:
case ACL_GROUP_OBJ:
@@ -93,10 +104,12 @@ xfs_acl_to_disk(struct xfs_acl *aclp, const struct posix_acl *acl)
ace->ae_tag = cpu_to_be32(acl_e->e_tag);
switch (acl_e->e_tag) {
case ACL_USER:
- ace->ae_id = cpu_to_be32(xfs_kuid_to_uid(acl_e->e_uid));
+ ace->ae_id = cpu_to_be32(
+ from_kuid(&init_user_ns, acl_e->e_uid));
break;
case ACL_GROUP:
- ace->ae_id = cpu_to_be32(xfs_kgid_to_gid(acl_e->e_gid));
+ ace->ae_id = cpu_to_be32(
+ from_kgid(&init_user_ns, acl_e->e_gid));
break;
default:
ace->ae_id = cpu_to_be32(ACL_UNDEFINED_ID);
@@ -145,7 +158,7 @@ xfs_get_acl(struct inode *inode, int type)
if (error != -ENOATTR)
acl = ERR_PTR(error);
} else {
- acl = xfs_acl_from_disk(xfs_acl, len,
+ acl = xfs_acl_from_disk(ip->i_mount, xfs_acl, len,
XFS_ACL_MAX_ENTRIES(ip->i_mount));
kmem_free(xfs_acl);
}
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index f16d5f196c6b..5d9f8e4c4cde 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -495,7 +495,7 @@ xfs_map_blocks(
ssize_t count = i_blocksize(inode);
xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
- xfs_fileoff_t cow_fsb = NULLFILEOFF;
+ xfs_fileoff_t cow_fsb;
struct xfs_bmbt_irec imap;
struct xfs_iext_cursor icur;
int retries = 0;
@@ -529,6 +529,8 @@ xfs_map_blocks(
* landed in a hole and we skip the block.
*/
retry:
+ cow_fsb = NULLFILEOFF;
+ wpc->fork = XFS_DATA_FORK;
xfs_ilock(ip, XFS_ILOCK_SHARED);
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
(ip->i_df.if_flags & XFS_IFEXTENTS));
diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c
index a640a285cc52..f052de128fa1 100644
--- a/fs/xfs/xfs_attr_inactive.c
+++ b/fs/xfs/xfs_attr_inactive.c
@@ -22,24 +22,21 @@
#include "xfs_attr_leaf.h"
#include "xfs_quota.h"
#include "xfs_dir2.h"
+#include "xfs_error.h"
/*
- * Look at all the extents for this logical region,
- * invalidate any buffers that are incore/in transactions.
+ * Invalidate any incore buffers associated with this remote attribute value
+ * extent. We never log remote attribute value buffers, which means that they
+ * won't be attached to a transaction and are therefore safe to mark stale.
+ * The actual bunmapi will be taken care of later.
*/
STATIC int
-xfs_attr3_leaf_freextent(
- struct xfs_trans **trans,
+xfs_attr3_rmt_stale(
struct xfs_inode *dp,
xfs_dablk_t blkno,
int blkcnt)
{
struct xfs_bmbt_irec map;
- struct xfs_buf *bp;
- xfs_dablk_t tblkno;
- xfs_daddr_t dblkno;
- int tblkcnt;
- int dblkcnt;
int nmap;
int error;
@@ -47,47 +44,28 @@ xfs_attr3_leaf_freextent(
* Roll through the "value", invalidating the attribute value's
* blocks.
*/
- tblkno = blkno;
- tblkcnt = blkcnt;
- while (tblkcnt > 0) {
+ while (blkcnt > 0) {
/*
* Try to remember where we decided to put the value.
*/
nmap = 1;
- error = xfs_bmapi_read(dp, (xfs_fileoff_t)tblkno, tblkcnt,
+ error = xfs_bmapi_read(dp, (xfs_fileoff_t)blkno, blkcnt,
&map, &nmap, XFS_BMAPI_ATTRFORK);
- if (error) {
+ if (error)
return error;
- }
ASSERT(nmap == 1);
- ASSERT(map.br_startblock != DELAYSTARTBLOCK);
/*
- * If it's a hole, these are already unmapped
- * so there's nothing to invalidate.
+ * Mark any incore buffers for the remote value as stale. We
+ * never log remote attr value buffers, so the buffer should be
+ * easy to kill.
*/
- if (map.br_startblock != HOLESTARTBLOCK) {
-
- dblkno = XFS_FSB_TO_DADDR(dp->i_mount,
- map.br_startblock);
- dblkcnt = XFS_FSB_TO_BB(dp->i_mount,
- map.br_blockcount);
- bp = xfs_trans_get_buf(*trans,
- dp->i_mount->m_ddev_targp,
- dblkno, dblkcnt, 0);
- if (!bp)
- return -ENOMEM;
- xfs_trans_binval(*trans, bp);
- /*
- * Roll to next transaction.
- */
- error = xfs_trans_roll_inode(trans, dp);
- if (error)
- return error;
- }
+ error = xfs_attr_rmtval_stale(dp, &map, 0);
+ if (error)
+ return error;
- tblkno += map.br_blockcount;
- tblkcnt -= map.br_blockcount;
+ blkno += map.br_blockcount;
+ blkcnt -= map.br_blockcount;
}
return 0;
@@ -101,86 +79,45 @@ xfs_attr3_leaf_freextent(
*/
STATIC int
xfs_attr3_leaf_inactive(
- struct xfs_trans **trans,
- struct xfs_inode *dp,
- struct xfs_buf *bp)
+ struct xfs_trans **trans,
+ struct xfs_inode *dp,
+ struct xfs_buf *bp)
{
- struct xfs_attr_leafblock *leaf;
- struct xfs_attr3_icleaf_hdr ichdr;
- struct xfs_attr_leaf_entry *entry;
+ struct xfs_attr3_icleaf_hdr ichdr;
+ struct xfs_mount *mp = bp->b_mount;
+ struct xfs_attr_leafblock *leaf = bp->b_addr;
+ struct xfs_attr_leaf_entry *entry;
struct xfs_attr_leaf_name_remote *name_rmt;
- struct xfs_attr_inactive_list *list;
- struct xfs_attr_inactive_list *lp;
- int error;
- int count;
- int size;
- int tmp;
- int i;
- struct xfs_mount *mp = bp->b_mount;
+ int error = 0;
+ int i;
- leaf = bp->b_addr;
xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf);
/*
- * Count the number of "remote" value extents.
+ * Find the remote value extents for this leaf and invalidate their
+ * incore buffers.
*/
- count = 0;
entry = xfs_attr3_leaf_entryp(leaf);
for (i = 0; i < ichdr.count; entry++, i++) {
- if (be16_to_cpu(entry->nameidx) &&
- ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
- name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
- if (name_rmt->valueblk)
- count++;
- }
- }
-
- /*
- * If there are no "remote" values, we're done.
- */
- if (count == 0) {
- xfs_trans_brelse(*trans, bp);
- return 0;
- }
+ int blkcnt;
- /*
- * Allocate storage for a list of all the "remote" value extents.
- */
- size = count * sizeof(xfs_attr_inactive_list_t);
- list = kmem_alloc(size, 0);
-
- /*
- * Identify each of the "remote" value extents.
- */
- lp = list;
- entry = xfs_attr3_leaf_entryp(leaf);
- for (i = 0; i < ichdr.count; entry++, i++) {
- if (be16_to_cpu(entry->nameidx) &&
- ((entry->flags & XFS_ATTR_LOCAL) == 0)) {
- name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
- if (name_rmt->valueblk) {
- lp->valueblk = be32_to_cpu(name_rmt->valueblk);
- lp->valuelen = xfs_attr3_rmt_blocks(dp->i_mount,
- be32_to_cpu(name_rmt->valuelen));
- lp++;
- }
- }
- }
- xfs_trans_brelse(*trans, bp); /* unlock for trans. in freextent() */
+ if (!entry->nameidx || (entry->flags & XFS_ATTR_LOCAL))
+ continue;
- /*
- * Invalidate each of the "remote" value extents.
- */
- error = 0;
- for (lp = list, i = 0; i < count; i++, lp++) {
- tmp = xfs_attr3_leaf_freextent(trans, dp,
- lp->valueblk, lp->valuelen);
+ name_rmt = xfs_attr3_leaf_name_remote(leaf, i);
+ if (!name_rmt->valueblk)
+ continue;
- if (error == 0)
- error = tmp; /* save only the 1st errno */
+ blkcnt = xfs_attr3_rmt_blocks(dp->i_mount,
+ be32_to_cpu(name_rmt->valuelen));
+ error = xfs_attr3_rmt_stale(dp,
+ be32_to_cpu(name_rmt->valueblk), blkcnt);
+ if (error)
+ goto err;
}
- kmem_free(list);
+ xfs_trans_brelse(*trans, bp);
+err:
return error;
}
@@ -208,8 +145,9 @@ xfs_attr3_node_inactive(
* Since this code is recursive (gasp!) we must protect ourselves.
*/
if (level > XFS_DA_NODE_MAXDEPTH) {
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp); /* no locks for later trans */
- return -EIO;
+ return -EFSCORRUPTED;
}
node = bp->b_addr;
@@ -258,8 +196,9 @@ xfs_attr3_node_inactive(
error = xfs_attr3_leaf_inactive(trans, dp, child_bp);
break;
default:
- error = -EIO;
+ xfs_buf_mark_corrupt(child_bp);
xfs_trans_brelse(*trans, child_bp);
+ error = -EFSCORRUPTED;
break;
}
if (error)
@@ -341,7 +280,8 @@ xfs_attr3_root_inactive(
error = xfs_attr3_leaf_inactive(trans, dp, bp);
break;
default:
- error = -EIO;
+ error = -EFSCORRUPTED;
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(*trans, bp);
break;
}
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 00758fdc2fec..8c0972834449 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -258,8 +258,10 @@ xfs_attr_node_list_lookup(
return 0;
/* We can't point back to the root. */
- if (cursor->blkno == 0)
+ if (cursor->blkno == 0) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return -EFSCORRUPTED;
+ }
}
if (expected_level != 0)
@@ -269,6 +271,7 @@ xfs_attr_node_list_lookup(
return 0;
out_corruptbuf:
+ xfs_buf_mark_corrupt(bp);
xfs_trans_brelse(tp, bp);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index 83d24e983d4c..7b0c4d9679d9 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -21,7 +21,8 @@
#include "xfs_icache.h"
#include "xfs_bmap_btree.h"
#include "xfs_trans_space.h"
-
+#include "xfs_error.h"
+#include "xfs_quota.h"
kmem_zone_t *xfs_bui_zone;
kmem_zone_t *xfs_bud_zone;
@@ -124,34 +125,6 @@ xfs_bui_item_release(
xfs_bui_release(BUI_ITEM(lip));
}
-static const struct xfs_item_ops xfs_bui_item_ops = {
- .iop_size = xfs_bui_item_size,
- .iop_format = xfs_bui_item_format,
- .iop_unpin = xfs_bui_item_unpin,
- .iop_release = xfs_bui_item_release,
-};
-
-/*
- * Allocate and initialize an bui item with the given number of extents.
- */
-struct xfs_bui_log_item *
-xfs_bui_init(
- struct xfs_mount *mp)
-
-{
- struct xfs_bui_log_item *buip;
-
- buip = kmem_zone_zalloc(xfs_bui_zone, 0);
-
- xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
- buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
- buip->bui_format.bui_id = (uintptr_t)(void *)buip;
- atomic_set(&buip->bui_next_extent, 0);
- atomic_set(&buip->bui_refcount, 2);
-
- return buip;
-}
-
static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_bud_log_item, bud_item);
@@ -278,27 +251,6 @@ xfs_bmap_update_diff_items(
return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
}
-/* Get an BUI. */
-STATIC void *
-xfs_bmap_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_bui_log_item *buip;
-
- ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
- ASSERT(tp != NULL);
-
- buip = xfs_bui_init(tp->t_mountp);
- ASSERT(buip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &buip->bui_item);
- return buip;
-}
-
/* Set the map extent flags for this mapping. */
static void
xfs_trans_set_bmap_flags(
@@ -326,16 +278,12 @@ xfs_trans_set_bmap_flags(
STATIC void
xfs_bmap_update_log_item(
struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
+ struct xfs_bui_log_item *buip,
+ struct xfs_bmap_intent *bmap)
{
- struct xfs_bui_log_item *buip = intent;
- struct xfs_bmap_intent *bmap;
uint next_extent;
struct xfs_map_extent *map;
- bmap = container_of(item, struct xfs_bmap_intent, bi_list);
-
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
@@ -355,14 +303,35 @@ xfs_bmap_update_log_item(
bmap->bi_bmap.br_state);
}
+static struct xfs_log_item *
+xfs_bmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_bui_log_item *buip = xfs_bui_init(mp);
+ struct xfs_bmap_intent *bmap;
+
+ ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+
+ xfs_trans_add_item(tp, &buip->bui_item);
+ if (sort)
+ list_sort(mp, items, xfs_bmap_update_diff_items);
+ list_for_each_entry(bmap, items, bi_list)
+ xfs_bmap_update_log_item(tp, buip, bmap);
+ return &buip->bui_item;
+}
+
/* Get an BUD so we can process all the deferred rmap updates. */
STATIC void *
xfs_bmap_update_create_done(
struct xfs_trans *tp,
- void *intent,
+ struct xfs_log_item *intent,
unsigned int count)
{
- return xfs_trans_get_bud(tp, intent);
+ return xfs_trans_get_bud(tp, BUI_ITEM(intent));
}
/* Process a deferred rmap update. */
@@ -398,9 +367,9 @@ xfs_bmap_update_finish_item(
/* Abort all pending BUIs. */
STATIC void
xfs_bmap_update_abort_intent(
- void *intent)
+ struct xfs_log_item *intent)
{
- xfs_bui_release(intent);
+ xfs_bui_release(BUI_ITEM(intent));
}
/* Cancel a deferred rmap update. */
@@ -416,10 +385,8 @@ xfs_bmap_update_cancel_item(
const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
.max_items = XFS_BUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_bmap_update_diff_items,
.create_intent = xfs_bmap_update_create_intent,
.abort_intent = xfs_bmap_update_abort_intent,
- .log_item = xfs_bmap_update_log_item,
.create_done = xfs_bmap_update_create_done,
.finish_item = xfs_bmap_update_finish_item,
.cancel_item = xfs_bmap_update_cancel_item,
@@ -431,8 +398,8 @@ const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
*/
int
xfs_bui_recover(
- struct xfs_trans *parent_tp,
- struct xfs_bui_log_item *buip)
+ struct xfs_bui_log_item *buip,
+ struct list_head *capture_list)
{
int error = 0;
unsigned int bui_type;
@@ -440,15 +407,13 @@ xfs_bui_recover(
xfs_fsblock_t startblock_fsb;
xfs_fsblock_t inode_fsb;
xfs_filblks_t count;
- bool op_ok;
struct xfs_bud_log_item *budp;
- enum xfs_bmap_intent_type type;
int whichfork;
xfs_exntst_t state;
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
struct xfs_bmbt_irec irec;
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = buip->bui_item.li_mountp;
ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
@@ -456,7 +421,7 @@ xfs_bui_recover(
if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
xfs_bui_release(buip);
- return -EIO;
+ return -EFSCORRUPTED;
}
/*
@@ -468,16 +433,19 @@ xfs_bui_recover(
XFS_FSB_TO_DADDR(mp, bmap->me_startblock));
inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp,
XFS_INO_TO_FSB(mp, bmap->me_owner)));
- switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) {
+ state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
+ switch (bui_type) {
case XFS_BMAP_MAP:
case XFS_BMAP_UNMAP:
- op_ok = true;
break;
default:
- op_ok = false;
- break;
+ return -EFSCORRUPTED;
}
- if (!op_ok || startblock_fsb == 0 ||
+ if (startblock_fsb == 0 ||
bmap->me_len == 0 ||
inode_fsb == 0 ||
startblock_fsb >= mp->m_sb.sb_dblocks ||
@@ -490,54 +458,40 @@ xfs_bui_recover(
*/
set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
xfs_bui_release(buip);
- return -EIO;
+ return -EFSCORRUPTED;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
- XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
+ /* Grab the inode. */
+ error = xfs_iget(mp, NULL, bmap->me_owner, 0, 0, &ip);
if (error)
return error;
- /*
- * Recovery stashes all deferred ops during intent processing and
- * finishes them on completion. Transfer current dfops state to this
- * transaction and transfer the result back before we return.
- */
- xfs_defer_move(tp, parent_tp);
- budp = xfs_trans_get_bud(tp, buip);
- /* Grab the inode. */
- error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip);
+ error = xfs_qm_dqattach(ip);
if (error)
- goto err_inode;
+ goto err_rele;
if (VFS_I(ip)->i_nlink == 0)
xfs_iflags_set(ip, XFS_IRECOVERY);
- /* Process deferred bmap item. */
- state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
- XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
- whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
- bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
- switch (bui_type) {
- case XFS_BMAP_MAP:
- case XFS_BMAP_UNMAP:
- type = bui_type;
- break;
- default:
- error = -EFSCORRUPTED;
- goto err_inode;
- }
+ /* Allocate transaction and do the work. */
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate,
+ XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp);
+ if (error)
+ goto err_rele;
+
+ budp = xfs_trans_get_bud(tp, buip);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
count = bmap->me_len;
- error = xfs_trans_log_finish_bmap_update(tp, budp, type, ip, whichfork,
- bmap->me_startoff, bmap->me_startblock, &count, state);
+ error = xfs_trans_log_finish_bmap_update(tp, budp, bui_type, ip,
+ whichfork, bmap->me_startoff, bmap->me_startblock,
+ &count, state);
if (error)
- goto err_inode;
+ goto err_cancel;
if (count > 0) {
- ASSERT(type == XFS_BMAP_UNMAP);
+ ASSERT(bui_type == XFS_BMAP_UNMAP);
irec.br_startblock = bmap->me_startblock;
irec.br_blockcount = count;
irec.br_startoff = bmap->me_startoff;
@@ -546,19 +500,78 @@ xfs_bui_recover(
}
set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
- xfs_defer_move(parent_tp, tp);
- error = xfs_trans_commit(tp);
+ /*
+ * Commit transaction, which frees the transaction and saves the inode
+ * for later replay activities.
+ */
+ error = xfs_defer_ops_capture_and_commit(tp, ip, capture_list);
+ if (error)
+ goto err_unlock;
+
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
+ return 0;
- return error;
-
-err_inode:
- xfs_defer_move(parent_tp, tp);
+err_cancel:
xfs_trans_cancel(tp);
- if (ip) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_irele(ip);
- }
+err_unlock:
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+err_rele:
+ xfs_irele(ip);
return error;
}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_bui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_bud_log_item *budp;
+ struct xfs_bui_log_item *buip;
+ struct xfs_map_extent *extp;
+ unsigned int count;
+
+ count = BUI_ITEM(intent)->bui_format.bui_nextents;
+ extp = BUI_ITEM(intent)->bui_format.bui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ budp = xfs_trans_get_bud(tp, BUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
+
+ buip = xfs_bui_init(tp->t_mountp);
+ memcpy(buip->bui_format.bui_extents, extp, count * sizeof(*extp));
+ atomic_set(&buip->bui_next_extent, count);
+ xfs_trans_add_item(tp, &buip->bui_item);
+ set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
+ return &buip->bui_item;
+}
+
+static const struct xfs_item_ops xfs_bui_item_ops = {
+ .iop_size = xfs_bui_item_size,
+ .iop_format = xfs_bui_item_format,
+ .iop_unpin = xfs_bui_item_unpin,
+ .iop_release = xfs_bui_item_release,
+ .iop_relog = xfs_bui_item_relog,
+};
+
+/*
+ * Allocate and initialize an bui item with the given number of extents.
+ */
+struct xfs_bui_log_item *
+xfs_bui_init(
+ struct xfs_mount *mp)
+
+{
+ struct xfs_bui_log_item *buip;
+
+ buip = kmem_zone_zalloc(xfs_bui_zone, 0);
+
+ xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
+ buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
+ buip->bui_format.bui_id = (uintptr_t)(void *)buip;
+ atomic_set(&buip->bui_next_extent, 0);
+ atomic_set(&buip->bui_refcount, 2);
+
+ return buip;
+}
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
index ad479cc73de8..a95e99c26979 100644
--- a/fs/xfs/xfs_bmap_item.h
+++ b/fs/xfs/xfs_bmap_item.h
@@ -77,6 +77,7 @@ extern struct kmem_zone *xfs_bud_zone;
struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *);
void xfs_bui_item_free(struct xfs_bui_log_item *);
void xfs_bui_release(struct xfs_bui_log_item *);
-int xfs_bui_recover(struct xfs_trans *parent_tp, struct xfs_bui_log_item *buip);
+int xfs_bui_recover(struct xfs_bui_log_item *buip,
+ struct list_head *capture_list);
#endif /* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index d6d78e127625..32170ce2984c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -125,7 +125,7 @@ xfs_bmap_rtalloc(
* pick an extent that will space things out in the rt area.
*/
if (ap->eof && ap->offset == 0) {
- xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
+ xfs_rtblock_t rtx; /* realtime extent no */
error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
if (error)
@@ -1167,6 +1167,7 @@ xfs_prepare_shift(
struct xfs_inode *ip,
loff_t offset)
{
+ struct xfs_mount *mp = ip->i_mount;
int error;
/*
@@ -1180,6 +1181,17 @@ xfs_prepare_shift(
}
/*
+ * Shift operations must stabilize the start block offset boundary along
+ * with the full range of the operation. If we don't, a COW writeback
+ * completion could race with an insert, front merge with the start
+ * extent (after split) during the shift and corrupt the file. Start
+ * with the block just prior to the start to stabilize the boundary.
+ */
+ offset = round_down(offset, 1 << mp->m_sb.sb_blocklog);
+ if (offset)
+ offset -= (1 << mp->m_sb.sb_blocklog);
+
+ /*
* Writeback and invalidate cache for the remainder of the file as we're
* about to shift down every extent from offset to EOF.
*/
@@ -1225,7 +1237,6 @@ xfs_collapse_file_space(
int error;
xfs_fileoff_t next_fsb = XFS_B_TO_FSB(mp, offset + len);
xfs_fileoff_t shift_fsb = XFS_B_TO_FSB(mp, len);
- uint resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
bool done = false;
ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
@@ -1241,32 +1252,34 @@ xfs_collapse_file_space(
if (error)
return error;
- while (!error && !done) {
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0,
- &tp);
- if (error)
- break;
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+ if (error)
+ return error;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
- ip->i_gdquot, ip->i_pdquot, resblks, 0,
- XFS_QMOPT_RES_REGBLKS);
- if (error)
- goto out_trans_cancel;
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+ while (!done) {
error = xfs_bmap_collapse_extents(tp, ip, &next_fsb, shift_fsb,
&done);
if (error)
goto out_trans_cancel;
+ if (done)
+ break;
- error = xfs_trans_commit(tp);
+ /* finish any deferred frees and roll the transaction */
+ error = xfs_defer_finish(&tp);
+ if (error)
+ goto out_trans_cancel;
}
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1309,35 +1322,41 @@ xfs_insert_file_space(
if (error)
return error;
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
+ XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
+ if (error)
+ return error;
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+ xfs_trans_ijoin(tp, ip, 0);
+
/*
* The extent shifting code works on extent granularity. So, if stop_fsb
* is not the starting block of extent, we need to split the extent at
* stop_fsb.
*/
- error = xfs_bmap_split_extent(ip, stop_fsb);
+ error = xfs_bmap_split_extent(tp, ip, stop_fsb);
if (error)
- return error;
+ goto out_trans_cancel;
- while (!error && !done) {
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0,
- &tp);
+ do {
+ error = xfs_defer_finish(&tp);
if (error)
- break;
+ goto out_trans_cancel;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
error = xfs_bmap_insert_extents(tp, ip, &next_fsb, shift_fsb,
&done, stop_fsb);
if (error)
goto out_trans_cancel;
+ } while (!done);
- error = xfs_trans_commit(tp);
- }
-
+ error = xfs_trans_commit(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
out_trans_cancel:
xfs_trans_cancel(tp);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
return error;
}
@@ -1605,12 +1624,12 @@ xfs_swap_extent_forks(
* event of a crash. Set the owner change log flags now and leave the
* bmbt scan as the last step.
*/
- if (ip->i_d.di_version == 3 &&
- ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
- (*target_log_flags) |= XFS_ILOG_DOWNER;
- if (tip->i_d.di_version == 3 &&
- tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
- (*src_log_flags) |= XFS_ILOG_DOWNER;
+ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ (*target_log_flags) |= XFS_ILOG_DOWNER;
+ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
+ (*src_log_flags) |= XFS_ILOG_DOWNER;
+ }
/*
* Swap the data forks of the inodes
@@ -1645,7 +1664,7 @@ xfs_swap_extent_forks(
(*src_log_flags) |= XFS_ILOG_DEXT;
break;
case XFS_DINODE_FMT_BTREE:
- ASSERT(ip->i_d.di_version < 3 ||
+ ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
(*src_log_flags & XFS_ILOG_DOWNER));
(*src_log_flags) |= XFS_ILOG_DBROOT;
break;
@@ -1657,7 +1676,7 @@ xfs_swap_extent_forks(
break;
case XFS_DINODE_FMT_BTREE:
(*target_log_flags) |= XFS_ILOG_DBROOT;
- ASSERT(tip->i_d.di_version < 3 ||
+ ASSERT(!xfs_sb_version_has_v3inode(&ip->i_mount->m_sb) ||
(*target_log_flags & XFS_ILOG_DOWNER));
break;
}
@@ -1721,6 +1740,7 @@ xfs_swap_extents(
int lock_flags;
uint64_t f;
int resblks = 0;
+ unsigned int flags = 0;
/*
* Lock the inodes against other IO, page faults and truncate to
@@ -1776,17 +1796,16 @@ xfs_swap_extents(
resblks += XFS_SWAP_RMAP_SPACE_RES(mp, tipnext, w);
/*
- * Handle the corner case where either inode might straddle the
- * btree format boundary. If so, the inode could bounce between
- * btree <-> extent format on unmap -> remap cycles, freeing and
- * allocating a bmapbt block each time.
+ * If either inode straddles a bmapbt block allocation boundary,
+ * the rmapbt algorithm triggers repeated allocs and frees as
+ * extents are remapped. This can exhaust the block reservation
+ * prematurely and cause shutdown. Return freed blocks to the
+ * transaction reservation to counter this behavior.
*/
- if (ipnext == (XFS_IFORK_MAXEXT(ip, w) + 1))
- resblks += XFS_IFORK_MAXEXT(ip, w);
- if (tipnext == (XFS_IFORK_MAXEXT(tip, w) + 1))
- resblks += XFS_IFORK_MAXEXT(tip, w);
+ flags |= XFS_TRANS_RES_FDBLKS;
}
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, flags,
+ &tp);
if (error)
goto out_unlock;
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 1264ac63e4e5..4f18457aae2a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1547,6 +1547,28 @@ xfs_buf_zero(
}
/*
+ * Log a message about and stale a buffer that a caller has decided is corrupt.
+ *
+ * This function should be called for the kinds of metadata corruption that
+ * cannot be detect from a verifier, such as incorrect inter-block relationship
+ * data. Do /not/ call this function from a verifier function.
+ *
+ * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
+ * be marked stale, but b_error will not be set. The caller is responsible for
+ * releasing the buffer or fixing it.
+ */
+void
+__xfs_buf_mark_corrupt(
+ struct xfs_buf *bp,
+ xfs_failaddr_t fa)
+{
+ ASSERT(bp->b_flags & XBF_DONE);
+
+ xfs_buf_corruption_error(bp, fa);
+ xfs_buf_stale(bp);
+}
+
+/*
* Handling of buffer targets (buftargs).
*/
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index f6ce17d8d848..621467ab17c8 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -270,6 +270,8 @@ static inline int xfs_buf_submit(struct xfs_buf *bp)
}
void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize);
+void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa);
+#define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address)
/* Buffer Utility Routines */
extern void *xfs_buf_offset(struct xfs_buf *, size_t);
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index d74fbd1e9d3e..f98260ed6d51 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -328,7 +328,7 @@ xfs_buf_item_format(
* occurs during recovery.
*/
if (bip->bli_flags & XFS_BLI_INODE_BUF) {
- if (xfs_sb_version_hascrc(&lip->li_mountp->m_sb) ||
+ if (xfs_sb_version_has_v3inode(&lip->li_mountp->m_sb) ||
!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
xfs_log_item_in_current_chkpt(lip)))
bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF;
@@ -956,7 +956,7 @@ xfs_buf_item_relse(
struct xfs_buf_log_item *bip = bp->b_log_item;
trace_xfs_buf_item_relse(bp, _RET_IP_);
- ASSERT(!(bip->bli_item.li_flags & XFS_LI_IN_AIL));
+ ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
bp->b_log_item = NULL;
if (list_empty(&bp->b_li_list))
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 3cbf248af51f..672286f1762f 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -48,7 +48,7 @@ static struct lock_class_key xfs_dquot_project_class;
*/
void
xfs_qm_dqdestroy(
- xfs_dquot_t *dqp)
+ struct xfs_dquot *dqp)
{
ASSERT(list_empty(&dqp->q_lru));
@@ -113,8 +113,8 @@ xfs_qm_adjust_dqlimits(
*/
void
xfs_qm_adjust_dqtimers(
- xfs_mount_t *mp,
- xfs_disk_dquot_t *d)
+ struct xfs_mount *mp,
+ struct xfs_disk_dquot *d)
{
ASSERT(d->d_id);
@@ -205,16 +205,18 @@ xfs_qm_adjust_dqtimers(
*/
STATIC void
xfs_qm_init_dquot_blk(
- xfs_trans_t *tp,
- xfs_mount_t *mp,
- xfs_dqid_t id,
- uint type,
- xfs_buf_t *bp)
+ struct xfs_trans *tp,
+ struct xfs_mount *mp,
+ xfs_dqid_t id,
+ uint type,
+ struct xfs_buf *bp)
{
struct xfs_quotainfo *q = mp->m_quotainfo;
- xfs_dqblk_t *d;
- xfs_dqid_t curid;
- int i;
+ struct xfs_dqblk *d;
+ xfs_dqid_t curid;
+ unsigned int qflag;
+ unsigned int blftype;
+ int i;
ASSERT(tp);
ASSERT(xfs_buf_islocked(bp));
@@ -238,11 +240,39 @@ xfs_qm_init_dquot_blk(
}
}
- xfs_trans_dquot_buf(tp, bp,
- (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
- ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
- XFS_BLF_GDQUOT_BUF)));
- xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
+ if (type & XFS_DQ_USER) {
+ qflag = XFS_UQUOTA_CHKD;
+ blftype = XFS_BLF_UDQUOT_BUF;
+ } else if (type & XFS_DQ_PROJ) {
+ qflag = XFS_PQUOTA_CHKD;
+ blftype = XFS_BLF_PDQUOT_BUF;
+ } else {
+ qflag = XFS_GQUOTA_CHKD;
+ blftype = XFS_BLF_GDQUOT_BUF;
+ }
+
+ xfs_trans_dquot_buf(tp, bp, blftype);
+
+ /*
+ * quotacheck uses delayed writes to update all the dquots on disk in an
+ * efficient manner instead of logging the individual dquot changes as
+ * they are made. However if we log the buffer allocated here and crash
+ * after quotacheck while the logged initialisation is still in the
+ * active region of the log, log recovery can replay the dquot buffer
+ * initialisation over the top of the checked dquots and corrupt quota
+ * accounting.
+ *
+ * To avoid this problem, quotacheck cannot log the initialised buffer.
+ * We must still dirty the buffer and write it back before the
+ * allocation transaction clears the log. Therefore, mark the buffer as
+ * ordered instead of logging it directly. This is safe for quotacheck
+ * because it detects and repairs allocated but initialized dquot blocks
+ * in the quota inodes.
+ */
+ if (!(mp->m_qflags & qflag))
+ xfs_trans_ordered_buf(tp, bp);
+ else
+ xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
}
/*
@@ -497,7 +527,7 @@ xfs_dquot_from_disk(
struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset;
/* copy everything from disk dquot to the incore dquot */
- memcpy(&dqp->q_core, ddqp, sizeof(xfs_disk_dquot_t));
+ memcpy(&dqp->q_core, ddqp, sizeof(struct xfs_disk_dquot));
/*
* Reservation counters are defined as reservation plus current usage
@@ -829,11 +859,11 @@ xfs_qm_id_for_quotatype(
{
switch (type) {
case XFS_DQ_USER:
- return ip->i_d.di_uid;
+ return i_uid_read(VFS_I(ip));
case XFS_DQ_GROUP:
- return ip->i_d.di_gid;
+ return i_gid_read(VFS_I(ip));
case XFS_DQ_PROJ:
- return xfs_get_projid(ip);
+ return ip->i_d.di_projid;
}
ASSERT(0);
return 0;
@@ -989,7 +1019,7 @@ xfs_qm_dqput(
*/
void
xfs_qm_dqrele(
- xfs_dquot_t *dqp)
+ struct xfs_dquot *dqp)
{
if (!dqp)
return;
@@ -1018,8 +1048,8 @@ xfs_qm_dqflush_done(
struct xfs_buf *bp,
struct xfs_log_item *lip)
{
- xfs_dq_logitem_t *qip = (struct xfs_dq_logitem *)lip;
- xfs_dquot_t *dqp = qip->qli_dquot;
+ struct xfs_dq_logitem *qip = (struct xfs_dq_logitem *)lip;
+ struct xfs_dquot *dqp = qip->qli_dquot;
struct xfs_ail *ailp = lip->li_ailp;
/*
@@ -1105,8 +1135,8 @@ xfs_qm_dqflush(
* Get the buffer containing the on-disk dquot
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0, &bp,
- &xfs_dquot_buf_ops);
+ mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK,
+ &bp, &xfs_dquot_buf_ops);
if (error)
goto out_unlock;
@@ -1125,11 +1155,11 @@ xfs_qm_dqflush(
xfs_buf_relse(bp);
xfs_dqfunlock(dqp);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
- return -EIO;
+ return -EFSCORRUPTED;
}
/* This is the only portion of data that needs to persist */
- memcpy(ddqp, &dqp->q_core, sizeof(xfs_disk_dquot_t));
+ memcpy(ddqp, &dqp->q_core, sizeof(struct xfs_disk_dquot));
/*
* Clear the dirty field and remember the flush lsn for later use.
@@ -1176,7 +1206,7 @@ xfs_qm_dqflush(
out_unlock:
xfs_dqfunlock(dqp);
- return -EIO;
+ return error;
}
/*
@@ -1187,8 +1217,8 @@ out_unlock:
*/
void
xfs_dqlock2(
- xfs_dquot_t *d1,
- xfs_dquot_t *d2)
+ struct xfs_dquot *d1,
+ struct xfs_dquot *d2)
{
if (d1 && d2) {
ASSERT(d1 != d2);
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 4fe85709d55d..fe3e46df604b 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -30,33 +30,36 @@ enum {
/*
* The incore dquot structure
*/
-typedef struct xfs_dquot {
- uint dq_flags; /* various flags (XFS_DQ_*) */
- struct list_head q_lru; /* global free list of dquots */
- struct xfs_mount*q_mount; /* filesystem this relates to */
- uint q_nrefs; /* # active refs from inodes */
- xfs_daddr_t q_blkno; /* blkno of dquot buffer */
- int q_bufoffset; /* off of dq in buffer (# dquots) */
- xfs_fileoff_t q_fileoffset; /* offset in quotas file */
-
- xfs_disk_dquot_t q_core; /* actual usage & quotas */
- xfs_dq_logitem_t q_logitem; /* dquot log item */
- xfs_qcnt_t q_res_bcount; /* total regular nblks used+reserved */
- xfs_qcnt_t q_res_icount; /* total inos allocd+reserved */
- xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
- xfs_qcnt_t q_prealloc_lo_wmark;/* prealloc throttle wmark */
- xfs_qcnt_t q_prealloc_hi_wmark;/* prealloc disabled wmark */
- int64_t q_low_space[XFS_QLOWSP_MAX];
- struct mutex q_qlock; /* quota lock */
- struct completion q_flush; /* flush completion queue */
- atomic_t q_pincount; /* dquot pin count */
- wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
-} xfs_dquot_t;
+struct xfs_dquot {
+ uint dq_flags;
+ struct list_head q_lru;
+ struct xfs_mount *q_mount;
+ uint q_nrefs;
+ xfs_daddr_t q_blkno;
+ int q_bufoffset;
+ xfs_fileoff_t q_fileoffset;
+
+ struct xfs_disk_dquot q_core;
+ struct xfs_dq_logitem q_logitem;
+ /* total regular nblks used+reserved */
+ xfs_qcnt_t q_res_bcount;
+ /* total inos allocd+reserved */
+ xfs_qcnt_t q_res_icount;
+ /* total realtime blks used+reserved */
+ xfs_qcnt_t q_res_rtbcount;
+ xfs_qcnt_t q_prealloc_lo_wmark;
+ xfs_qcnt_t q_prealloc_hi_wmark;
+ int64_t q_low_space[XFS_QLOWSP_MAX];
+ struct mutex q_qlock;
+ struct completion q_flush;
+ atomic_t q_pincount;
+ struct wait_queue_head q_pinwait;
+};
/*
* Lock hierarchy for q_qlock:
* XFS_QLOCK_NORMAL is the implicit default,
- * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
+ * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
*/
enum {
XFS_QLOCK_NORMAL = 0,
@@ -64,21 +67,21 @@ enum {
};
/*
- * Manage the q_flush completion queue embedded in the dquot. This completion
+ * Manage the q_flush completion queue embedded in the dquot. This completion
* queue synchronizes processes attempting to flush the in-core dquot back to
* disk.
*/
-static inline void xfs_dqflock(xfs_dquot_t *dqp)
+static inline void xfs_dqflock(struct xfs_dquot *dqp)
{
wait_for_completion(&dqp->q_flush);
}
-static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp)
+static inline bool xfs_dqflock_nowait(struct xfs_dquot *dqp)
{
return try_wait_for_completion(&dqp->q_flush);
}
-static inline void xfs_dqfunlock(xfs_dquot_t *dqp)
+static inline void xfs_dqfunlock(struct xfs_dquot *dqp)
{
complete(&dqp->q_flush);
}
@@ -112,7 +115,7 @@ static inline int xfs_this_quota_on(struct xfs_mount *mp, int type)
}
}
-static inline xfs_dquot_t *xfs_inode_dquot(struct xfs_inode *ip, int type)
+static inline struct xfs_dquot *xfs_inode_dquot(struct xfs_inode *ip, int type)
{
switch (type & XFS_DQ_ALLTYPES) {
case XFS_DQ_USER:
@@ -147,31 +150,30 @@ static inline bool xfs_dquot_lowsp(struct xfs_dquot *dqp)
#define XFS_QM_ISPDQ(dqp) ((dqp)->dq_flags & XFS_DQ_PROJ)
#define XFS_QM_ISGDQ(dqp) ((dqp)->dq_flags & XFS_DQ_GROUP)
-extern void xfs_qm_dqdestroy(xfs_dquot_t *);
-extern int xfs_qm_dqflush(struct xfs_dquot *, struct xfs_buf **);
-extern void xfs_qm_dqunpin_wait(xfs_dquot_t *);
-extern void xfs_qm_adjust_dqtimers(xfs_mount_t *,
- xfs_disk_dquot_t *);
-extern void xfs_qm_adjust_dqlimits(struct xfs_mount *,
- struct xfs_dquot *);
-extern xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip,
- uint type);
-extern int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
+void xfs_qm_dqdestroy(struct xfs_dquot *dqp);
+int xfs_qm_dqflush(struct xfs_dquot *dqp, struct xfs_buf **bpp);
+void xfs_qm_dqunpin_wait(struct xfs_dquot *dqp);
+void xfs_qm_adjust_dqtimers(struct xfs_mount *mp,
+ struct xfs_disk_dquot *d);
+void xfs_qm_adjust_dqlimits(struct xfs_mount *mp,
+ struct xfs_dquot *d);
+xfs_dqid_t xfs_qm_id_for_quotatype(struct xfs_inode *ip, uint type);
+int xfs_qm_dqget(struct xfs_mount *mp, xfs_dqid_t id,
uint type, bool can_alloc,
struct xfs_dquot **dqpp);
-extern int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type,
- bool can_alloc,
- struct xfs_dquot **dqpp);
-extern int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
+int xfs_qm_dqget_inode(struct xfs_inode *ip, uint type,
+ bool can_alloc,
+ struct xfs_dquot **dqpp);
+int xfs_qm_dqget_next(struct xfs_mount *mp, xfs_dqid_t id,
uint type, struct xfs_dquot **dqpp);
-extern int xfs_qm_dqget_uncached(struct xfs_mount *mp,
- xfs_dqid_t id, uint type,
- struct xfs_dquot **dqpp);
-extern void xfs_qm_dqput(xfs_dquot_t *);
+int xfs_qm_dqget_uncached(struct xfs_mount *mp,
+ xfs_dqid_t id, uint type,
+ struct xfs_dquot **dqpp);
+void xfs_qm_dqput(struct xfs_dquot *dqp);
-extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
+void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
-extern void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
+void xfs_dquot_set_prealloc_limits(struct xfs_dquot *);
static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
{
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index d60647d7197b..baad1748d0d1 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -189,7 +189,8 @@ xfs_qm_dquot_logitem_push(
if (!xfs_buf_delwri_queue(bp, buffer_list))
rval = XFS_ITEM_FLUSHING;
xfs_buf_relse(bp);
- }
+ } else if (error == -EAGAIN)
+ rval = XFS_ITEM_LOCKED;
spin_lock(&lip->li_ailp->ail_lock);
out_unlock:
@@ -307,36 +308,62 @@ xfs_qm_qoffend_logitem_committed(
{
struct xfs_qoff_logitem *qfe = QOFF_ITEM(lip);
struct xfs_qoff_logitem *qfs = qfe->qql_start_lip;
- struct xfs_ail *ailp = qfs->qql_item.li_ailp;
- /*
- * Delete the qoff-start logitem from the AIL.
- * xfs_trans_ail_delete() drops the AIL lock.
- */
- spin_lock(&ailp->ail_lock);
- xfs_trans_ail_delete(ailp, &qfs->qql_item, SHUTDOWN_LOG_IO_ERROR);
+ xfs_qm_qoff_logitem_relse(qfs);
- kmem_free(qfs->qql_item.li_lv_shadow);
kmem_free(lip->li_lv_shadow);
- kmem_free(qfs);
kmem_free(qfe);
return (xfs_lsn_t)-1;
}
+STATIC void
+xfs_qm_qoff_logitem_release(
+ struct xfs_log_item *lip)
+{
+ struct xfs_qoff_logitem *qoff = QOFF_ITEM(lip);
+
+ if (test_bit(XFS_LI_ABORTED, &lip->li_flags)) {
+ if (qoff->qql_start_lip)
+ xfs_qm_qoff_logitem_relse(qoff->qql_start_lip);
+ xfs_qm_qoff_logitem_relse(qoff);
+ }
+}
+
static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
.iop_committed = xfs_qm_qoffend_logitem_committed,
.iop_push = xfs_qm_qoff_logitem_push,
+ .iop_release = xfs_qm_qoff_logitem_release,
};
static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
.iop_size = xfs_qm_qoff_logitem_size,
.iop_format = xfs_qm_qoff_logitem_format,
.iop_push = xfs_qm_qoff_logitem_push,
+ .iop_release = xfs_qm_qoff_logitem_release,
};
/*
+ * Delete the quotaoff intent from the AIL and free it. On success,
+ * this should only be called for the start item. It can be used for
+ * either on shutdown or abort.
+ */
+void
+xfs_qm_qoff_logitem_relse(
+ struct xfs_qoff_logitem *qoff)
+{
+ struct xfs_log_item *lip = &qoff->qql_item;
+
+ ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags) ||
+ test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
+ XFS_FORCED_SHUTDOWN(lip->li_mountp));
+ xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
+ kmem_free(lip->li_lv_shadow);
+ kmem_free(qoff);
+}
+
+/*
* Allocate and initialize an quotaoff item of the correct quota type(s).
*/
struct xfs_qoff_logitem *
diff --git a/fs/xfs/xfs_dquot_item.h b/fs/xfs/xfs_dquot_item.h
index 1aed34ccdabc..2b86a43d7ce2 100644
--- a/fs/xfs/xfs_dquot_item.h
+++ b/fs/xfs/xfs_dquot_item.h
@@ -11,25 +11,28 @@ struct xfs_trans;
struct xfs_mount;
struct xfs_qoff_logitem;
-typedef struct xfs_dq_logitem {
- struct xfs_log_item qli_item; /* common portion */
- struct xfs_dquot *qli_dquot; /* dquot ptr */
- xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
-} xfs_dq_logitem_t;
+struct xfs_dq_logitem {
+ struct xfs_log_item qli_item; /* common portion */
+ struct xfs_dquot *qli_dquot; /* dquot ptr */
+ xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
+};
-typedef struct xfs_qoff_logitem {
- struct xfs_log_item qql_item; /* common portion */
- struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
+struct xfs_qoff_logitem {
+ struct xfs_log_item qql_item; /* common portion */
+ struct xfs_qoff_logitem *qql_start_lip; /* qoff-start logitem, if any */
unsigned int qql_flags;
-} xfs_qoff_logitem_t;
+};
-extern void xfs_qm_dquot_logitem_init(struct xfs_dquot *);
-extern xfs_qoff_logitem_t *xfs_qm_qoff_logitem_init(struct xfs_mount *,
- struct xfs_qoff_logitem *, uint);
-extern xfs_qoff_logitem_t *xfs_trans_get_qoff_item(struct xfs_trans *,
- struct xfs_qoff_logitem *, uint);
-extern void xfs_trans_log_quotaoff_item(struct xfs_trans *,
- struct xfs_qoff_logitem *);
+void xfs_qm_dquot_logitem_init(struct xfs_dquot *dqp);
+struct xfs_qoff_logitem *xfs_qm_qoff_logitem_init(struct xfs_mount *mp,
+ struct xfs_qoff_logitem *start,
+ uint flags);
+void xfs_qm_qoff_logitem_relse(struct xfs_qoff_logitem *);
+struct xfs_qoff_logitem *xfs_trans_get_qoff_item(struct xfs_trans *tp,
+ struct xfs_qoff_logitem *startqoff,
+ uint flags);
+void xfs_trans_log_quotaoff_item(struct xfs_trans *tp,
+ struct xfs_qoff_logitem *qlp);
#endif /* __XFS_DQUOT_ITEM_H__ */
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 849fd4476950..182b70464b71 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -329,19 +329,43 @@ xfs_corruption_error(
const char *tag,
int level,
struct xfs_mount *mp,
- void *buf,
+ const void *buf,
size_t bufsize,
const char *filename,
int linenum,
xfs_failaddr_t failaddr)
{
- if (level <= xfs_error_level)
+ if (buf && level <= xfs_error_level)
xfs_hex_dump(buf, bufsize);
xfs_error_report(tag, level, mp, filename, linenum, failaddr);
xfs_alert(mp, "Corruption detected. Unmount and run xfs_repair");
}
/*
+ * Complain about the kinds of metadata corruption that we can't detect from a
+ * verifier, such as incorrect inter-block relationship data. Does not set
+ * bp->b_error.
+ *
+ * Call xfs_buf_mark_corrupt, not this function.
+ */
+void
+xfs_buf_corruption_error(
+ struct xfs_buf *bp,
+ xfs_failaddr_t fa)
+{
+ struct xfs_mount *mp = bp->b_mount;
+
+ xfs_alert_tag(mp, XFS_PTAG_VERIFIER_ERROR,
+ "Metadata corruption detected at %pS, %s block 0x%llx",
+ fa, bp->b_ops->name, bp->b_bn);
+
+ xfs_alert(mp, "Unmount and run xfs_repair");
+
+ if (xfs_error_level >= XFS_ERRLEVEL_HIGH)
+ xfs_stack_trace();
+}
+
+/*
* Warnings specifically for verifier errors. Differentiate CRC vs. invalid
* values, and omit the stack trace unless the error level is tuned high.
*/
@@ -350,7 +374,7 @@ xfs_buf_verifier_error(
struct xfs_buf *bp,
int error,
const char *name,
- void *buf,
+ const void *buf,
size_t bufsz,
xfs_failaddr_t failaddr)
{
@@ -402,7 +426,7 @@ xfs_inode_verifier_error(
struct xfs_inode *ip,
int error,
const char *name,
- void *buf,
+ const void *buf,
size_t bufsz,
xfs_failaddr_t failaddr)
{
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 602aa7d62b66..c6bb7d7a2161 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -12,16 +12,17 @@ extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp,
const char *filename, int linenum,
xfs_failaddr_t failaddr);
extern void xfs_corruption_error(const char *tag, int level,
- struct xfs_mount *mp, void *buf, size_t bufsize,
+ struct xfs_mount *mp, const void *buf, size_t bufsize,
const char *filename, int linenum,
xfs_failaddr_t failaddr);
+void xfs_buf_corruption_error(struct xfs_buf *bp, xfs_failaddr_t fa);
extern void xfs_buf_verifier_error(struct xfs_buf *bp, int error,
- const char *name, void *buf, size_t bufsz,
+ const char *name, const void *buf, size_t bufsz,
xfs_failaddr_t failaddr);
extern void xfs_verifier_error(struct xfs_buf *bp, int error,
xfs_failaddr_t failaddr);
extern void xfs_inode_verifier_error(struct xfs_inode *ip, int error,
- const char *name, void *buf, size_t bufsz,
+ const char *name, const void *buf, size_t bufsz,
xfs_failaddr_t failaddr);
#define XFS_ERROR_REPORT(e, lvl, mp) \
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index f1372f9046e3..5a4b0119143a 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -15,7 +15,6 @@
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_icache.h"
-#include "xfs_log.h"
#include "xfs_pnfs.h"
/*
@@ -221,18 +220,7 @@ STATIC int
xfs_fs_nfs_commit_metadata(
struct inode *inode)
{
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (!lsn)
- return 0;
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_inode(XFS_I(inode));
}
const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 2183d87be4cf..ef17c1f6db32 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -344,7 +344,6 @@ xfs_extent_busy_trim(
ASSERT(*len > 0);
spin_lock(&args->pag->pagb_lock);
-restart:
fbno = *bno;
flen = *len;
rbp = args->pag->pagb_tree.rb_node;
@@ -363,19 +362,6 @@ restart:
continue;
}
- /*
- * If this is a metadata allocation, try to reuse the busy
- * extent instead of trimming the allocation.
- */
- if (!xfs_alloc_is_userdata(args->datatype) &&
- !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) {
- if (!xfs_extent_busy_update_extent(args->mp, args->pag,
- busyp, fbno, flen,
- false))
- goto restart;
- continue;
- }
-
if (bbno <= fbno) {
/* start overlap */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index e44efc41a041..de3cdce892fd 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -21,7 +21,7 @@
#include "xfs_alloc.h"
#include "xfs_bmap.h"
#include "xfs_trace.h"
-
+#include "xfs_error.h"
kmem_zone_t *xfs_efi_zone;
kmem_zone_t *xfs_efd_zone;
@@ -139,44 +139,6 @@ xfs_efi_item_release(
xfs_efi_release(EFI_ITEM(lip));
}
-static const struct xfs_item_ops xfs_efi_item_ops = {
- .iop_size = xfs_efi_item_size,
- .iop_format = xfs_efi_item_format,
- .iop_unpin = xfs_efi_item_unpin,
- .iop_release = xfs_efi_item_release,
-};
-
-
-/*
- * Allocate and initialize an efi item with the given number of extents.
- */
-struct xfs_efi_log_item *
-xfs_efi_init(
- struct xfs_mount *mp,
- uint nextents)
-
-{
- struct xfs_efi_log_item *efip;
- uint size;
-
- ASSERT(nextents > 0);
- if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
- size = (uint)(sizeof(xfs_efi_log_item_t) +
- ((nextents - 1) * sizeof(xfs_extent_t)));
- efip = kmem_zalloc(size, 0);
- } else {
- efip = kmem_zone_zalloc(xfs_efi_zone, 0);
- }
-
- xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
- efip->efi_format.efi_nextents = nextents;
- efip->efi_format.efi_id = (uintptr_t)(void *)efip;
- atomic_set(&efip->efi_next_extent, 0);
- atomic_set(&efip->efi_refcount, 2);
-
- return efip;
-}
-
/*
* Copy an EFI format buffer from the given buf, and into the destination
* EFI format structure.
@@ -228,6 +190,7 @@ xfs_efi_copy_format(xfs_log_iovec_t *buf, xfs_efi_log_format_t *dst_efi_fmt)
}
return 0;
}
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
}
@@ -411,41 +374,16 @@ xfs_extent_free_diff_items(
XFS_FSB_TO_AGNO(mp, rb->xefi_startblock);
}
-/* Get an EFI. */
-STATIC void *
-xfs_extent_free_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_efi_log_item *efip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- efip = xfs_efi_init(tp->t_mountp, count);
- ASSERT(efip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &efip->efi_item);
- return efip;
-}
-
/* Log a free extent to the intent item. */
STATIC void
xfs_extent_free_log_item(
struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
+ struct xfs_efi_log_item *efip,
+ struct xfs_extent_free_item *free)
{
- struct xfs_efi_log_item *efip = intent;
- struct xfs_extent_free_item *free;
uint next_extent;
struct xfs_extent *extp;
- free = container_of(item, struct xfs_extent_free_item, xefi_list);
-
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
@@ -461,14 +399,35 @@ xfs_extent_free_log_item(
extp->ext_len = free->xefi_blockcount;
}
+static struct xfs_log_item *
+xfs_extent_free_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_efi_log_item *efip = xfs_efi_init(mp, count);
+ struct xfs_extent_free_item *free;
+
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &efip->efi_item);
+ if (sort)
+ list_sort(mp, items, xfs_extent_free_diff_items);
+ list_for_each_entry(free, items, xefi_list)
+ xfs_extent_free_log_item(tp, efip, free);
+ return &efip->efi_item;
+}
+
/* Get an EFD so we can process all the free extents. */
STATIC void *
xfs_extent_free_create_done(
struct xfs_trans *tp,
- void *intent,
+ struct xfs_log_item *intent,
unsigned int count)
{
- return xfs_trans_get_efd(tp, intent, count);
+ return xfs_trans_get_efd(tp, EFI_ITEM(intent), count);
}
/* Process a free extent. */
@@ -494,9 +453,9 @@ xfs_extent_free_finish_item(
/* Abort all pending EFIs. */
STATIC void
xfs_extent_free_abort_intent(
- void *intent)
+ struct xfs_log_item *intent)
{
- xfs_efi_release(intent);
+ xfs_efi_release(EFI_ITEM(intent));
}
/* Cancel a free extent. */
@@ -512,10 +471,8 @@ xfs_extent_free_cancel_item(
const struct xfs_defer_op_type xfs_extent_free_defer_type = {
.max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .diff_items = xfs_extent_free_diff_items,
.create_intent = xfs_extent_free_create_intent,
.abort_intent = xfs_extent_free_abort_intent,
- .log_item = xfs_extent_free_log_item,
.create_done = xfs_extent_free_create_done,
.finish_item = xfs_extent_free_finish_item,
.cancel_item = xfs_extent_free_cancel_item,
@@ -578,10 +535,8 @@ xfs_agfl_free_finish_item(
/* sub-type with special handling for AGFL deferred frees */
const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
.max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .diff_items = xfs_extent_free_diff_items,
.create_intent = xfs_extent_free_create_intent,
.abort_intent = xfs_extent_free_abort_intent,
- .log_item = xfs_extent_free_log_item,
.create_done = xfs_extent_free_create_done,
.finish_item = xfs_agfl_free_finish_item,
.cancel_item = xfs_extent_free_cancel_item,
@@ -593,9 +548,10 @@ const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
*/
int
xfs_efi_recover(
- struct xfs_mount *mp,
- struct xfs_efi_log_item *efip)
+ struct xfs_efi_log_item *efip,
+ struct list_head *capture_list)
{
+ struct xfs_mount *mp = efip->efi_item.li_mountp;
struct xfs_efd_log_item *efdp;
struct xfs_trans *tp;
int i;
@@ -624,7 +580,7 @@ xfs_efi_recover(
*/
set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
xfs_efi_release(efip);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
@@ -644,10 +600,76 @@ xfs_efi_recover(
}
set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
- error = xfs_trans_commit(tp);
- return error;
+
+ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error:
xfs_trans_cancel(tp);
return error;
}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_efi_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_efd_log_item *efdp;
+ struct xfs_efi_log_item *efip;
+ struct xfs_extent *extp;
+ unsigned int count;
+
+ count = EFI_ITEM(intent)->efi_format.efi_nextents;
+ extp = EFI_ITEM(intent)->efi_format.efi_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count);
+ efdp->efd_next_extent = count;
+ memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp));
+ set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
+
+ efip = xfs_efi_init(tp->t_mountp, count);
+ memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp));
+ atomic_set(&efip->efi_next_extent, count);
+ xfs_trans_add_item(tp, &efip->efi_item);
+ set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
+ return &efip->efi_item;
+}
+
+static const struct xfs_item_ops xfs_efi_item_ops = {
+ .iop_size = xfs_efi_item_size,
+ .iop_format = xfs_efi_item_format,
+ .iop_unpin = xfs_efi_item_unpin,
+ .iop_release = xfs_efi_item_release,
+ .iop_relog = xfs_efi_item_relog,
+};
+
+/*
+ * Allocate and initialize an efi item with the given number of extents.
+ */
+struct xfs_efi_log_item *
+xfs_efi_init(
+ struct xfs_mount *mp,
+ uint nextents)
+
+{
+ struct xfs_efi_log_item *efip;
+ uint size;
+
+ ASSERT(nextents > 0);
+ if (nextents > XFS_EFI_MAX_FAST_EXTENTS) {
+ size = (uint)(sizeof(struct xfs_efi_log_item) +
+ ((nextents - 1) * sizeof(xfs_extent_t)));
+ efip = kmem_zalloc(size, 0);
+ } else {
+ efip = kmem_zone_zalloc(xfs_efi_zone, 0);
+ }
+
+ xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops);
+ efip->efi_format.efi_nextents = nextents;
+ efip->efi_format.efi_id = (uintptr_t)(void *)efip;
+ atomic_set(&efip->efi_next_extent, 0);
+ atomic_set(&efip->efi_refcount, 2);
+
+ return efip;
+}
diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h
index 16aaab06d4ec..883f0f1d8989 100644
--- a/fs/xfs/xfs_extfree_item.h
+++ b/fs/xfs/xfs_extfree_item.h
@@ -50,25 +50,25 @@ struct kmem_zone;
* of commit failure or log I/O errors. Note that the EFD is not inserted in the
* AIL, so at this point both the EFI and EFD are freed.
*/
-typedef struct xfs_efi_log_item {
+struct xfs_efi_log_item {
struct xfs_log_item efi_item;
atomic_t efi_refcount;
atomic_t efi_next_extent;
unsigned long efi_flags; /* misc flags */
xfs_efi_log_format_t efi_format;
-} xfs_efi_log_item_t;
+};
/*
* This is the "extent free done" log item. It is used to log
* the fact that some extents earlier mentioned in an efi item
* have been freed.
*/
-typedef struct xfs_efd_log_item {
+struct xfs_efd_log_item {
struct xfs_log_item efd_item;
- xfs_efi_log_item_t *efd_efip;
+ struct xfs_efi_log_item *efd_efip;
uint efd_next_extent;
xfs_efd_log_format_t efd_format;
-} xfs_efd_log_item_t;
+};
/*
* Max number of extents in fast allocation path.
@@ -78,13 +78,13 @@ typedef struct xfs_efd_log_item {
extern struct kmem_zone *xfs_efi_zone;
extern struct kmem_zone *xfs_efd_zone;
-xfs_efi_log_item_t *xfs_efi_init(struct xfs_mount *, uint);
+struct xfs_efi_log_item *xfs_efi_init(struct xfs_mount *, uint);
int xfs_efi_copy_format(xfs_log_iovec_t *buf,
xfs_efi_log_format_t *dst_efi_fmt);
-void xfs_efi_item_free(xfs_efi_log_item_t *);
+void xfs_efi_item_free(struct xfs_efi_log_item *);
void xfs_efi_release(struct xfs_efi_log_item *);
-int xfs_efi_recover(struct xfs_mount *mp,
- struct xfs_efi_log_item *efip);
+int xfs_efi_recover(struct xfs_efi_log_item *efip,
+ struct list_head *capture_list);
#endif /* __XFS_EXTFREE_ITEM_H__ */
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 203065a64765..b651715da8c6 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -80,19 +80,9 @@ xfs_dir_fsync(
int datasync)
{
struct xfs_inode *ip = XFS_I(file->f_mapping->host);
- struct xfs_mount *mp = ip->i_mount;
- xfs_lsn_t lsn = 0;
trace_xfs_dir_fsync(ip);
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (!lsn)
- return 0;
- return xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_inode(ip);
}
STATIC int
@@ -187,7 +177,12 @@ xfs_file_dio_aio_read(
file_accessed(iocb->ki_filp);
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
+ return -EAGAIN;
+ } else {
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ }
ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
@@ -995,6 +990,21 @@ xfs_file_fadvise(
return ret;
}
+/* Does this file, inode, or mount want synchronous writes? */
+static inline bool xfs_file_sync_writes(struct file *filp)
+{
+ struct xfs_inode *ip = XFS_I(file_inode(filp));
+
+ if (ip->i_mount->m_flags & XFS_MOUNT_WSYNC)
+ return true;
+ if (filp->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(filp)))
+ return true;
+
+ return false;
+}
+
STATIC loff_t
xfs_file_remap_range(
struct file *file_in,
@@ -1049,7 +1059,11 @@ xfs_file_remap_range(
ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
remap_flags);
+ if (ret)
+ goto out_unlock;
+ if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
+ xfs_log_force_inode(dest);
out_unlock:
xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
@@ -1253,10 +1267,23 @@ xfs_filemap_pfn_mkwrite(
return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
+static void
+xfs_filemap_map_pages(
+ struct vm_fault *vmf,
+ pgoff_t start_pgoff,
+ pgoff_t end_pgoff)
+{
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+
+ xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+ filemap_map_pages(vmf, start_pgoff, end_pgoff);
+ xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+}
+
static const struct vm_operations_struct xfs_file_vm_ops = {
.fault = xfs_filemap_fault,
.huge_fault = xfs_filemap_huge_fault,
- .map_pages = filemap_map_pages,
+ .map_pages = xfs_filemap_map_pages,
.page_mkwrite = xfs_filemap_page_mkwrite,
.pfn_mkwrite = xfs_filemap_pfn_mkwrite,
};
diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c
index 01c0933a4d10..79e8af8f4669 100644
--- a/fs/xfs/xfs_fsmap.c
+++ b/fs/xfs/xfs_fsmap.c
@@ -146,6 +146,7 @@ xfs_fsmap_owner_from_rmap(
dest->fmr_owner = XFS_FMR_OWN_FREE;
break;
default:
+ ASSERT(0);
return -EFSCORRUPTED;
}
return 0;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index a1135b86e79f..f1451642ce38 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -289,6 +289,8 @@ xfs_reinit_inode(
uint64_t version = inode_peek_iversion(inode);
umode_t mode = inode->i_mode;
dev_t dev = inode->i_rdev;
+ kuid_t uid = inode->i_uid;
+ kgid_t gid = inode->i_gid;
error = inode_init_always(mp->m_super, inode);
@@ -297,6 +299,8 @@ xfs_reinit_inode(
inode_set_iversion_queried(inode, version);
inode->i_mode = mode;
inode->i_rdev = dev;
+ inode->i_uid = uid;
+ inode->i_gid = gid;
return error;
}
@@ -1430,7 +1434,7 @@ xfs_inode_match_id(
return 0;
if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
- xfs_get_projid(ip) != eofb->eof_prid)
+ ip->i_d.di_projid != eofb->eof_prid)
return 0;
return 1;
@@ -1454,7 +1458,7 @@ xfs_inode_match_id_union(
return 1;
if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
- xfs_get_projid(ip) == eofb->eof_prid)
+ ip->i_d.di_projid == eofb->eof_prid)
return 1;
return 0;
diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c
index 3ebd1b7f49d8..7d940b289db5 100644
--- a/fs/xfs/xfs_icreate_item.c
+++ b/fs/xfs/xfs_icreate_item.c
@@ -10,6 +10,7 @@
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_icreate_item.h"
+#include "xfs_log_priv.h"
#include "xfs_log.h"
kmem_zone_t *xfs_icreate_zone; /* inode create item zone */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 7a9048c4c2f9..568a9332efd2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -750,6 +750,7 @@ xfs_ialloc(
xfs_buf_t **ialloc_context,
xfs_inode_t **ipp)
{
+ struct inode *dir = pip ? VFS_I(pip) : NULL;
struct xfs_mount *mp = tp->t_mountp;
xfs_ino_t ino;
xfs_inode_t *ip;
@@ -795,26 +796,17 @@ xfs_ialloc(
return error;
ASSERT(ip != NULL);
inode = VFS_I(ip);
-
- /*
- * We always convert v1 inodes to v2 now - we only support filesystems
- * with >= v2 inode capability, so there is no reason for ever leaving
- * an inode in v1 format.
- */
- if (ip->i_d.di_version == 1)
- ip->i_d.di_version = 2;
-
- inode->i_mode = mode;
set_nlink(inode, nlink);
- ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
- ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
inode->i_rdev = rdev;
- xfs_set_projid(ip, prid);
+ ip->i_d.di_projid = prid;
- if (pip && XFS_INHERIT_GID(pip)) {
- ip->i_d.di_gid = pip->i_d.di_gid;
- if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
+ if (dir && !(dir->i_mode & S_ISGID) &&
+ (mp->m_flags & XFS_MOUNT_GRPID)) {
+ inode->i_uid = current_fsuid();
+ inode->i_gid = dir->i_gid;
+ inode->i_mode = mode;
+ } else {
+ inode_init_owner(inode, dir, mode);
}
/*
@@ -822,9 +814,8 @@ xfs_ialloc(
* ID or one of the supplementary group IDs, the S_ISGID bit is cleared
* (and only if the irix_sgid_inherit compatibility variable is set).
*/
- if ((irix_sgid_inherit) &&
- (inode->i_mode & S_ISGID) &&
- (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid))))
+ if (irix_sgid_inherit &&
+ (inode->i_mode & S_ISGID) && !in_group_p(inode->i_gid))
inode->i_mode &= ~S_ISGID;
ip->i_d.di_size = 0;
@@ -841,7 +832,7 @@ xfs_ialloc(
ip->i_d.di_dmstate = 0;
ip->i_d.di_flags = 0;
- if (ip->i_d.di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
inode_set_iversion(inode, 1);
ip->i_d.di_flags2 = 0;
ip->i_d.di_cowextsize = 0;
@@ -849,7 +840,6 @@ xfs_ialloc(
ip->i_d.di_crtime.t_nsec = (int32_t)tv.tv_nsec;
}
-
flags = XFS_ILOG_CORE;
switch (mode & S_IFMT) {
case S_IFIFO:
@@ -902,20 +892,13 @@ xfs_ialloc(
ip->i_d.di_flags |= di_flags;
}
- if (pip &&
- (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
- pip->i_d.di_version == 3 &&
- ip->i_d.di_version == 3) {
- uint64_t di_flags2 = 0;
-
+ if (pip && (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY)) {
if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
- di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
}
if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
- di_flags2 |= XFS_DIFLAG2_DAX;
-
- ip->i_d.di_flags2 |= di_flags2;
+ ip->i_d.di_flags2 |= XFS_DIFLAG2_DAX;
}
/* FALLTHROUGH */
case S_IFLNK:
@@ -1117,7 +1100,6 @@ xfs_bumplink(
{
xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
- ASSERT(ip->i_d.di_version > 1);
inc_nlink(VFS_I(ip));
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
@@ -1153,8 +1135,7 @@ xfs_create(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -1304,8 +1285,7 @@ xfs_create_tmpfile(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -1418,7 +1398,7 @@ xfs_link(
* the tree quota mechanism could be circumvented.
*/
if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
+ tdp->i_d.di_projid != sip->i_d.di_projid)) {
error = -EXDEV;
goto error_return;
}
@@ -1513,10 +1493,8 @@ xfs_itruncate_extents_flags(
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp = *tpp;
xfs_fileoff_t first_unmap_block;
- xfs_fileoff_t last_block;
xfs_filblks_t unmap_len;
int error = 0;
- int done = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
@@ -1536,21 +1514,22 @@ xfs_itruncate_extents_flags(
* the end of the file (in a crash where the space is allocated
* but the inode size is not yet updated), simply remove any
* blocks which show up between the new EOF and the maximum
- * possible file size. If the first block to be removed is
- * beyond the maximum file size (ie it is the same as last_block),
- * then there is nothing to do.
+ * possible file size.
+ *
+ * We have to free all the blocks to the bmbt maximum offset, even if
+ * the page cache can't scale that far.
*/
first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
- last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
- if (first_unmap_block == last_block)
+ if (first_unmap_block >= XFS_MAX_FILEOFF) {
+ WARN_ON_ONCE(first_unmap_block > XFS_MAX_FILEOFF);
return 0;
+ }
- ASSERT(first_unmap_block < last_block);
- unmap_len = last_block - first_unmap_block + 1;
- while (!done) {
+ unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
+ while (unmap_len > 0) {
ASSERT(tp->t_firstblock == NULLFSBLOCK);
- error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
- XFS_ITRUNC_MAX_EXTENTS, &done);
+ error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
+ flags, XFS_ITRUNC_MAX_EXTENTS);
if (error)
goto out;
@@ -1570,7 +1549,7 @@ xfs_itruncate_extents_flags(
if (whichfork == XFS_DATA_FORK) {
/* Remove all pending CoW reservations. */
error = xfs_reflink_cancel_cow_blocks(ip, &tp,
- first_unmap_block, last_block, true);
+ first_unmap_block, XFS_MAX_FILEOFF, true);
if (error)
goto out;
@@ -2149,8 +2128,10 @@ xfs_iunlink_update_bucket(
* passed in because either we're adding or removing ourselves from the
* head of the list.
*/
- if (old_value == new_agino)
+ if (old_value == new_agino) {
+ xfs_buf_mark_corrupt(agibp);
return -EFSCORRUPTED;
+ }
agi->agi_unlinked[bucket_index] = cpu_to_be32(new_agino);
offset = offsetof(struct xfs_agi, agi_unlinked) +
@@ -2213,6 +2194,8 @@ xfs_iunlink_update_inode(
/* Make sure the old pointer isn't garbage. */
old_value = be32_to_cpu(dip->di_next_unlinked);
if (!xfs_verify_agino_or_null(mp, agno, old_value)) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__, dip,
+ sizeof(*dip), __this_address);
error = -EFSCORRUPTED;
goto out;
}
@@ -2224,8 +2207,11 @@ xfs_iunlink_update_inode(
*/
*old_next_agino = old_value;
if (old_value == next_agino) {
- if (next_agino != NULLAGINO)
+ if (next_agino != NULLAGINO) {
+ xfs_inode_verifier_error(ip, -EFSCORRUPTED, __func__,
+ dip, sizeof(*dip), __this_address);
error = -EFSCORRUPTED;
+ }
goto out;
}
@@ -2276,8 +2262,10 @@ xfs_iunlink(
*/
next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
if (next_agino == agino ||
- !xfs_verify_agino_or_null(mp, agno, next_agino))
+ !xfs_verify_agino_or_null(mp, agno, next_agino)) {
+ xfs_buf_mark_corrupt(agibp);
return -EFSCORRUPTED;
+ }
if (next_agino != NULLAGINO) {
struct xfs_perag *pag;
@@ -2547,7 +2535,7 @@ xfs_ifree_cluster(
xfs_daddr_t blkno;
xfs_buf_t *bp;
xfs_inode_t *ip;
- xfs_inode_log_item_t *iip;
+ struct xfs_inode_log_item *iip;
struct xfs_log_item *lip;
struct xfs_perag *pag;
struct xfs_ino_geometry *igeo = M_IGEO(mp);
@@ -2584,8 +2572,10 @@ xfs_ifree_cluster(
mp->m_bsize * igeo->blocks_per_cluster,
XBF_UNMAPPED);
- if (!bp)
+ if (!bp) {
+ xfs_perag_put(pag);
return -ENOMEM;
+ }
/*
* This buffer may not have been correctly initialised as we
@@ -2607,7 +2597,7 @@ xfs_ifree_cluster(
*/
list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
if (lip->li_type == XFS_LI_INODE) {
- iip = (xfs_inode_log_item_t *)lip;
+ iip = (struct xfs_inode_log_item *)lip;
ASSERT(iip->ili_logged == 1);
lip->li_cb = xfs_istale_done;
xfs_trans_ail_copy_lsn(mp->m_ail,
@@ -3215,6 +3205,7 @@ xfs_rename(
struct xfs_trans *tp;
struct xfs_inode *wip = NULL; /* whiteout inode */
struct xfs_inode *inodes[__XFS_SORT_INODES];
+ int i;
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
@@ -3288,7 +3279,7 @@ xfs_rename(
* tree quota mechanism would be circumvented.
*/
if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
+ target_dp->i_d.di_projid != src_ip->i_d.di_projid)) {
error = -EXDEV;
goto out_trans_cancel;
}
@@ -3327,6 +3318,30 @@ xfs_rename(
}
/*
+ * Lock the AGI buffers we need to handle bumping the nlink of the
+ * whiteout inode off the unlinked list and to handle dropping the
+ * nlink of the target inode. Per locking order rules, do this in
+ * increasing AG order and before directory block allocation tries to
+ * grab AGFs because we grab AGIs before AGFs.
+ *
+ * The (vfs) caller must ensure that if src is a directory then
+ * target_ip is either null or an empty directory.
+ */
+ for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
+ if (inodes[i] == wip ||
+ (inodes[i] == target_ip &&
+ (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
+ struct xfs_buf *bp;
+ xfs_agnumber_t agno;
+
+ agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino);
+ error = xfs_read_agi(mp, tp, agno, &bp);
+ if (error)
+ goto out_trans_cancel;
+ }
+ }
+
+ /*
* Directory entry creation below may acquire the AGF. Remove
* the whiteout from the unlinked list first to preserve correct
* AGI/AGF locking order. This dirties the transaction so failures
@@ -3796,7 +3811,6 @@ xfs_iflush_int(
ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
ASSERT(iip != NULL && iip->ili_fields != 0);
- ASSERT(ip->i_d.di_version > 1);
/* set *dip = inode's place in the buffer */
dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
@@ -3857,7 +3871,7 @@ xfs_iflush_int(
* backwards compatibility with old kernels that predate logging all
* inode changes.
*/
- if (ip->i_d.di_version < 3)
+ if (!xfs_sb_version_has_v3inode(&mp->m_sb))
ip->i_d.di_flushiter++;
/* Check the inline fork data before we write out. */
@@ -3940,3 +3954,22 @@ xfs_irele(
trace_xfs_irele(ip, _RET_IP_);
iput(VFS_I(ip));
}
+
+/*
+ * Ensure all commited transactions touching the inode are written to the log.
+ */
+int
+xfs_log_force_inode(
+ struct xfs_inode *ip)
+{
+ xfs_lsn_t lsn = 0;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ if (xfs_ipincount(ip))
+ lsn = ip->i_itemp->ili_last_lsn;
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ if (!lsn)
+ return 0;
+ return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 558173f95a03..62b963d3b23d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -177,30 +177,11 @@ xfs_iflags_test_and_set(xfs_inode_t *ip, unsigned short flags)
return ret;
}
-/*
- * Project quota id helpers (previously projid was 16bit only
- * and using two 16bit values to hold new 32bit projid was chosen
- * to retain compatibility with "old" filesystems).
- */
-static inline prid_t
-xfs_get_projid(struct xfs_inode *ip)
-{
- return (prid_t)ip->i_d.di_projid_hi << 16 | ip->i_d.di_projid_lo;
-}
-
-static inline void
-xfs_set_projid(struct xfs_inode *ip,
- prid_t projid)
-{
- ip->i_d.di_projid_hi = (uint16_t) (projid >> 16);
- ip->i_d.di_projid_lo = (uint16_t) (projid & 0xffff);
-}
-
static inline prid_t
xfs_get_initial_prid(struct xfs_inode *dp)
{
if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
- return xfs_get_projid(dp);
+ return dp->i_d.di_projid;
return XFS_PROJID_DEFAULT;
}
@@ -441,6 +422,7 @@ int xfs_itruncate_extents_flags(struct xfs_trans **,
struct xfs_inode *, int, xfs_fsize_t, int);
void xfs_iext_realloc(xfs_inode_t *, int, int);
+int xfs_log_force_inode(struct xfs_inode *ip);
void xfs_iunpin_wait(xfs_inode_t *);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index bb8f076805b9..83bf96b6cf5d 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -17,6 +17,7 @@
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
+#include "xfs_error.h"
#include <linux/iversion.h>
@@ -124,7 +125,7 @@ xfs_inode_item_size(
*nvecs += 2;
*nbytes += sizeof(struct xfs_inode_log_format) +
- xfs_log_dinode_size(ip->i_d.di_version);
+ xfs_log_dinode_size(ip->i_mount);
xfs_inode_item_data_fork_size(iip, nvecs, nbytes);
if (XFS_IFORK_Q(ip))
@@ -304,13 +305,11 @@ xfs_inode_to_log_dinode(
struct inode *inode = VFS_I(ip);
to->di_magic = XFS_DINODE_MAGIC;
-
- to->di_version = from->di_version;
to->di_format = from->di_format;
- to->di_uid = from->di_uid;
- to->di_gid = from->di_gid;
- to->di_projid_lo = from->di_projid_lo;
- to->di_projid_hi = from->di_projid_hi;
+ to->di_uid = i_uid_read(inode);
+ to->di_gid = i_gid_read(inode);
+ to->di_projid_lo = from->di_projid & 0xffff;
+ to->di_projid_hi = from->di_projid >> 16;
memset(to->di_pad, 0, sizeof(to->di_pad));
memset(to->di_pad3, 0, sizeof(to->di_pad3));
@@ -338,7 +337,8 @@ xfs_inode_to_log_dinode(
/* log a dummy value to ensure log structure is fully initialised */
to->di_next_unlinked = NULLAGINO;
- if (from->di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&ip->i_mount->m_sb)) {
+ to->di_version = 3;
to->di_changecount = inode_peek_iversion(inode);
to->di_crtime.t_sec = from->di_crtime.t_sec;
to->di_crtime.t_nsec = from->di_crtime.t_nsec;
@@ -350,6 +350,7 @@ xfs_inode_to_log_dinode(
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_flushiter = 0;
} else {
+ to->di_version = 2;
to->di_flushiter = from->di_flushiter;
}
}
@@ -369,7 +370,7 @@ xfs_inode_item_format_core(
dic = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_ICORE);
xfs_inode_to_log_dinode(ip, dic, ip->i_itemp->ili_item.li_lsn);
- xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_d.di_version));
+ xlog_finish_iovec(lv, *vecp, xfs_log_dinode_size(ip->i_mount));
}
/*
@@ -394,8 +395,6 @@ xfs_inode_item_format(
struct xfs_log_iovec *vecp = NULL;
struct xfs_inode_log_format *ilf;
- ASSERT(ip->i_d.di_version > 1);
-
ilf = xlog_prepare_iovec(lv, &vecp, XLOG_REG_TYPE_IFORMAT);
ilf->ilf_type = XFS_LI_INODE;
ilf->ilf_ino = ip->i_ino;
@@ -731,29 +730,27 @@ xfs_iflush_done(
* holding the lock before removing the inode from the AIL.
*/
if (need_ail) {
- bool mlip_changed = false;
+ xfs_lsn_t tail_lsn = 0;
/* this is an opencoded batch version of xfs_trans_ail_delete */
spin_lock(&ailp->ail_lock);
list_for_each_entry(blip, &tmp, li_bio_list) {
if (INODE_ITEM(blip)->ili_logged &&
- blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn)
- mlip_changed |= xfs_ail_delete_one(ailp, blip);
- else {
+ blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) {
+ /*
+ * xfs_ail_update_finish() only cares about the
+ * lsn of the first tail item removed, any
+ * others will be at the same or higher lsn so
+ * we just ignore them.
+ */
+ xfs_lsn_t lsn = xfs_ail_delete_one(ailp, blip);
+ if (!tail_lsn && lsn)
+ tail_lsn = lsn;
+ } else {
xfs_clear_li_failed(blip);
}
}
-
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
- xlog_assign_tail_lsn_locked(ailp->ail_mount);
- if (list_empty(&ailp->ail_head))
- wake_up_all(&ailp->ail_empty);
- }
- spin_unlock(&ailp->ail_lock);
-
- if (mlip_changed)
- xfs_log_space_wake(ailp->ail_mount);
+ xfs_ail_update_finish(ailp, tail_lsn);
}
/*
@@ -782,7 +779,7 @@ xfs_iflush_abort(
xfs_inode_t *ip,
bool stale)
{
- xfs_inode_log_item_t *iip = ip->i_itemp;
+ struct xfs_inode_log_item *iip = ip->i_itemp;
if (iip) {
if (test_bit(XFS_LI_IN_AIL, &iip->ili_item.li_flags)) {
@@ -828,8 +825,10 @@ xfs_inode_item_format_convert(
{
struct xfs_inode_log_format_32 *in_f32 = buf->i_addr;
- if (buf->i_len != sizeof(*in_f32))
+ if (buf->i_len != sizeof(*in_f32)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
+ }
in_f->ilf_type = in_f32->ilf_type;
in_f->ilf_size = in_f32->ilf_size;
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 07a60e74c39c..ad667fd4ae62 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -13,7 +13,7 @@ struct xfs_bmbt_rec;
struct xfs_inode;
struct xfs_mount;
-typedef struct xfs_inode_log_item {
+struct xfs_inode_log_item {
struct xfs_log_item ili_item; /* common portion */
struct xfs_inode *ili_inode; /* inode ptr */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
@@ -23,7 +23,7 @@ typedef struct xfs_inode_log_item {
unsigned int ili_last_fields; /* fields when flushed */
unsigned int ili_fields; /* fields to be logged */
unsigned int ili_fsync_fields; /* logged since last fsync */
-} xfs_inode_log_item_t;
+};
static inline int xfs_inode_clean(xfs_inode_t *ip)
{
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 7b7a009425e2..e7356e527260 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1144,7 +1144,7 @@ xfs_fill_fsxattr(
fa->fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
fa->fsx_cowextsize = ip->i_d.di_cowextsize <<
ip->i_mount->m_sb.sb_blocklog;
- fa->fsx_projid = xfs_get_projid(ip);
+ fa->fsx_projid = ip->i_d.di_projid;
if (attr) {
if (ip->i_afp) {
@@ -1299,7 +1299,7 @@ xfs_ioctl_setattr_xflags(
/* diflags2 only valid for v3 inodes. */
di_flags2 = xfs_flags2diflags2(ip, fa->fsx_xflags);
- if (di_flags2 && ip->i_d.di_version < 3)
+ if (di_flags2 && !xfs_sb_version_has_v3inode(&mp->m_sb))
return -EINVAL;
ip->i_d.di_flags = xfs_flags2diflags(ip, fa->fsx_xflags);
@@ -1510,8 +1510,7 @@ xfs_ioctl_setattr_check_cowextsize(
if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
return 0;
- if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) ||
- ip->i_d.di_version != 3)
+ if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb))
return -EINVAL;
if (fa->fsx_cowextsize == 0)
@@ -1572,9 +1571,9 @@ xfs_ioctl_setattr(
* because the i_*dquot fields will get updated anyway.
*/
if (XFS_IS_QUOTA_ON(mp)) {
- code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
- ip->i_d.di_gid, fa->fsx_projid,
- XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
+ code = xfs_qm_vop_dqalloc(ip, VFS_I(ip)->i_uid,
+ VFS_I(ip)->i_gid, fa->fsx_projid,
+ XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
if (code)
return code;
}
@@ -1597,7 +1596,7 @@ xfs_ioctl_setattr(
}
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
- xfs_get_projid(ip) != fa->fsx_projid) {
+ ip->i_d.di_projid != fa->fsx_projid) {
code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0);
if (code) /* out of quota */
@@ -1634,13 +1633,12 @@ xfs_ioctl_setattr(
VFS_I(ip)->i_mode &= ~(S_ISUID|S_ISGID);
/* Change the ownerships and register project quota modifications */
- if (xfs_get_projid(ip) != fa->fsx_projid) {
+ if (ip->i_d.di_projid != fa->fsx_projid) {
if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
olddquot = xfs_qm_vop_chown(tp, ip,
&ip->i_pdquot, pdqp);
}
- ASSERT(ip->i_d.di_version > 1);
- xfs_set_projid(ip, fa->fsx_projid);
+ ip->i_d.di_projid = fa->fsx_projid;
}
/*
@@ -1652,7 +1650,7 @@ xfs_ioctl_setattr(
ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
else
ip->i_d.di_extsize = 0;
- if (ip->i_d.di_version == 3 &&
+ if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
(ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
ip->i_d.di_cowextsize = fa->fsx_cowextsize >>
mp->m_sb.sb_blocklog;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 239c9548b156..70880422057d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -765,6 +765,11 @@ xfs_iomap_write_unwritten(
*/
resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+ /* Attach dquots so that bmbt splits are accounted correctly. */
+ error = xfs_qm_dqattach(ip);
+ if (error)
+ return error;
+
do {
/*
* Set up a transaction to convert the range of extents
@@ -783,6 +788,11 @@ xfs_iomap_write_unwritten(
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
+ error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
+ XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES);
+ if (error)
+ goto error_on_bmapi_transaction;
+
/*
* Modify the unwritten extent state of the buffer.
*/
@@ -1055,6 +1065,13 @@ xfs_file_iomap_begin(
trace_xfs_iomap_alloc(ip, offset, length, XFS_DATA_FORK, &imap);
out_finish:
+ /*
+ * Writes that span EOF might trigger an IO size update on completion,
+ * so consider them to be dirty for the purposes of O_DSYNC even if
+ * there is no other metadata changes pending or have been made here.
+ */
+ if ((flags & IOMAP_WRITE) && offset + length > i_size_read(inode))
+ iomap->flags |= IOMAP_F_DIRTY;
return xfs_bmbt_to_iomap(ip, iomap, &imap, shared);
out_found:
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ca8c763902b9..a7efc8896e5e 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -20,6 +20,7 @@
#include "xfs_symlink.h"
#include "xfs_dir2.h"
#include "xfs_iomap.h"
+#include "xfs_error.h"
#include <linux/xattr.h>
#include <linux/posix_acl.h>
@@ -470,17 +471,20 @@ xfs_vn_get_link_inline(
struct inode *inode,
struct delayed_call *done)
{
+ struct xfs_inode *ip = XFS_I(inode);
char *link;
- ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE);
+ ASSERT(ip->i_df.if_flags & XFS_IFINLINE);
/*
* The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if
* if_data is junk.
*/
- link = XFS_I(inode)->i_df.if_u1.if_data;
- if (!link)
+ link = ip->i_df.if_u1.if_data;
+ if (!link) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, ip->i_mount);
return ERR_PTR(-EFSCORRUPTED);
+ }
return link;
}
@@ -513,7 +517,7 @@ xfs_vn_getattr(
stat->blocks =
XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks);
- if (ip->i_d.di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
if (request_mask & STATX_BTIME) {
stat->result_mask |= STATX_BTIME;
stat->btime.tv_sec = ip->i_d.di_crtime.t_sec;
@@ -662,9 +666,7 @@ xfs_setattr_nonsize(
*/
ASSERT(udqp == NULL);
ASSERT(gdqp == NULL);
- error = xfs_qm_vop_dqalloc(ip, xfs_kuid_to_uid(uid),
- xfs_kgid_to_gid(gid),
- xfs_get_projid(ip),
+ error = xfs_qm_vop_dqalloc(ip, uid, gid, ip->i_d.di_projid,
qflags, &udqp, &gdqp, NULL);
if (error)
return error;
@@ -733,7 +735,6 @@ xfs_setattr_nonsize(
olddquot1 = xfs_qm_vop_chown(tp, ip,
&ip->i_udquot, udqp);
}
- ip->i_d.di_uid = xfs_kuid_to_uid(uid);
inode->i_uid = uid;
}
if (!gid_eq(igid, gid)) {
@@ -745,7 +746,6 @@ xfs_setattr_nonsize(
olddquot2 = xfs_qm_vop_chown(tp, ip,
&ip->i_gdquot, gdqp);
}
- ip->i_d.di_gid = xfs_kgid_to_gid(gid);
inode->i_gid = gid;
}
}
@@ -1284,9 +1284,6 @@ xfs_setup_inode(
/* make the inode look hashed for the writeback code */
inode_fake_hash(inode);
- inode->i_uid = xfs_uid_to_kuid(ip->i_d.di_uid);
- inode->i_gid = xfs_gid_to_kgid(ip->i_d.di_gid);
-
i_size_write(inode, ip->i_d.di_size);
xfs_diflags_to_iflags(inode, ip);
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 884950adbd16..42e93779374c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -84,10 +84,10 @@ xfs_bulkstat_one_int(
/* xfs_iget returns the following without needing
* further change.
*/
- buf->bs_projectid = xfs_get_projid(ip);
+ buf->bs_projectid = ip->i_d.di_projid;
buf->bs_ino = ino;
- buf->bs_uid = dic->di_uid;
- buf->bs_gid = dic->di_gid;
+ buf->bs_uid = i_uid_read(inode);
+ buf->bs_gid = i_gid_read(inode);
buf->bs_size = dic->di_size;
buf->bs_nlink = inode->i_nlink;
@@ -110,7 +110,7 @@ xfs_bulkstat_one_int(
buf->bs_forkoff = XFS_IFORK_BOFF(ip);
buf->bs_version = XFS_BULKSTAT_VERSION_V5;
- if (dic->di_version == 3) {
+ if (xfs_sb_version_has_v3inode(&mp->m_sb)) {
if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
buf->bs_cowextsize_blks = dic->di_cowextsize;
}
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index aa375cf53021..cc5c0c835884 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -55,6 +55,9 @@ struct xfs_iwalk_ag {
/* Where do we start the traversal? */
xfs_ino_t startino;
+ /* What was the last inode number we saw when iterating the inobt? */
+ xfs_ino_t lastino;
+
/* Array of inobt records we cache. */
struct xfs_inobt_rec_incore *recs;
@@ -300,6 +303,9 @@ xfs_iwalk_ag_start(
return error;
XFS_WANT_CORRUPTED_RETURN(mp, *has_more == 1);
+ iwag->lastino = XFS_AGINO_TO_INO(mp, agno,
+ irec->ir_startino + XFS_INODES_PER_CHUNK - 1);
+
/*
* If the LE lookup yielded an inobt record before the cursor position,
* skip it and see if there's another one after it.
@@ -346,15 +352,17 @@ xfs_iwalk_run_callbacks(
struct xfs_mount *mp = iwag->mp;
struct xfs_trans *tp = iwag->tp;
struct xfs_inobt_rec_incore *irec;
- xfs_agino_t restart;
+ xfs_agino_t next_agino;
int error;
+ next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;
+
ASSERT(iwag->nr_recs > 0);
/* Delete cursor but remember the last record we cached... */
xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
irec = &iwag->recs[iwag->nr_recs - 1];
- restart = irec->ir_startino + XFS_INODES_PER_CHUNK - 1;
+ ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);
error = xfs_iwalk_ag_recs(iwag);
if (error)
@@ -371,7 +379,7 @@ xfs_iwalk_run_callbacks(
if (error)
return error;
- return xfs_inobt_lookup(*curpp, restart, XFS_LOOKUP_GE, has_more);
+ return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
}
/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
@@ -395,6 +403,7 @@ xfs_iwalk_ag(
while (!error && has_more) {
struct xfs_inobt_rec_incore *irec;
+ xfs_ino_t rec_fsino;
cond_resched();
if (xfs_pwork_want_abort(&iwag->pwork))
@@ -406,6 +415,15 @@ xfs_iwalk_ag(
if (error || !has_more)
break;
+ /* Make sure that we always move forward. */
+ rec_fsino = XFS_AGINO_TO_INO(mp, agno, irec->ir_startino);
+ if (iwag->lastino != NULLFSINO && iwag->lastino >= rec_fsino) {
+ ASSERT(iwag->lastino < rec_fsino);
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+ iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;
+
/* No allocated inodes in this chunk; skip it. */
if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
error = xfs_btree_increment(cur, 0, &has_more);
@@ -534,6 +552,7 @@ xfs_iwalk(
.trim_start = 1,
.skip_empty = 1,
.pwork = XFS_PWORK_SINGLE_THREADED,
+ .lastino = NULLFSINO,
};
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
int error;
@@ -622,6 +641,7 @@ xfs_iwalk_threaded(
iwag->data = data;
iwag->startino = startino;
iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
+ iwag->lastino = NULLFSINO;
xfs_pwork_queue(&pctl, &iwag->pwork);
startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
if (flags & XFS_INOBT_WALK_SAME_AG)
@@ -695,6 +715,7 @@ xfs_inobt_walk(
.startino = startino,
.sz_recs = xfs_inobt_walk_prefetch(inobt_records),
.pwork = XFS_PWORK_SINGLE_THREADED,
+ .lastino = NULLFSINO,
};
xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, startino);
int error;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index ca15105681ca..4f6f09157f0d 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -163,32 +163,6 @@ struct xstats {
extern struct xstats xfsstats;
-/* Kernel uid/gid conversion. These are used to convert to/from the on disk
- * uid_t/gid_t types to the kuid_t/kgid_t types that the kernel uses internally.
- * The conversion here is type only, the value will remain the same since we
- * are converting to the init_user_ns. The uid is later mapped to a particular
- * user namespace value when crossing the kernel/user boundary.
- */
-static inline uint32_t xfs_kuid_to_uid(kuid_t uid)
-{
- return from_kuid(&init_user_ns, uid);
-}
-
-static inline kuid_t xfs_uid_to_kuid(uint32_t uid)
-{
- return make_kuid(&init_user_ns, uid);
-}
-
-static inline uint32_t xfs_kgid_to_gid(kgid_t gid)
-{
- return from_kgid(&init_user_ns, gid);
-}
-
-static inline kgid_t xfs_gid_to_kgid(uint32_t gid)
-{
- return make_kgid(&init_user_ns, gid);
-}
-
static inline dev_t xfs_to_linux_dev_t(xfs_dev_t dev)
{
return MKDEV(sysv_major(dev) & 0x1ff, sysv_minor(dev));
@@ -243,6 +217,12 @@ int xfs_rw_bdev(struct block_device *bdev, sector_t sector, unsigned int count,
#endif /* XFS_WARN */
#endif /* DEBUG */
+#define XFS_IS_CORRUPT(mp, expr) \
+ (unlikely(expr) ? xfs_corruption_error(#expr, XFS_ERRLEVEL_LOW, (mp), \
+ NULL, 0, __FILE__, __LINE__, \
+ __this_address), \
+ true : false)
+
#define STATIC static noinline
#ifdef CONFIG_XFS_RT
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7b0d9ad8cb1a..03a52b3919b8 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -369,6 +369,25 @@ xlog_tic_add_region(xlog_ticket_t *tic, uint len, uint type)
tic->t_res_num++;
}
+bool
+xfs_log_writable(
+ struct xfs_mount *mp)
+{
+ /*
+ * Never write to the log on norecovery mounts, if the block device is
+ * read-only, or if the filesystem is shutdown. Read-only mounts still
+ * allow internal writes for log recovery and unmount purposes, so don't
+ * restrict that case here.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return false;
+ if (xfs_readonly_buftarg(mp->m_log->l_targ))
+ return false;
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return false;
+ return true;
+}
+
/*
* Replenish the byte reservation required by moving the grant write head.
*/
@@ -837,19 +856,6 @@ xfs_log_write_unmount_record(
if (error)
goto out_err;
- /*
- * If we think the summary counters are bad, clear the unmount header
- * flag in the unmount record so that the summary counters will be
- * recalculated during log recovery at next mount. Refer to
- * xlog_check_unmount_rec for more details.
- */
- if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS), mp,
- XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
- xfs_alert(mp, "%s: will fix summary counters at next mount",
- __func__);
- flags &= ~XLOG_UNMOUNT_TRANS;
- }
-
/* remove inited flag, and account for space used */
tic->t_flags = 0;
tic->t_curr_res -= sizeof(magic);
@@ -908,15 +914,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
#endif
int error;
- /*
- * Don't write out unmount record on norecovery mounts or ro devices.
- * Or, if we are doing a forced umount (typically because of IO errors).
- */
- if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
- xfs_readonly_buftarg(log->l_targ)) {
- ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ if (!xfs_log_writable(mp))
return 0;
- }
error = xfs_log_force(mp, XFS_LOG_SYNC);
ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
@@ -932,6 +931,19 @@ xfs_log_unmount_write(xfs_mount_t *mp)
} while (iclog != first_iclog);
#endif
if (! (XLOG_FORCED_SHUTDOWN(log))) {
+ /*
+ * If we think the summary counters are bad, avoid writing the
+ * unmount record to force log recovery at next mount, after
+ * which the summary counters will be recalculated. Refer to
+ * xlog_check_unmount_rec for more details.
+ */
+ if (XFS_TEST_ERROR(xfs_fs_has_sickness(mp, XFS_SICK_FS_COUNTERS),
+ mp, XFS_ERRTAG_FORCE_SUMMARY_RECALC)) {
+ xfs_alert(mp,
+ "%s: will fix summary counters at next mount",
+ __func__);
+ return 0;
+ }
xfs_log_write_unmount_record(mp);
} else {
/*
@@ -1537,14 +1549,14 @@ xlog_commit_record(
}
/*
- * Push on the buffer cache code if we ever use more than 75% of the on-disk
- * log space. This code pushes on the lsn which would supposedly free up
- * the 25% which we want to leave free. We may need to adopt a policy which
- * pushes on an lsn which is further along in the log once we reach the high
- * water mark. In this manner, we would be creating a low water mark.
+ * Compute the LSN that we'd need to push the log tail towards in order to have
+ * (a) enough on-disk log space to log the number of bytes specified, (b) at
+ * least 25% of the log space free, and (c) at least 256 blocks free. If the
+ * log free space already meets all three thresholds, this function returns
+ * NULLCOMMITLSN.
*/
-STATIC void
-xlog_grant_push_ail(
+xfs_lsn_t
+xlog_grant_push_threshold(
struct xlog *log,
int need_bytes)
{
@@ -1570,7 +1582,7 @@ xlog_grant_push_ail(
free_threshold = max(free_threshold, (log->l_logBBsize >> 2));
free_threshold = max(free_threshold, 256);
if (free_blocks >= free_threshold)
- return;
+ return NULLCOMMITLSN;
xlog_crack_atomic_lsn(&log->l_tail_lsn, &threshold_cycle,
&threshold_block);
@@ -1590,13 +1602,33 @@ xlog_grant_push_ail(
if (XFS_LSN_CMP(threshold_lsn, last_sync_lsn) > 0)
threshold_lsn = last_sync_lsn;
+ return threshold_lsn;
+}
+
+/*
+ * Push the tail of the log if we need to do so to maintain the free log space
+ * thresholds set out by xlog_grant_push_threshold. We may need to adopt a
+ * policy which pushes on an lsn which is further along in the log once we
+ * reach the high water mark. In this manner, we would be creating a low water
+ * mark.
+ */
+STATIC void
+xlog_grant_push_ail(
+ struct xlog *log,
+ int need_bytes)
+{
+ xfs_lsn_t threshold_lsn;
+
+ threshold_lsn = xlog_grant_push_threshold(log, need_bytes);
+ if (threshold_lsn == NULLCOMMITLSN || XLOG_FORCED_SHUTDOWN(log))
+ return;
+
/*
* Get the transaction layer to kick the dirty buffers out to
* disk asynchronously. No point in trying to do this if
* the filesystem is shutting down.
*/
- if (!XLOG_FORCED_SHUTDOWN(log))
- xfs_ail_push(log->l_ailp, threshold_lsn);
+ xfs_ail_push(log->l_ailp, threshold_lsn);
}
/*
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 84e06805160f..dc9229e7ddaa 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -132,6 +132,7 @@ int xfs_log_reserve(struct xfs_mount *mp,
int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
void xfs_log_unmount(struct xfs_mount *mp);
int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
+bool xfs_log_writable(struct xfs_mount *mp);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
@@ -146,4 +147,6 @@ void xfs_log_quiesce(struct xfs_mount *mp);
bool xfs_log_check_lsn(struct xfs_mount *, xfs_lsn_t);
bool xfs_log_in_recovery(struct xfs_mount *);
+xfs_lsn_t xlog_grant_push_threshold(struct xlog *log, int need_bytes);
+
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index ef652abd112c..ae9b8efcfa54 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -671,6 +671,12 @@ xlog_cil_push(
ASSERT(push_seq <= ctx->sequence);
/*
+ * Wake up any background push waiters now this context is being pushed.
+ */
+ if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
+ wake_up_all(&cil->xc_push_wait);
+
+ /*
* Check if we've anything to push. If there is nothing, then we don't
* move on to a new sequence number and so we have to be able to push
* this sequence again later.
@@ -740,7 +746,7 @@ xlog_cil_push(
/*
* initialise the new context and attach it to the CIL. Then attach
- * the current context to the CIL committing lsit so it can be found
+ * the current context to the CIL committing list so it can be found
* during log forces to extract the commit lsn of the sequence that
* needs to be forced.
*/
@@ -900,7 +906,7 @@ xlog_cil_push_work(
*/
static void
xlog_cil_push_background(
- struct xlog *log)
+ struct xlog *log) __releases(cil->xc_ctx_lock)
{
struct xfs_cil *cil = log->l_cilp;
@@ -914,14 +920,36 @@ xlog_cil_push_background(
* don't do a background push if we haven't used up all the
* space available yet.
*/
- if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+ if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
+ up_read(&cil->xc_ctx_lock);
return;
+ }
spin_lock(&cil->xc_push_lock);
if (cil->xc_push_seq < cil->xc_current_sequence) {
cil->xc_push_seq = cil->xc_current_sequence;
queue_work(log->l_mp->m_cil_workqueue, &cil->xc_push_work);
}
+
+ /*
+ * Drop the context lock now, we can't hold that if we need to sleep
+ * because we are over the blocking threshold. The push_lock is still
+ * held, so blocking threshold sleep/wakeup is still correctly
+ * serialised here.
+ */
+ up_read(&cil->xc_ctx_lock);
+
+ /*
+ * If we are well over the space limit, throttle the work that is being
+ * done until the push work on this context has begun.
+ */
+ if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
+ trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
+ ASSERT(cil->xc_ctx->space_used < log->l_logsize);
+ xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
+ return;
+ }
+
spin_unlock(&cil->xc_push_lock);
}
@@ -1038,9 +1066,9 @@ xfs_log_commit_cil(
if (lip->li_ops->iop_committing)
lip->li_ops->iop_committing(lip, xc_commit_lsn);
}
- xlog_cil_push_background(log);
- up_read(&cil->xc_ctx_lock);
+ /* xlog_cil_push_background() releases cil->xc_ctx_lock */
+ xlog_cil_push_background(log);
}
/*
@@ -1150,21 +1178,19 @@ out_shutdown:
*/
bool
xfs_log_item_in_current_chkpt(
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip)
{
- struct xfs_cil_ctx *ctx;
+ struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp;
if (list_empty(&lip->li_cil))
return false;
- ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
-
/*
* li_seq is written on the first commit of a log item to record the
* first checkpoint it is written to. Hence if it is different to the
* current sequence, we're in a new checkpoint.
*/
- if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
+ if (XFS_LSN_CMP(lip->li_seq, READ_ONCE(cil->xc_current_sequence)) != 0)
return false;
return true;
}
@@ -1194,6 +1220,7 @@ xlog_cil_init(
INIT_LIST_HEAD(&cil->xc_committing);
spin_lock_init(&cil->xc_cil_lock);
spin_lock_init(&cil->xc_push_lock);
+ init_waitqueue_head(&cil->xc_push_wait);
init_rwsem(&cil->xc_ctx_lock);
init_waitqueue_head(&cil->xc_commit_wait);
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index b880c23cb6e4..3a5d7fb09c43 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -280,6 +280,7 @@ struct xfs_cil {
wait_queue_head_t xc_commit_wait;
xfs_lsn_t xc_current_sequence;
struct work_struct xc_push_work;
+ wait_queue_head_t xc_push_wait; /* background push throttle */
} ____cacheline_aligned_in_smp;
/*
@@ -323,13 +324,53 @@ struct xfs_cil {
* tries to keep 25% of the log free, so we need to keep below that limit or we
* risk running out of free log space to start any new transactions.
*
- * In order to keep background CIL push efficient, we will set a lower
- * threshold at which background pushing is attempted without blocking current
- * transaction commits. A separate, higher bound defines when CIL pushes are
- * enforced to ensure we stay within our maximum checkpoint size bounds.
- * threshold, yet give us plenty of space for aggregation on large logs.
+ * In order to keep background CIL push efficient, we only need to ensure the
+ * CIL is large enough to maintain sufficient in-memory relogging to avoid
+ * repeated physical writes of frequently modified metadata. If we allow the CIL
+ * to grow to a substantial fraction of the log, then we may be pinning hundreds
+ * of megabytes of metadata in memory until the CIL flushes. This can cause
+ * issues when we are running low on memory - pinned memory cannot be reclaimed,
+ * and the CIL consumes a lot of memory. Hence we need to set an upper physical
+ * size limit for the CIL that limits the maximum amount of memory pinned by the
+ * CIL but does not limit performance by reducing relogging efficiency
+ * significantly.
+ *
+ * As such, the CIL push threshold ends up being the smaller of two thresholds:
+ * - a threshold large enough that it allows CIL to be pushed and progress to be
+ * made without excessive blocking of incoming transaction commits. This is
+ * defined to be 12.5% of the log space - half the 25% push threshold of the
+ * AIL.
+ * - small enough that it doesn't pin excessive amounts of memory but maintains
+ * close to peak relogging efficiency. This is defined to be 16x the iclog
+ * buffer window (32MB) as measurements have shown this to be roughly the
+ * point of diminishing performance increases under highly concurrent
+ * modification workloads.
+ *
+ * To prevent the CIL from overflowing upper commit size bounds, we introduce a
+ * new threshold at which we block committing transactions until the background
+ * CIL commit commences and switches to a new context. While this is not a hard
+ * limit, it forces the process committing a transaction to the CIL to block and
+ * yeild the CPU, giving the CIL push work a chance to be scheduled and start
+ * work. This prevents a process running lots of transactions from overfilling
+ * the CIL because it is not yielding the CPU. We set the blocking limit at
+ * twice the background push space threshold so we keep in line with the AIL
+ * push thresholds.
+ *
+ * Note: this is not a -hard- limit as blocking is applied after the transaction
+ * is inserted into the CIL and the push has been triggered. It is largely a
+ * throttling mechanism that allows the CIL push to be scheduled and run. A hard
+ * limit will be difficult to implement without introducing global serialisation
+ * in the CIL commit fast path, and it's not at all clear that we actually need
+ * such hard limits given the ~7 years we've run without a hard limit before
+ * finding the first situation where a checkpoint size overflow actually
+ * occurred. Hence the simple throttle, and an ASSERT check to tell us that
+ * we've overrun the max size.
*/
-#define XLOG_CIL_SPACE_LIMIT(log) (log->l_logsize >> 3)
+#define XLOG_CIL_SPACE_LIMIT(log) \
+ min_t(int, (log)->l_logsize >> 3, BBTOB(XLOG_TOTAL_REC_SHIFT(log)) << 4)
+
+#define XLOG_CIL_BLOCKING_SPACE_LIMIT(log) \
+ (XLOG_CIL_SPACE_LIMIT(log) * 2)
/*
* ticket grant locks, queues and accounting have their own cachlines
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c1a514ffff55..d9b906d75dfa 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -471,7 +471,7 @@ xlog_find_verify_log_record(
xfs_warn(log->l_mp,
"Log inconsistent (didn't find previous header)");
ASSERT(0);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto out;
}
@@ -1347,10 +1347,11 @@ xlog_find_tail(
error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
&rhead_blk, &rhead, &wrapped);
if (error < 0)
- return error;
+ goto done;
if (!error) {
xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
- return -EIO;
+ error = -EFSCORRUPTED;
+ goto done;
}
*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
@@ -2205,6 +2206,8 @@ xlog_recover_get_buf_lsn(
case XFS_ABTC_MAGIC:
case XFS_RMAP_CRC_MAGIC:
case XFS_REFC_CRC_MAGIC:
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_FIBT_MAGIC:
case XFS_IBT_CRC_MAGIC:
case XFS_IBT_MAGIC: {
struct xfs_btree_block *btb = blk;
@@ -2576,6 +2579,7 @@ xlog_recover_do_reg_buffer(
int bit;
int nbits;
xfs_failaddr_t fa;
+ const size_t size_disk_dquot = sizeof(struct xfs_disk_dquot);
trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
@@ -2618,7 +2622,7 @@ xlog_recover_do_reg_buffer(
"XFS: NULL dquot in %s.", __func__);
goto next;
}
- if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
+ if (item->ri_buf[i].i_len < size_disk_dquot) {
xfs_alert(mp,
"XFS: dquot too small (%d) in %s.",
item->ri_buf[i].i_len, __func__);
@@ -2779,6 +2783,16 @@ xlog_recover_buffer_pass2(
if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
trace_xfs_log_recover_buf_skip(log, buf_f);
xlog_recover_validate_buf_type(mp, bp, buf_f, NULLCOMMITLSN);
+
+ /*
+ * We're skipping replay of this buffer log item due to the log
+ * item LSN being behind the ondisk buffer. Verify the buffer
+ * contents since we aren't going to run the write verifier.
+ */
+ if (bp->b_ops) {
+ bp->b_ops->verify_read(bp);
+ error = bp->b_error;
+ }
goto out_release;
}
@@ -2875,8 +2889,8 @@ xfs_recover_inode_owner_change(
return -ENOMEM;
/* instantiate the inode */
+ ASSERT(dip->di_version >= 3);
xfs_inode_from_disk(ip, dip);
- ASSERT(ip->i_d.di_version >= 3);
error = xfs_iformat_fork(ip, dip);
if (error)
@@ -3014,7 +3028,7 @@ xlog_recover_inode_pass2(
* superblock flag to determine whether we need to look at di_flushiter
* to skip replay when the on disk inode is newer than the log one
*/
- if (!xfs_sb_version_hascrc(&mp->m_sb) &&
+ if (!xfs_sb_version_has_v3inode(&mp->m_sb) &&
ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
/*
* Deal with the wrap case, DI_MAX_FLUSH is less
@@ -3085,7 +3099,7 @@ xlog_recover_inode_pass2(
error = -EFSCORRUPTED;
goto out_release;
}
- isize = xfs_log_dinode_size(ldip->di_version);
+ isize = xfs_log_dinode_size(mp);
if (unlikely(item->ri_buf[1].i_len > isize)) {
XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
XFS_ERRLEVEL_LOW, mp, ldip,
@@ -3166,7 +3180,7 @@ xlog_recover_inode_pass2(
default:
xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
ASSERT(0);
- error = -EIO;
+ error = -EFSCORRUPTED;
goto out_release;
}
}
@@ -3247,12 +3261,12 @@ xlog_recover_dquot_pass2(
recddq = item->ri_buf[1].i_addr;
if (recddq == NULL) {
xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
- return -EIO;
+ return -EFSCORRUPTED;
}
- if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
+ if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) {
xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
item->ri_buf[1].i_len, __func__);
- return -EIO;
+ return -EFSCORRUPTED;
}
/*
@@ -3279,7 +3293,7 @@ xlog_recover_dquot_pass2(
if (fa) {
xfs_alert(mp, "corrupt dquot ID 0x%x in log at %pS",
dq_f->qlf_id, fa);
- return -EIO;
+ return -EFSCORRUPTED;
}
ASSERT(dq_f->qlf_len == 1);
@@ -3382,7 +3396,7 @@ xlog_recover_efd_pass2(
struct xlog_recover_item *item)
{
xfs_efd_log_format_t *efd_formatp;
- xfs_efi_log_item_t *efip = NULL;
+ struct xfs_efi_log_item *efip = NULL;
struct xfs_log_item *lip;
uint64_t efi_id;
struct xfs_ail_cursor cur;
@@ -3403,7 +3417,7 @@ xlog_recover_efd_pass2(
lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
while (lip != NULL) {
if (lip->li_type == XFS_LI_EFI) {
- efip = (xfs_efi_log_item_t *)lip;
+ efip = (struct xfs_efi_log_item *)lip;
if (efip->efi_format.efi_id == efi_id) {
/*
* Drop the EFD reference to the EFI. This
@@ -3537,6 +3551,7 @@ xfs_cui_copy_format(
memcpy(dst_cui_fmt, src_cui_fmt, len);
return 0;
}
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
}
@@ -3601,8 +3616,10 @@ xlog_recover_cud_pass2(
struct xfs_ail *ailp = log->l_ailp;
cud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format))
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
+ }
cui_id = cud_formatp->cud_cui_id;
/*
@@ -3654,6 +3671,7 @@ xfs_bui_copy_format(
memcpy(dst_bui_fmt, src_bui_fmt, len);
return 0;
}
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
}
@@ -3677,8 +3695,10 @@ xlog_recover_bui_pass2(
bui_formatp = item->ri_buf[0].i_addr;
- if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
+ if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
+ }
buip = xfs_bui_init(mp);
error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
if (error) {
@@ -3720,8 +3740,10 @@ xlog_recover_bud_pass2(
struct xfs_ail *ailp = log->l_ailp;
bud_formatp = item->ri_buf[0].i_addr;
- if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format))
+ if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
+ }
bui_id = bud_formatp->bud_bui_id;
/*
@@ -4018,7 +4040,7 @@ xlog_recover_commit_pass1(
xfs_warn(log->l_mp, "%s: invalid item type (%d)",
__func__, ITEM_TYPE(item));
ASSERT(0);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
@@ -4066,7 +4088,7 @@ xlog_recover_commit_pass2(
xfs_warn(log->l_mp, "%s: invalid item type (%d)",
__func__, ITEM_TYPE(item));
ASSERT(0);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
@@ -4187,7 +4209,7 @@ xlog_recover_add_to_cont_trans(
ASSERT(len <= sizeof(struct xfs_trans_header));
if (len > sizeof(struct xfs_trans_header)) {
xfs_warn(log->l_mp, "%s: bad header length", __func__);
- return -EIO;
+ return -EFSCORRUPTED;
}
xlog_recover_add_item(&trans->r_itemq);
@@ -4243,13 +4265,13 @@ xlog_recover_add_to_trans(
xfs_warn(log->l_mp, "%s: bad header magic number",
__func__);
ASSERT(0);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (len > sizeof(struct xfs_trans_header)) {
xfs_warn(log->l_mp, "%s: bad header length", __func__);
ASSERT(0);
- return -EIO;
+ return -EFSCORRUPTED;
}
/*
@@ -4285,7 +4307,7 @@ xlog_recover_add_to_trans(
in_f->ilf_size);
ASSERT(0);
kmem_free(ptr);
- return -EIO;
+ return -EFSCORRUPTED;
}
item->ri_total = in_f->ilf_size;
@@ -4293,7 +4315,16 @@ xlog_recover_add_to_trans(
kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
0);
}
- ASSERT(item->ri_total > item->ri_cnt);
+
+ if (item->ri_total <= item->ri_cnt) {
+ xfs_warn(log->l_mp,
+ "log item region count (%d) overflowed size (%d)",
+ item->ri_cnt, item->ri_total);
+ ASSERT(0);
+ kmem_free(ptr);
+ return -EFSCORRUPTED;
+ }
+
/* Description region is ri_buf[0] */
item->ri_buf[item->ri_cnt].i_addr = ptr;
item->ri_buf[item->ri_cnt].i_len = len;
@@ -4380,7 +4411,7 @@ xlog_recovery_process_trans(
default:
xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
ASSERT(0);
- error = -EIO;
+ error = -EFSCORRUPTED;
break;
}
if (error || freeit)
@@ -4460,7 +4491,7 @@ xlog_recover_process_ophdr(
xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
__func__, ohead->oh_clientid);
ASSERT(0);
- return -EIO;
+ return -EFSCORRUPTED;
}
/*
@@ -4470,7 +4501,7 @@ xlog_recover_process_ophdr(
if (dp + len > end) {
xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
WARN_ON(1);
- return -EIO;
+ return -EFSCORRUPTED;
}
trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
@@ -4566,9 +4597,9 @@ xlog_recover_process_data(
/* Recover the EFI if necessary. */
STATIC int
xlog_recover_process_efi(
- struct xfs_mount *mp,
struct xfs_ail *ailp,
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
{
struct xfs_efi_log_item *efip;
int error;
@@ -4581,7 +4612,7 @@ xlog_recover_process_efi(
return 0;
spin_unlock(&ailp->ail_lock);
- error = xfs_efi_recover(mp, efip);
+ error = xfs_efi_recover(efip, capture_list);
spin_lock(&ailp->ail_lock);
return error;
@@ -4606,9 +4637,9 @@ xlog_recover_cancel_efi(
/* Recover the RUI if necessary. */
STATIC int
xlog_recover_process_rui(
- struct xfs_mount *mp,
struct xfs_ail *ailp,
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
{
struct xfs_rui_log_item *ruip;
int error;
@@ -4621,7 +4652,7 @@ xlog_recover_process_rui(
return 0;
spin_unlock(&ailp->ail_lock);
- error = xfs_rui_recover(mp, ruip);
+ error = xfs_rui_recover(ruip, capture_list);
spin_lock(&ailp->ail_lock);
return error;
@@ -4646,9 +4677,9 @@ xlog_recover_cancel_rui(
/* Recover the CUI if necessary. */
STATIC int
xlog_recover_process_cui(
- struct xfs_trans *parent_tp,
struct xfs_ail *ailp,
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
{
struct xfs_cui_log_item *cuip;
int error;
@@ -4661,7 +4692,7 @@ xlog_recover_process_cui(
return 0;
spin_unlock(&ailp->ail_lock);
- error = xfs_cui_recover(parent_tp, cuip);
+ error = xfs_cui_recover(cuip, capture_list);
spin_lock(&ailp->ail_lock);
return error;
@@ -4686,9 +4717,9 @@ xlog_recover_cancel_cui(
/* Recover the BUI if necessary. */
STATIC int
xlog_recover_process_bui(
- struct xfs_trans *parent_tp,
struct xfs_ail *ailp,
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip,
+ struct list_head *capture_list)
{
struct xfs_bui_log_item *buip;
int error;
@@ -4701,7 +4732,7 @@ xlog_recover_process_bui(
return 0;
spin_unlock(&ailp->ail_lock);
- error = xfs_bui_recover(parent_tp, buip);
+ error = xfs_bui_recover(buip, capture_list);
spin_lock(&ailp->ail_lock);
return error;
@@ -4740,37 +4771,66 @@ static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
/* Take all the collected deferred ops and finish them in order. */
static int
xlog_finish_defer_ops(
- struct xfs_trans *parent_tp)
+ struct xfs_mount *mp,
+ struct list_head *capture_list)
{
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_defer_capture *dfc, *next;
struct xfs_trans *tp;
- int64_t freeblks;
- uint resblks;
- int error;
+ struct xfs_inode *ip;
+ int error = 0;
- /*
- * We're finishing the defer_ops that accumulated as a result of
- * recovering unfinished intent items during log recovery. We
- * reserve an itruncate transaction because it is the largest
- * permanent transaction type. Since we're the only user of the fs
- * right now, take 93% (15/16) of the available free blocks. Use
- * weird math to avoid a 64-bit division.
- */
- freeblks = percpu_counter_sum(&mp->m_fdblocks);
- if (freeblks <= 0)
- return -ENOSPC;
- resblks = min_t(int64_t, UINT_MAX, freeblks);
- resblks = (resblks * 15) >> 4;
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, resblks,
- 0, XFS_TRANS_RESERVE, &tp);
- if (error)
- return error;
- /* transfer all collected dfops to this transaction */
- xfs_defer_move(tp, parent_tp);
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+ struct xfs_trans_res resv;
+
+ /*
+ * Create a new transaction reservation from the captured
+ * information. Set logcount to 1 to force the new transaction
+ * to regrant every roll so that we can make forward progress
+ * in recovery no matter how full the log might be.
+ */
+ resv.tr_logres = dfc->dfc_logres;
+ resv.tr_logcount = 1;
+ resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
+
+ error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
+ dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
+ if (error)
+ return error;
- return xfs_trans_commit(tp);
+ /*
+ * Transfer to this new transaction all the dfops we captured
+ * from recovering a single intent item.
+ */
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_ops_continue(dfc, tp, &ip);
+
+ error = xfs_trans_commit(tp);
+ if (ip) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ xfs_irele(ip);
+ }
+ if (error)
+ return error;
+ }
+
+ ASSERT(list_empty(capture_list));
+ return 0;
}
+/* Release all the captured defer ops and capture structures in this list. */
+static void
+xlog_abort_defer_ops(
+ struct xfs_mount *mp,
+ struct list_head *capture_list)
+{
+ struct xfs_defer_capture *dfc;
+ struct xfs_defer_capture *next;
+
+ list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
+ list_del_init(&dfc->dfc_list);
+ xfs_defer_ops_release(mp, dfc);
+ }
+}
/*
* When this is called, all of the log intent items which did not have
* corresponding log done items should be in the AIL. What we do now
@@ -4791,35 +4851,23 @@ STATIC int
xlog_recover_process_intents(
struct xlog *log)
{
- struct xfs_trans *parent_tp;
+ LIST_HEAD(capture_list);
struct xfs_ail_cursor cur;
struct xfs_log_item *lip;
struct xfs_ail *ailp;
- int error;
+ int error = 0;
#if defined(DEBUG) || defined(XFS_WARN)
xfs_lsn_t last_lsn;
#endif
- /*
- * The intent recovery handlers commit transactions to complete recovery
- * for individual intents, but any new deferred operations that are
- * queued during that process are held off until the very end. The
- * purpose of this transaction is to serve as a container for deferred
- * operations. Each intent recovery handler must transfer dfops here
- * before its local transaction commits, and we'll finish the entire
- * list below.
- */
- error = xfs_trans_alloc_empty(log->l_mp, &parent_tp);
- if (error)
- return error;
-
ailp = log->l_ailp;
spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
#endif
- while (lip != NULL) {
+ for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+ lip != NULL;
+ lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
/*
* We're done when we see something other than an intent.
* There should be no intents left in the AIL now.
@@ -4841,35 +4889,40 @@ xlog_recover_process_intents(
/*
* NOTE: If your intent processing routine can create more
- * deferred ops, you /must/ attach them to the dfops in this
- * routine or else those subsequent intents will get
+ * deferred ops, you /must/ attach them to the capture list in
+ * the recover routine or else those subsequent intents will be
* replayed in the wrong order!
*/
switch (lip->li_type) {
case XFS_LI_EFI:
- error = xlog_recover_process_efi(log->l_mp, ailp, lip);
+ error = xlog_recover_process_efi(ailp, lip, &capture_list);
break;
case XFS_LI_RUI:
- error = xlog_recover_process_rui(log->l_mp, ailp, lip);
+ error = xlog_recover_process_rui(ailp, lip, &capture_list);
break;
case XFS_LI_CUI:
- error = xlog_recover_process_cui(parent_tp, ailp, lip);
+ error = xlog_recover_process_cui(ailp, lip, &capture_list);
break;
case XFS_LI_BUI:
- error = xlog_recover_process_bui(parent_tp, ailp, lip);
+ error = xlog_recover_process_bui(ailp, lip, &capture_list);
break;
}
if (error)
- goto out;
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ break;
}
-out:
+
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->ail_lock);
- if (!error)
- error = xlog_finish_defer_ops(parent_tp);
- xfs_trans_cancel(parent_tp);
+ if (error)
+ goto err;
+
+ error = xlog_finish_defer_ops(log->l_mp, &capture_list);
+ if (error)
+ goto err;
+ return 0;
+err:
+ xlog_abort_defer_ops(log->l_mp, &capture_list);
return error;
}
@@ -5172,8 +5225,10 @@ xlog_recover_process(
* If the filesystem is CRC enabled, this mismatch becomes a
* fatal log corruption failure.
*/
- if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
return -EFSCORRUPTED;
+ }
}
xlog_unpack_data(rhead, dp, log);
@@ -5200,7 +5255,7 @@ xlog_valid_rec_header(
(be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
__func__, be32_to_cpu(rhead->h_version));
- return -EIO;
+ return -EFSCORRUPTED;
}
/* LR body must have data or it wouldn't have been written */
@@ -5296,8 +5351,12 @@ xlog_do_recovery_pass(
"invalid iclog size (%d bytes), using lsunit (%d bytes)",
h_size, log->l_mp->m_logbsize);
h_size = log->l_mp->m_logbsize;
- } else
- return -EFSCORRUPTED;
+ } else {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW,
+ log->l_mp);
+ error = -EFSCORRUPTED;
+ goto bread_err1;
+ }
}
if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c
index 9804efe525a9..c57e8ad39712 100644
--- a/fs/xfs/xfs_message.c
+++ b/fs/xfs/xfs_message.c
@@ -105,7 +105,7 @@ assfail(char *expr, char *file, int line)
}
void
-xfs_hex_dump(void *p, int length)
+xfs_hex_dump(const void *p, int length)
{
print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1);
}
diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h
index 34447dca97d1..7f040b04b739 100644
--- a/fs/xfs/xfs_message.h
+++ b/fs/xfs/xfs_message.h
@@ -60,6 +60,6 @@ do { \
extern void assfail(char *expr, char *f, int l);
extern void asswarn(char *expr, char *f, int l);
-extern void xfs_hex_dump(void *p, int length);
+extern void xfs_hex_dump(const void *p, int length);
#endif /* __XFS_MESSAGE_H */
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5a0ce0c2c4bb..2277f21c4f14 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -31,7 +31,7 @@
#include "xfs_reflink.h"
#include "xfs_extent_busy.h"
#include "xfs_health.h"
-
+#include "xfs_trace.h"
static DEFINE_MUTEX(xfs_uuid_table_mutex);
static int xfs_uuid_table_size;
@@ -365,66 +365,119 @@ release_buf:
}
/*
- * Update alignment values based on mount options and sb values
+ * If the sunit/swidth change would move the precomputed root inode value, we
+ * must reject the ondisk change because repair will stumble over that.
+ * However, we allow the mount to proceed because we never rejected this
+ * combination before. Returns true to update the sb, false otherwise.
+ */
+static inline int
+xfs_check_new_dalign(
+ struct xfs_mount *mp,
+ int new_dalign,
+ bool *update_sb)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+ xfs_ino_t calc_ino;
+
+ calc_ino = xfs_ialloc_calc_rootino(mp, new_dalign);
+ trace_xfs_check_new_dalign(mp, new_dalign, calc_ino);
+
+ if (sbp->sb_rootino == calc_ino) {
+ *update_sb = true;
+ return 0;
+ }
+
+ xfs_warn(mp,
+"Cannot change stripe alignment; would require moving root inode.");
+
+ /*
+ * XXX: Next time we add a new incompat feature, this should start
+ * returning -EINVAL to fail the mount. Until then, spit out a warning
+ * that we're ignoring the administrator's instructions.
+ */
+ xfs_warn(mp, "Skipping superblock stripe alignment update.");
+ *update_sb = false;
+ return 0;
+}
+
+/*
+ * If we were provided with new sunit/swidth values as mount options, make sure
+ * that they pass basic alignment and superblock feature checks, and convert
+ * them into the same units (FSB) that everything else expects. This step
+ * /must/ be done before computing the inode geometry.
*/
STATIC int
-xfs_update_alignment(xfs_mount_t *mp)
+xfs_validate_new_dalign(
+ struct xfs_mount *mp)
{
- xfs_sb_t *sbp = &(mp->m_sb);
+ if (mp->m_dalign == 0)
+ return 0;
- if (mp->m_dalign) {
+ /*
+ * If stripe unit and stripe width are not multiples
+ * of the fs blocksize turn off alignment.
+ */
+ if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
+ (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+ xfs_warn(mp,
+ "alignment check failed: sunit/swidth vs. blocksize(%d)",
+ mp->m_sb.sb_blocksize);
+ return -EINVAL;
+ } else {
/*
- * If stripe unit and stripe width are not multiples
- * of the fs blocksize turn off alignment.
+ * Convert the stripe unit and width to FSBs.
*/
- if ((BBTOB(mp->m_dalign) & mp->m_blockmask) ||
- (BBTOB(mp->m_swidth) & mp->m_blockmask)) {
+ mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
+ if (mp->m_dalign && (mp->m_sb.sb_agblocks % mp->m_dalign)) {
xfs_warn(mp,
- "alignment check failed: sunit/swidth vs. blocksize(%d)",
- sbp->sb_blocksize);
+ "alignment check failed: sunit/swidth vs. agsize(%d)",
+ mp->m_sb.sb_agblocks);
return -EINVAL;
- } else {
- /*
- * Convert the stripe unit and width to FSBs.
- */
- mp->m_dalign = XFS_BB_TO_FSBT(mp, mp->m_dalign);
- if (mp->m_dalign && (sbp->sb_agblocks % mp->m_dalign)) {
- xfs_warn(mp,
- "alignment check failed: sunit/swidth vs. agsize(%d)",
- sbp->sb_agblocks);
- return -EINVAL;
- } else if (mp->m_dalign) {
- mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
- } else {
- xfs_warn(mp,
- "alignment check failed: sunit(%d) less than bsize(%d)",
- mp->m_dalign, sbp->sb_blocksize);
- return -EINVAL;
- }
- }
-
- /*
- * Update superblock with new values
- * and log changes
- */
- if (xfs_sb_version_hasdalign(sbp)) {
- if (sbp->sb_unit != mp->m_dalign) {
- sbp->sb_unit = mp->m_dalign;
- mp->m_update_sb = true;
- }
- if (sbp->sb_width != mp->m_swidth) {
- sbp->sb_width = mp->m_swidth;
- mp->m_update_sb = true;
- }
+ } else if (mp->m_dalign) {
+ mp->m_swidth = XFS_BB_TO_FSBT(mp, mp->m_swidth);
} else {
xfs_warn(mp,
- "cannot change alignment: superblock does not support data alignment");
+ "alignment check failed: sunit(%d) less than bsize(%d)",
+ mp->m_dalign, mp->m_sb.sb_blocksize);
return -EINVAL;
}
+ }
+
+ if (!xfs_sb_version_hasdalign(&mp->m_sb)) {
+ xfs_warn(mp,
+"cannot change alignment: superblock does not support data alignment");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/* Update alignment values based on mount options and sb values. */
+STATIC int
+xfs_update_alignment(
+ struct xfs_mount *mp)
+{
+ struct xfs_sb *sbp = &mp->m_sb;
+
+ if (mp->m_dalign) {
+ bool update_sb;
+ int error;
+
+ if (sbp->sb_unit == mp->m_dalign &&
+ sbp->sb_width == mp->m_swidth)
+ return 0;
+
+ error = xfs_check_new_dalign(mp, mp->m_dalign, &update_sb);
+ if (error || !update_sb)
+ return error;
+
+ sbp->sb_unit = mp->m_dalign;
+ sbp->sb_width = mp->m_swidth;
+ mp->m_update_sb = true;
} else if ((mp->m_flags & XFS_MOUNT_NOALIGN) != XFS_MOUNT_NOALIGN &&
xfs_sb_version_hasdalign(&mp->m_sb)) {
- mp->m_dalign = sbp->sb_unit;
- mp->m_swidth = sbp->sb_width;
+ mp->m_dalign = sbp->sb_unit;
+ mp->m_swidth = sbp->sb_width;
}
return 0;
@@ -622,6 +675,47 @@ xfs_check_summary_counts(
}
/*
+ * Flush and reclaim dirty inodes in preparation for unmount. Inodes and
+ * internal inode structures can be sitting in the CIL and AIL at this point,
+ * so we need to unpin them, write them back and/or reclaim them before unmount
+ * can proceed.
+ *
+ * An inode cluster that has been freed can have its buffer still pinned in
+ * memory because the transaction is still sitting in a iclog. The stale inodes
+ * on that buffer will be pinned to the buffer until the transaction hits the
+ * disk and the callbacks run. Pushing the AIL will skip the stale inodes and
+ * may never see the pinned buffer, so nothing will push out the iclog and
+ * unpin the buffer.
+ *
+ * Hence we need to force the log to unpin everything first. However, log
+ * forces don't wait for the discards they issue to complete, so we have to
+ * explicitly wait for them to complete here as well.
+ *
+ * Then we can tell the world we are unmounting so that error handling knows
+ * that the filesystem is going away and we should error out anything that we
+ * have been retrying in the background. This will prevent never-ending
+ * retries in AIL pushing from hanging the unmount.
+ *
+ * Finally, we can push the AIL to clean all the remaining dirty objects, then
+ * reclaim the remaining inodes that are still in memory at this point in time.
+ */
+static void
+xfs_unmount_flush_inodes(
+ struct xfs_mount *mp)
+{
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ xfs_extent_busy_wait_all(mp);
+ flush_workqueue(xfs_discard_wq);
+
+ mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+
+ xfs_ail_push_all_sync(mp->m_ail);
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
+ xfs_health_unmount(mp);
+}
+
+/*
* This function does the following on an initial mount of a file system:
* - reads the superblock from disk and init the mount struct
* - if we're a 32-bit kernel, do a size check on the superblock
@@ -692,12 +786,12 @@ xfs_mountfs(
}
/*
- * Check if sb_agblocks is aligned at stripe boundary
- * If sb_agblocks is NOT aligned turn off m_dalign since
- * allocator alignment is within an ag, therefore ag has
- * to be aligned at stripe boundary.
+ * If we were given new sunit/swidth options, do some basic validation
+ * checks and convert the incore dalign and swidth values to the
+ * same units (FSB) that everything else uses. This /must/ happen
+ * before computing the inode geometry.
*/
- error = xfs_update_alignment(mp);
+ error = xfs_validate_new_dalign(mp);
if (error)
goto out;
@@ -708,6 +802,17 @@ xfs_mountfs(
xfs_rmapbt_compute_maxlevels(mp);
xfs_refcountbt_compute_maxlevels(mp);
+ /*
+ * Check if sb_agblocks is aligned at stripe boundary. If sb_agblocks
+ * is NOT aligned turn off m_dalign since allocator alignment is within
+ * an ag, therefore ag has to be aligned at stripe boundary. Note that
+ * we must compute the free space and rmap btree geometry before doing
+ * this.
+ */
+ error = xfs_update_alignment(mp);
+ if (error)
+ goto out;
+
/* enable fail_at_unmount as default */
mp->m_fail_unmount = true;
@@ -983,7 +1088,7 @@ xfs_mountfs(
/* Clean out dquots that might be in memory after quotacheck. */
xfs_qm_unmount(mp);
/*
- * Cancel all delayed reclaim work and reclaim the inodes directly.
+ * Flush all inode reclamation work and flush the log.
* We have to do this /after/ rtunmount and qm_unmount because those
* two will have scheduled delayed reclaim for the rt/quota inodes.
*
@@ -993,11 +1098,8 @@ xfs_mountfs(
* qm_unmount_quotas and therefore rely on qm_unmount to release the
* quota inodes.
*/
- cancel_delayed_work_sync(&mp->m_reclaim_work);
- xfs_reclaim_inodes(mp, SYNC_WAIT);
- xfs_health_unmount(mp);
+ xfs_unmount_flush_inodes(mp);
out_log_dealloc:
- mp->m_flags |= XFS_MOUNT_UNMOUNTING;
xfs_log_mount_cancel(mp);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
@@ -1038,47 +1140,7 @@ xfs_unmountfs(
xfs_rtunmount_inodes(mp);
xfs_irele(mp->m_rootip);
- /*
- * We can potentially deadlock here if we have an inode cluster
- * that has been freed has its buffer still pinned in memory because
- * the transaction is still sitting in a iclog. The stale inodes
- * on that buffer will have their flush locks held until the
- * transaction hits the disk and the callbacks run. the inode
- * flush takes the flush lock unconditionally and with nothing to
- * push out the iclog we will never get that unlocked. hence we
- * need to force the log first.
- */
- xfs_log_force(mp, XFS_LOG_SYNC);
-
- /*
- * Wait for all busy extents to be freed, including completion of
- * any discard operation.
- */
- xfs_extent_busy_wait_all(mp);
- flush_workqueue(xfs_discard_wq);
-
- /*
- * We now need to tell the world we are unmounting. This will allow
- * us to detect that the filesystem is going away and we should error
- * out anything that we have been retrying in the background. This will
- * prevent neverending retries in AIL pushing from hanging the unmount.
- */
- mp->m_flags |= XFS_MOUNT_UNMOUNTING;
-
- /*
- * Flush all pending changes from the AIL.
- */
- xfs_ail_push_all_sync(mp->m_ail);
-
- /*
- * And reclaim all inodes. At this point there should be no dirty
- * inodes and none should be pinned or locked, but use synchronous
- * reclaim just to be sure. We can stop background inode reclaim
- * here as well if it is still running.
- */
- cancel_delayed_work_sync(&mp->m_reclaim_work);
- xfs_reclaim_inodes(mp, SYNC_WAIT);
- xfs_health_unmount(mp);
+ xfs_unmount_flush_inodes(mp);
xfs_qm_unmount(mp);
@@ -1154,8 +1216,7 @@ xfs_fs_writable(
int
xfs_log_sbcount(xfs_mount_t *mp)
{
- /* allow this to proceed during the freeze sequence... */
- if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
+ if (!xfs_log_writable(mp))
return 0;
/*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index fdb60e09a9c5..ca7e0c656cee 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -179,6 +179,11 @@ typedef struct xfs_mount {
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
struct xstats m_stats; /* per-fs stats */
+ /*
+ * Workqueue item so that we can coalesce multiple inode flush attempts
+ * into a single flush.
+ */
+ struct work_struct m_flush_inodes_work;
struct workqueue_struct *m_buf_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
struct workqueue_struct *m_cil_workqueue;
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index f63fe8d924a3..058af699e046 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -147,11 +147,11 @@ xfs_fs_map_blocks(
if (error)
goto out_unlock;
+ ASSERT(!nimaps || imap.br_startblock != DELAYSTARTBLOCK);
+
if (write) {
enum xfs_prealloc_flags flags = 0;
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
-
if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
/*
* xfs_iomap_write_direct() expects to take ownership of
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index ecd8ce152ab1..6b108a4de08f 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -22,6 +22,7 @@
#include "xfs_qm.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
+#include "xfs_error.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -120,12 +121,11 @@ xfs_qm_dqpurge(
{
struct xfs_mount *mp = dqp->q_mount;
struct xfs_quotainfo *qi = mp->m_quotainfo;
+ int error = -EAGAIN;
xfs_dqlock(dqp);
- if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0) {
- xfs_dqunlock(dqp);
- return -EAGAIN;
- }
+ if ((dqp->dq_flags & XFS_DQ_FREEING) || dqp->q_nrefs != 0)
+ goto out_unlock;
dqp->dq_flags |= XFS_DQ_FREEING;
@@ -138,7 +138,6 @@ xfs_qm_dqpurge(
*/
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
- int error;
/*
* We don't care about getting disk errors here. We need
@@ -148,6 +147,9 @@ xfs_qm_dqpurge(
if (!error) {
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
+ } else if (error == -EAGAIN) {
+ dqp->dq_flags &= ~XFS_DQ_FREEING;
+ goto out_unlock;
}
xfs_dqflock(dqp);
}
@@ -173,6 +175,10 @@ xfs_qm_dqpurge(
xfs_qm_dqdestroy(dqp);
return 0;
+
+out_unlock:
+ xfs_dqunlock(dqp);
+ return error;
}
/*
@@ -243,14 +249,14 @@ xfs_qm_unmount_quotas(
STATIC int
xfs_qm_dqattach_one(
- xfs_inode_t *ip,
- xfs_dqid_t id,
- uint type,
- bool doalloc,
- xfs_dquot_t **IO_idqpp)
+ struct xfs_inode *ip,
+ xfs_dqid_t id,
+ uint type,
+ bool doalloc,
+ struct xfs_dquot **IO_idqpp)
{
- xfs_dquot_t *dqp;
- int error;
+ struct xfs_dquot *dqp;
+ int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
error = 0;
@@ -325,23 +331,23 @@ xfs_qm_dqattach_locked(
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
if (XFS_IS_UQUOTA_ON(mp) && !ip->i_udquot) {
- error = xfs_qm_dqattach_one(ip, ip->i_d.di_uid, XFS_DQ_USER,
- doalloc, &ip->i_udquot);
+ error = xfs_qm_dqattach_one(ip, i_uid_read(VFS_I(ip)),
+ XFS_DQ_USER, doalloc, &ip->i_udquot);
if (error)
goto done;
ASSERT(ip->i_udquot);
}
if (XFS_IS_GQUOTA_ON(mp) && !ip->i_gdquot) {
- error = xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP,
- doalloc, &ip->i_gdquot);
+ error = xfs_qm_dqattach_one(ip, i_gid_read(VFS_I(ip)),
+ XFS_DQ_GROUP, doalloc, &ip->i_gdquot);
if (error)
goto done;
ASSERT(ip->i_gdquot);
}
if (XFS_IS_PQUOTA_ON(mp) && !ip->i_pdquot) {
- error = xfs_qm_dqattach_one(ip, xfs_get_projid(ip), XFS_DQ_PROJ,
+ error = xfs_qm_dqattach_one(ip, ip->i_d.di_projid, XFS_DQ_PROJ,
doalloc, &ip->i_pdquot);
if (error)
goto done;
@@ -543,8 +549,8 @@ xfs_qm_set_defquota(
uint type,
xfs_quotainfo_t *qinf)
{
- xfs_dquot_t *dqp;
- struct xfs_def_quota *defq;
+ struct xfs_dquot *dqp;
+ struct xfs_def_quota *defq;
struct xfs_disk_dquot *ddqp;
int error;
@@ -754,11 +760,19 @@ xfs_qm_qino_alloc(
if ((flags & XFS_QMOPT_PQUOTA) &&
(mp->m_sb.sb_gquotino != NULLFSINO)) {
ino = mp->m_sb.sb_gquotino;
- ASSERT(mp->m_sb.sb_pquotino == NULLFSINO);
+ if (mp->m_sb.sb_pquotino != NULLFSINO) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW,
+ mp);
+ return -EFSCORRUPTED;
+ }
} else if ((flags & XFS_QMOPT_GQUOTA) &&
(mp->m_sb.sb_pquotino != NULLFSINO)) {
ino = mp->m_sb.sb_pquotino;
- ASSERT(mp->m_sb.sb_gquotino == NULLFSINO);
+ if (mp->m_sb.sb_gquotino != NULLFSINO) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW,
+ mp);
+ return -EFSCORRUPTED;
+ }
}
if (ino != NULLFSINO) {
error = xfs_iget(mp, NULL, ino, 0, 0, ip);
@@ -866,12 +880,20 @@ xfs_qm_reset_dqcounts(
ddq->d_bcount = 0;
ddq->d_icount = 0;
ddq->d_rtbcount = 0;
- ddq->d_btimer = 0;
- ddq->d_itimer = 0;
- ddq->d_rtbtimer = 0;
- ddq->d_bwarns = 0;
- ddq->d_iwarns = 0;
- ddq->d_rtbwarns = 0;
+
+ /*
+ * dquot id 0 stores the default grace period and the maximum
+ * warning limit that were set by the administrator, so we
+ * should not reset them.
+ */
+ if (ddq->d_id != 0) {
+ ddq->d_btimer = 0;
+ ddq->d_itimer = 0;
+ ddq->d_rtbtimer = 0;
+ ddq->d_bwarns = 0;
+ ddq->d_iwarns = 0;
+ ddq->d_rtbwarns = 0;
+ }
if (xfs_sb_version_hascrc(&mp->m_sb)) {
xfs_update_cksum((char *)&dqb[j],
@@ -1608,8 +1630,8 @@ xfs_qm_dqfree_one(
int
xfs_qm_vop_dqalloc(
struct xfs_inode *ip,
- xfs_dqid_t uid,
- xfs_dqid_t gid,
+ kuid_t uid,
+ kgid_t gid,
prid_t prid,
uint flags,
struct xfs_dquot **O_udqpp,
@@ -1617,6 +1639,8 @@ xfs_qm_vop_dqalloc(
struct xfs_dquot **O_pdqpp)
{
struct xfs_mount *mp = ip->i_mount;
+ struct inode *inode = VFS_I(ip);
+ struct user_namespace *user_ns = inode->i_sb->s_user_ns;
struct xfs_dquot *uq = NULL;
struct xfs_dquot *gq = NULL;
struct xfs_dquot *pq = NULL;
@@ -1630,7 +1654,7 @@ xfs_qm_vop_dqalloc(
xfs_ilock(ip, lockflags);
if ((flags & XFS_QMOPT_INHERIT) && XFS_INHERIT_GID(ip))
- gid = ip->i_d.di_gid;
+ gid = inode->i_gid;
/*
* Attach the dquot(s) to this inode, doing a dquot allocation
@@ -1645,7 +1669,7 @@ xfs_qm_vop_dqalloc(
}
if ((flags & XFS_QMOPT_UQUOTA) && XFS_IS_UQUOTA_ON(mp)) {
- if (ip->i_d.di_uid != uid) {
+ if (!uid_eq(inode->i_uid, uid)) {
/*
* What we need is the dquot that has this uid, and
* if we send the inode to dqget, the uid of the inode
@@ -1656,7 +1680,8 @@ xfs_qm_vop_dqalloc(
* holding ilock.
*/
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, uid, XFS_DQ_USER, true, &uq);
+ error = xfs_qm_dqget(mp, from_kuid(user_ns, uid),
+ XFS_DQ_USER, true, &uq);
if (error) {
ASSERT(error != -ENOENT);
return error;
@@ -1677,9 +1702,10 @@ xfs_qm_vop_dqalloc(
}
}
if ((flags & XFS_QMOPT_GQUOTA) && XFS_IS_GQUOTA_ON(mp)) {
- if (ip->i_d.di_gid != gid) {
+ if (!gid_eq(inode->i_gid, gid)) {
xfs_iunlock(ip, lockflags);
- error = xfs_qm_dqget(mp, gid, XFS_DQ_GROUP, true, &gq);
+ error = xfs_qm_dqget(mp, from_kgid(user_ns, gid),
+ XFS_DQ_GROUP, true, &gq);
if (error) {
ASSERT(error != -ENOENT);
goto error_rele;
@@ -1693,7 +1719,7 @@ xfs_qm_vop_dqalloc(
}
}
if ((flags & XFS_QMOPT_PQUOTA) && XFS_IS_PQUOTA_ON(mp)) {
- if (xfs_get_projid(ip) != prid) {
+ if (ip->i_d.di_projid != prid) {
xfs_iunlock(ip, lockflags);
error = xfs_qm_dqget(mp, (xfs_dqid_t)prid, XFS_DQ_PROJ,
true, &pq);
@@ -1737,14 +1763,14 @@ error_rele:
* Actually transfer ownership, and do dquot modifications.
* These were already reserved.
*/
-xfs_dquot_t *
+struct xfs_dquot *
xfs_qm_vop_chown(
- xfs_trans_t *tp,
- xfs_inode_t *ip,
- xfs_dquot_t **IO_olddq,
- xfs_dquot_t *newdq)
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ struct xfs_dquot **IO_olddq,
+ struct xfs_dquot *newdq)
{
- xfs_dquot_t *prevdq;
+ struct xfs_dquot *prevdq;
uint bfield = XFS_IS_REALTIME_INODE(ip) ?
XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
@@ -1805,7 +1831,7 @@ xfs_qm_vop_chown_reserve(
XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
if (XFS_IS_UQUOTA_ON(mp) && udqp &&
- ip->i_d.di_uid != be32_to_cpu(udqp->q_core.d_id)) {
+ i_uid_read(VFS_I(ip)) != be32_to_cpu(udqp->q_core.d_id)) {
udq_delblks = udqp;
/*
* If there are delayed allocation blocks, then we have to
@@ -1818,7 +1844,7 @@ xfs_qm_vop_chown_reserve(
}
}
if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
- ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) {
+ i_gid_read(VFS_I(ip)) != be32_to_cpu(gdqp->q_core.d_id)) {
gdq_delblks = gdqp;
if (delblks) {
ASSERT(ip->i_gdquot);
@@ -1827,7 +1853,7 @@ xfs_qm_vop_chown_reserve(
}
if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
- xfs_get_projid(ip) != be32_to_cpu(pdqp->q_core.d_id)) {
+ ip->i_d.di_projid != be32_to_cpu(pdqp->q_core.d_id)) {
prjflags = XFS_QMOPT_ENOSPC;
pdq_delblks = pdqp;
if (delblks) {
@@ -1915,20 +1941,21 @@ xfs_qm_vop_create_dqattach(
if (udqp && XFS_IS_UQUOTA_ON(mp)) {
ASSERT(ip->i_udquot == NULL);
- ASSERT(ip->i_d.di_uid == be32_to_cpu(udqp->q_core.d_id));
+ ASSERT(i_uid_read(VFS_I(ip)) == be32_to_cpu(udqp->q_core.d_id));
ip->i_udquot = xfs_qm_dqhold(udqp);
xfs_trans_mod_dquot(tp, udqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (gdqp && XFS_IS_GQUOTA_ON(mp)) {
ASSERT(ip->i_gdquot == NULL);
- ASSERT(ip->i_d.di_gid == be32_to_cpu(gdqp->q_core.d_id));
+ ASSERT(i_gid_read(VFS_I(ip)) == be32_to_cpu(gdqp->q_core.d_id));
+
ip->i_gdquot = xfs_qm_dqhold(gdqp);
xfs_trans_mod_dquot(tp, gdqp, XFS_TRANS_DQ_ICOUNT, 1);
}
if (pdqp && XFS_IS_PQUOTA_ON(mp)) {
ASSERT(ip->i_pdquot == NULL);
- ASSERT(xfs_get_projid(ip) == be32_to_cpu(pdqp->q_core.d_id));
+ ASSERT(ip->i_d.di_projid == be32_to_cpu(pdqp->q_core.d_id));
ip->i_pdquot = xfs_qm_dqhold(pdqp);
xfs_trans_mod_dquot(tp, pdqp, XFS_TRANS_DQ_ICOUNT, 1);
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c
index 5d72e88598b4..fc2fa418919f 100644
--- a/fs/xfs/xfs_qm_bhv.c
+++ b/fs/xfs/xfs_qm_bhv.c
@@ -54,13 +54,13 @@ xfs_fill_statvfs_from_dquot(
*/
void
xfs_qm_statvfs(
- xfs_inode_t *ip,
+ struct xfs_inode *ip,
struct kstatfs *statp)
{
- xfs_mount_t *mp = ip->i_mount;
- xfs_dquot_t *dqp;
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_dquot *dqp;
- if (!xfs_qm_dqget(mp, xfs_get_projid(ip), XFS_DQ_PROJ, false, &dqp)) {
+ if (!xfs_qm_dqget(mp, ip->i_d.di_projid, XFS_DQ_PROJ, false, &dqp)) {
xfs_fill_statvfs_from_dquot(statp, dqp);
xfs_qm_dqput(dqp);
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index da7ad0383037..5d5ac65aa1cc 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -19,9 +19,71 @@
#include "xfs_qm.h"
#include "xfs_icache.h"
-STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
-STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
- uint);
+STATIC int
+xfs_qm_log_quotaoff(
+ struct xfs_mount *mp,
+ struct xfs_qoff_logitem **qoffstartp,
+ uint flags)
+{
+ struct xfs_trans *tp;
+ int error;
+ struct xfs_qoff_logitem *qoffi;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
+ if (error)
+ goto out;
+
+ qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
+ xfs_trans_log_quotaoff_item(tp, qoffi);
+
+ spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
+ spin_unlock(&mp->m_sb_lock);
+
+ xfs_log_sb(tp);
+
+ /*
+ * We have to make sure that the transaction is secure on disk before we
+ * return and actually stop quota accounting. So, make it synchronous.
+ * We don't care about quotoff's performance.
+ */
+ xfs_trans_set_sync(tp);
+ error = xfs_trans_commit(tp);
+ if (error)
+ goto out;
+
+ *qoffstartp = qoffi;
+out:
+ return error;
+}
+
+STATIC int
+xfs_qm_log_quotaoff_end(
+ struct xfs_mount *mp,
+ struct xfs_qoff_logitem **startqoff,
+ uint flags)
+{
+ struct xfs_trans *tp;
+ int error;
+ struct xfs_qoff_logitem *qoffi;
+
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
+ if (error)
+ return error;
+
+ qoffi = xfs_trans_get_qoff_item(tp, *startqoff,
+ flags & XFS_ALL_QUOTA_ACCT);
+ xfs_trans_log_quotaoff_item(tp, qoffi);
+ *startqoff = NULL;
+
+ /*
+ * We have to make sure that the transaction is secure on disk before we
+ * return and actually stop quota accounting. So, make it synchronous.
+ * We don't care about quotoff's performance.
+ */
+ xfs_trans_set_sync(tp);
+ return xfs_trans_commit(tp);
+}
/*
* Turn off quota accounting and/or enforcement for all udquots and/or
@@ -40,7 +102,7 @@ xfs_qm_scall_quotaoff(
uint dqtype;
int error;
uint inactivate_flags;
- xfs_qoff_logitem_t *qoffstart;
+ struct xfs_qoff_logitem *qoffstart = NULL;
/*
* No file system can have quotas enabled on disk but not in core.
@@ -165,7 +227,7 @@ xfs_qm_scall_quotaoff(
* So, we have QUOTAOFF start and end logitems; the start
* logitem won't get overwritten until the end logitem appears...
*/
- error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
+ error = xfs_qm_log_quotaoff_end(mp, &qoffstart, flags);
if (error) {
/* We're screwed now. Shutdown is the only option. */
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -198,6 +260,8 @@ xfs_qm_scall_quotaoff(
}
out_unlock:
+ if (error && qoffstart)
+ xfs_qm_qoff_logitem_relse(qoffstart);
mutex_unlock(&q->qi_quotaofflock);
return error;
}
@@ -538,74 +602,6 @@ out_unlock:
return error;
}
-STATIC int
-xfs_qm_log_quotaoff_end(
- xfs_mount_t *mp,
- xfs_qoff_logitem_t *startqoff,
- uint flags)
-{
- xfs_trans_t *tp;
- int error;
- xfs_qoff_logitem_t *qoffi;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_equotaoff, 0, 0, 0, &tp);
- if (error)
- return error;
-
- qoffi = xfs_trans_get_qoff_item(tp, startqoff,
- flags & XFS_ALL_QUOTA_ACCT);
- xfs_trans_log_quotaoff_item(tp, qoffi);
-
- /*
- * We have to make sure that the transaction is secure on disk before we
- * return and actually stop quota accounting. So, make it synchronous.
- * We don't care about quotoff's performance.
- */
- xfs_trans_set_sync(tp);
- return xfs_trans_commit(tp);
-}
-
-
-STATIC int
-xfs_qm_log_quotaoff(
- xfs_mount_t *mp,
- xfs_qoff_logitem_t **qoffstartp,
- uint flags)
-{
- xfs_trans_t *tp;
- int error;
- xfs_qoff_logitem_t *qoffi;
-
- *qoffstartp = NULL;
-
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_qm_quotaoff, 0, 0, 0, &tp);
- if (error)
- goto out;
-
- qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT);
- xfs_trans_log_quotaoff_item(tp, qoffi);
-
- spin_lock(&mp->m_sb_lock);
- mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
- spin_unlock(&mp->m_sb_lock);
-
- xfs_log_sb(tp);
-
- /*
- * We have to make sure that the transaction is secure on disk before we
- * return and actually stop quota accounting. So, make it synchronous.
- * We don't care about quotoff's performance.
- */
- xfs_trans_set_sync(tp);
- error = xfs_trans_commit(tp);
- if (error)
- goto out;
-
- *qoffstartp = qoffi;
-out:
- return error;
-}
-
/* Fill out the quota context. */
static void
xfs_qm_scall_getquota_fill_qc(
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index efe42ae7a2f3..aa8fc1f55fbd 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -86,7 +86,7 @@ extern int xfs_trans_reserve_quota_bydquots(struct xfs_trans *,
struct xfs_mount *, struct xfs_dquot *,
struct xfs_dquot *, struct xfs_dquot *, int64_t, long, uint);
-extern int xfs_qm_vop_dqalloc(struct xfs_inode *, xfs_dqid_t, xfs_dqid_t,
+extern int xfs_qm_vop_dqalloc(struct xfs_inode *, kuid_t, kgid_t,
prid_t, uint, struct xfs_dquot **, struct xfs_dquot **,
struct xfs_dquot **);
extern void xfs_qm_vop_create_dqattach(struct xfs_trans *, struct xfs_inode *,
@@ -109,7 +109,7 @@ extern void xfs_qm_unmount_quotas(struct xfs_mount *);
#else
static inline int
-xfs_qm_vop_dqalloc(struct xfs_inode *ip, xfs_dqid_t uid, xfs_dqid_t gid,
+xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
prid_t prid, uint flags, struct xfs_dquot **udqp,
struct xfs_dquot **gdqp, struct xfs_dquot **pdqp)
{
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 2328268e6245..fa1018a6e677 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -17,7 +17,7 @@
#include "xfs_refcount_item.h"
#include "xfs_log.h"
#include "xfs_refcount.h"
-
+#include "xfs_error.h"
kmem_zone_t *xfs_cui_zone;
kmem_zone_t *xfs_cud_zone;
@@ -123,40 +123,6 @@ xfs_cui_item_release(
xfs_cui_release(CUI_ITEM(lip));
}
-static const struct xfs_item_ops xfs_cui_item_ops = {
- .iop_size = xfs_cui_item_size,
- .iop_format = xfs_cui_item_format,
- .iop_unpin = xfs_cui_item_unpin,
- .iop_release = xfs_cui_item_release,
-};
-
-/*
- * Allocate and initialize an cui item with the given number of extents.
- */
-struct xfs_cui_log_item *
-xfs_cui_init(
- struct xfs_mount *mp,
- uint nextents)
-
-{
- struct xfs_cui_log_item *cuip;
-
- ASSERT(nextents > 0);
- if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
- cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
- 0);
- else
- cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
-
- xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
- cuip->cui_format.cui_nextents = nextents;
- cuip->cui_format.cui_id = (uintptr_t)(void *)cuip;
- atomic_set(&cuip->cui_next_extent, 0);
- atomic_set(&cuip->cui_refcount, 2);
-
- return cuip;
-}
-
static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip)
{
return container_of(lip, struct xfs_cud_log_item, cud_item);
@@ -284,27 +250,6 @@ xfs_refcount_update_diff_items(
XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
}
-/* Get an CUI. */
-STATIC void *
-xfs_refcount_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_cui_log_item *cuip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- cuip = xfs_cui_init(tp->t_mountp, count);
- ASSERT(cuip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &cuip->cui_item);
- return cuip;
-}
-
/* Set the phys extent flags for this reverse mapping. */
static void
xfs_trans_set_refcount_flags(
@@ -328,16 +273,12 @@ xfs_trans_set_refcount_flags(
STATIC void
xfs_refcount_update_log_item(
struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
+ struct xfs_cui_log_item *cuip,
+ struct xfs_refcount_intent *refc)
{
- struct xfs_cui_log_item *cuip = intent;
- struct xfs_refcount_intent *refc;
uint next_extent;
struct xfs_phys_extent *ext;
- refc = container_of(item, struct xfs_refcount_intent, ri_list);
-
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
@@ -354,14 +295,35 @@ xfs_refcount_update_log_item(
xfs_trans_set_refcount_flags(ext, refc->ri_type);
}
+static struct xfs_log_item *
+xfs_refcount_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_cui_log_item *cuip = xfs_cui_init(mp, count);
+ struct xfs_refcount_intent *refc;
+
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &cuip->cui_item);
+ if (sort)
+ list_sort(mp, items, xfs_refcount_update_diff_items);
+ list_for_each_entry(refc, items, ri_list)
+ xfs_refcount_update_log_item(tp, cuip, refc);
+ return &cuip->cui_item;
+}
+
/* Get an CUD so we can process all the deferred refcount updates. */
STATIC void *
xfs_refcount_update_create_done(
struct xfs_trans *tp,
- void *intent,
+ struct xfs_log_item *intent,
unsigned int count)
{
- return xfs_trans_get_cud(tp, intent);
+ return xfs_trans_get_cud(tp, CUI_ITEM(intent));
}
/* Process a deferred refcount update. */
@@ -411,9 +373,9 @@ xfs_refcount_update_finish_cleanup(
/* Abort all pending CUIs. */
STATIC void
xfs_refcount_update_abort_intent(
- void *intent)
+ struct xfs_log_item *intent)
{
- xfs_cui_release(intent);
+ xfs_cui_release(CUI_ITEM(intent));
}
/* Cancel a deferred refcount update. */
@@ -429,10 +391,8 @@ xfs_refcount_update_cancel_item(
const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
.max_items = XFS_CUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_refcount_update_diff_items,
.create_intent = xfs_refcount_update_create_intent,
.abort_intent = xfs_refcount_update_abort_intent,
- .log_item = xfs_refcount_update_log_item,
.create_done = xfs_refcount_update_create_done,
.finish_item = xfs_refcount_update_finish_item,
.finish_cleanup = xfs_refcount_update_finish_cleanup,
@@ -445,8 +405,8 @@ const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
*/
int
xfs_cui_recover(
- struct xfs_trans *parent_tp,
- struct xfs_cui_log_item *cuip)
+ struct xfs_cui_log_item *cuip,
+ struct list_head *capture_list)
{
int i;
int error = 0;
@@ -462,7 +422,7 @@ xfs_cui_recover(
xfs_extlen_t new_len;
struct xfs_bmbt_irec irec;
bool requeue_only = false;
- struct xfs_mount *mp = parent_tp->t_mountp;
+ struct xfs_mount *mp = cuip->cui_item.li_mountp;
ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags));
@@ -497,7 +457,7 @@ xfs_cui_recover(
*/
set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
xfs_cui_release(cuip);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
@@ -517,12 +477,7 @@ xfs_cui_recover(
mp->m_refc_maxlevels * 2, 0, XFS_TRANS_RESERVE, &tp);
if (error)
return error;
- /*
- * Recovery stashes all deferred ops during intent processing and
- * finishes them on completion. Transfer current dfops state to this
- * transaction and transfer the result back before we return.
- */
- xfs_defer_move(tp, parent_tp);
+
cudp = xfs_trans_get_cud(tp, cuip);
for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
@@ -536,6 +491,7 @@ xfs_cui_recover(
type = refc_type;
break;
default:
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
error = -EFSCORRUPTED;
goto abort_error;
}
@@ -579,13 +535,71 @@ xfs_cui_recover(
xfs_refcount_finish_one_cleanup(tp, rcur, error);
set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
- xfs_defer_move(parent_tp, tp);
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error:
xfs_refcount_finish_one_cleanup(tp, rcur, error);
- xfs_defer_move(parent_tp, tp);
xfs_trans_cancel(tp);
return error;
}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_cui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_cud_log_item *cudp;
+ struct xfs_cui_log_item *cuip;
+ struct xfs_phys_extent *extp;
+ unsigned int count;
+
+ count = CUI_ITEM(intent)->cui_format.cui_nextents;
+ extp = CUI_ITEM(intent)->cui_format.cui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
+
+ cuip = xfs_cui_init(tp->t_mountp, count);
+ memcpy(cuip->cui_format.cui_extents, extp, count * sizeof(*extp));
+ atomic_set(&cuip->cui_next_extent, count);
+ xfs_trans_add_item(tp, &cuip->cui_item);
+ set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
+ return &cuip->cui_item;
+}
+
+static const struct xfs_item_ops xfs_cui_item_ops = {
+ .iop_size = xfs_cui_item_size,
+ .iop_format = xfs_cui_item_format,
+ .iop_unpin = xfs_cui_item_unpin,
+ .iop_release = xfs_cui_item_release,
+ .iop_relog = xfs_cui_item_relog,
+};
+
+/*
+ * Allocate and initialize an cui item with the given number of extents.
+ */
+struct xfs_cui_log_item *
+xfs_cui_init(
+ struct xfs_mount *mp,
+ uint nextents)
+
+{
+ struct xfs_cui_log_item *cuip;
+
+ ASSERT(nextents > 0);
+ if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
+ cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
+ 0);
+ else
+ cuip = kmem_zone_zalloc(xfs_cui_zone, 0);
+
+ xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
+ cuip->cui_format.cui_nextents = nextents;
+ cuip->cui_format.cui_id = (uintptr_t)(void *)cuip;
+ atomic_set(&cuip->cui_next_extent, 0);
+ atomic_set(&cuip->cui_refcount, 2);
+
+ return cuip;
+}
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
index e47530f30489..de5f48ff4f74 100644
--- a/fs/xfs/xfs_refcount_item.h
+++ b/fs/xfs/xfs_refcount_item.h
@@ -80,6 +80,7 @@ extern struct kmem_zone *xfs_cud_zone;
struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint);
void xfs_cui_item_free(struct xfs_cui_log_item *);
void xfs_cui_release(struct xfs_cui_log_item *);
-int xfs_cui_recover(struct xfs_trans *parent_tp, struct xfs_cui_log_item *cuip);
+int xfs_cui_recover(struct xfs_cui_log_item *cuip,
+ struct list_head *capture_list);
#endif /* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 904d8285c226..01191ff46647 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -181,7 +181,7 @@ xfs_reflink_trim_around_shared(
int error = 0;
/* Holes, unwritten, and delalloc extents cannot be shared */
- if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_real_extent(irec)) {
+ if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
*shared = false;
return 0;
}
@@ -657,7 +657,7 @@ xfs_reflink_end_cow_extent(
* preallocations can leak into the range we are called upon, and we
* need to skip them.
*/
- if (!xfs_bmap_is_real_extent(&got)) {
+ if (!xfs_bmap_is_written_extent(&got)) {
*end_fsb = del.br_startoff;
goto out_cancel;
}
@@ -986,41 +986,28 @@ xfs_reflink_ag_has_free_space(
}
/*
- * Unmap a range of blocks from a file, then map other blocks into the hole.
- * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
- * The extent irec is mapped into dest at irec->br_startoff.
+ * Remap the given extent into the file. The dmap blockcount will be set to
+ * the number of blocks that were actually remapped.
*/
STATIC int
xfs_reflink_remap_extent(
struct xfs_inode *ip,
- struct xfs_bmbt_irec *irec,
- xfs_fileoff_t destoff,
+ struct xfs_bmbt_irec *dmap,
xfs_off_t new_isize)
{
+ struct xfs_bmbt_irec smap;
struct xfs_mount *mp = ip->i_mount;
- bool real_extent = xfs_bmap_is_real_extent(irec);
struct xfs_trans *tp;
- unsigned int resblks;
- struct xfs_bmbt_irec uirec;
- xfs_filblks_t rlen;
- xfs_filblks_t unmap_len;
xfs_off_t newlen;
- int64_t qres;
+ int64_t qres, qdelta;
+ unsigned int resblks;
+ bool smap_real;
+ bool dmap_written = xfs_bmap_is_written_extent(dmap);
+ int nimaps;
int error;
- unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
- trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
-
- /* No reflinking if we're low on space */
- if (real_extent) {
- error = xfs_reflink_ag_has_free_space(mp,
- XFS_FSB_TO_AGNO(mp, irec->br_startblock));
- if (error)
- goto out;
- }
-
/* Start a rolling transaction to switch the mappings */
- resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+ resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
if (error)
goto out;
@@ -1029,92 +1016,121 @@ xfs_reflink_remap_extent(
xfs_trans_ijoin(tp, ip, 0);
/*
- * Reserve quota for this operation. We don't know if the first unmap
- * in the dest file will cause a bmap btree split, so we always reserve
- * at least enough blocks for that split. If the extent being mapped
- * in is written, we need to reserve quota for that too.
+ * Read what's currently mapped in the destination file into smap.
+ * If smap isn't a hole, we will have to remove it before we can add
+ * dmap to the destination file.
*/
- qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
- if (real_extent)
- qres += irec->br_blockcount;
- error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
- XFS_QMOPT_RES_REGBLKS);
+ nimaps = 1;
+ error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
+ &smap, &nimaps, 0);
if (error)
goto out_cancel;
+ ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
+ smap_real = xfs_bmap_is_real_extent(&smap);
- trace_xfs_reflink_remap(ip, irec->br_startoff,
- irec->br_blockcount, irec->br_startblock);
+ /*
+ * We can only remap as many blocks as the smaller of the two extent
+ * maps, because we can only remap one extent at a time.
+ */
+ dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
+ ASSERT(dmap->br_blockcount == smap.br_blockcount);
- /* Unmap the old blocks in the data fork. */
- rlen = unmap_len;
- while (rlen) {
- ASSERT(tp->t_firstblock == NULLFSBLOCK);
- error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1);
+ trace_xfs_reflink_remap_extent_dest(ip, &smap);
+
+ /* No reflinking if the AG of the dest mapping is low on space. */
+ if (dmap_written) {
+ error = xfs_reflink_ag_has_free_space(mp,
+ XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
if (error)
goto out_cancel;
+ }
+ /*
+ * Compute quota reservation if we think the quota block counter for
+ * this file could increase.
+ *
+ * We start by reserving enough blocks to handle a bmbt split.
+ *
+ * If we are mapping a written extent into the file, we need to have
+ * enough quota block count reservation to handle the blocks in that
+ * extent.
+ *
+ * Note that if we're replacing a delalloc reservation with a written
+ * extent, we have to take the full quota reservation because removing
+ * the delalloc reservation gives the block count back to the quota
+ * count. This is suboptimal, but the VFS flushed the dest range
+ * before we started. That should have removed all the delalloc
+ * reservations, but we code defensively.
+ */
+ qdelta = 0;
+ qres = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
+ if (dmap_written)
+ qres += dmap->br_blockcount;
+ error = xfs_trans_reserve_quota_nblks(tp, ip, qres, 0,
+ XFS_QMOPT_RES_REGBLKS);
+ if (error)
+ goto out_cancel;
+
+ if (smap_real) {
/*
- * Trim the extent to whatever got unmapped.
- * Remember, bunmapi works backwards.
+ * If the extent we're unmapping is backed by storage (written
+ * or not), unmap the extent and drop its refcount.
*/
- uirec.br_startblock = irec->br_startblock + rlen;
- uirec.br_startoff = irec->br_startoff + rlen;
- uirec.br_blockcount = unmap_len - rlen;
- uirec.br_state = irec->br_state;
- unmap_len = rlen;
-
- /* If this isn't a real mapping, we're done. */
- if (!real_extent || uirec.br_blockcount == 0)
- goto next_extent;
-
- trace_xfs_reflink_remap(ip, uirec.br_startoff,
- uirec.br_blockcount, uirec.br_startblock);
+ xfs_bmap_unmap_extent(tp, ip, &smap);
+ xfs_refcount_decrease_extent(tp, &smap);
+ qdelta -= smap.br_blockcount;
+ } else if (smap.br_startblock == DELAYSTARTBLOCK) {
+ xfs_filblks_t len = smap.br_blockcount;
- /* Update the refcount tree */
- xfs_refcount_increase_extent(tp, &uirec);
-
- /* Map the new blocks into the data fork. */
- xfs_bmap_map_extent(tp, ip, &uirec);
+ /*
+ * If the extent we're unmapping is a delalloc reservation,
+ * we can use the regular bunmapi function to release the
+ * incore state. Dropping the delalloc reservation takes care
+ * of the quota reservation for us.
+ */
+ error = __xfs_bunmapi(NULL, ip, smap.br_startoff, &len, 0, 1);
+ if (error)
+ goto out_cancel;
+ ASSERT(len == 0);
+ }
- /* Update quota accounting. */
- xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
- uirec.br_blockcount);
+ /*
+ * If the extent we're sharing is backed by written storage, increase
+ * its refcount and map it into the file.
+ */
+ if (dmap_written) {
+ xfs_refcount_increase_extent(tp, dmap);
+ xfs_bmap_map_extent(tp, ip, dmap);
+ qdelta += dmap->br_blockcount;
+ }
- /* Update dest isize if needed. */
- newlen = XFS_FSB_TO_B(mp,
- uirec.br_startoff + uirec.br_blockcount);
- newlen = min_t(xfs_off_t, newlen, new_isize);
- if (newlen > i_size_read(VFS_I(ip))) {
- trace_xfs_reflink_update_inode_size(ip, newlen);
- i_size_write(VFS_I(ip), newlen);
- ip->i_d.di_size = newlen;
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- }
+ xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
-next_extent:
- /* Process all the deferred stuff. */
- error = xfs_defer_finish(&tp);
- if (error)
- goto out_cancel;
+ /* Update dest isize if needed. */
+ newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
+ newlen = min_t(xfs_off_t, newlen, new_isize);
+ if (newlen > i_size_read(VFS_I(ip))) {
+ trace_xfs_reflink_update_inode_size(ip, newlen);
+ i_size_write(VFS_I(ip), newlen);
+ ip->i_d.di_size = newlen;
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
}
+ /* Commit everything and unlock. */
error = xfs_trans_commit(tp);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- if (error)
- goto out;
- return 0;
+ goto out_unlock;
out_cancel:
xfs_trans_cancel(tp);
+out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
- trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
+ if (error)
+ trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
return error;
}
-/*
- * Iteratively remap one file's extents (and holes) to another's.
- */
+/* Remap a range of one file to the other. */
int
xfs_reflink_remap_blocks(
struct xfs_inode *src,
@@ -1125,25 +1141,22 @@ xfs_reflink_remap_blocks(
loff_t *remapped)
{
struct xfs_bmbt_irec imap;
- xfs_fileoff_t srcoff;
- xfs_fileoff_t destoff;
+ struct xfs_mount *mp = src->i_mount;
+ xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in);
+ xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out);
xfs_filblks_t len;
- xfs_filblks_t range_len;
xfs_filblks_t remapped_len = 0;
xfs_off_t new_isize = pos_out + remap_len;
int nimaps;
int error = 0;
- destoff = XFS_B_TO_FSBT(src->i_mount, pos_out);
- srcoff = XFS_B_TO_FSBT(src->i_mount, pos_in);
- len = XFS_B_TO_FSB(src->i_mount, remap_len);
+ len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
+ XFS_MAX_FILEOFF);
- /* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
- while (len) {
- uint lock_mode;
+ trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
- trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
- dest, destoff);
+ while (len > 0) {
+ unsigned int lock_mode;
/* Read extent from the source file */
nimaps = 1;
@@ -1152,18 +1165,25 @@ xfs_reflink_remap_blocks(
xfs_iunlock(src, lock_mode);
if (error)
break;
- ASSERT(nimaps == 1);
-
- trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_DATA_FORK,
- &imap);
+ /*
+ * The caller supposedly flushed all dirty pages in the source
+ * file range, which means that writeback should have allocated
+ * or deleted all delalloc reservations in that range. If we
+ * find one, that's a good sign that something is seriously
+ * wrong here.
+ */
+ ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
+ if (imap.br_startblock == DELAYSTARTBLOCK) {
+ ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
+ error = -EFSCORRUPTED;
+ break;
+ }
- /* Translate imap into the destination file. */
- range_len = imap.br_startoff + imap.br_blockcount - srcoff;
- imap.br_startoff += destoff - srcoff;
+ trace_xfs_reflink_remap_extent_src(src, &imap);
- /* Clear dest from destoff to the end of imap and map it in. */
- error = xfs_reflink_remap_extent(dest, &imap, destoff,
- new_isize);
+ /* Remap into the destination file at the given offset. */
+ imap.br_startoff = destoff;
+ error = xfs_reflink_remap_extent(dest, &imap, new_isize);
if (error)
break;
@@ -1173,10 +1193,10 @@ xfs_reflink_remap_blocks(
}
/* Advance drange/srange */
- srcoff += range_len;
- destoff += range_len;
- len -= range_len;
- remapped_len += range_len;
+ srcoff += imap.br_blockcount;
+ destoff += imap.br_blockcount;
+ len -= imap.br_blockcount;
+ remapped_len += imap.br_blockcount;
}
if (error)
@@ -1427,7 +1447,7 @@ xfs_reflink_dirty_extents(
goto out;
if (nmaps == 0)
break;
- if (!xfs_bmap_is_real_extent(&map[0]))
+ if (!xfs_bmap_is_written_extent(&map[0]))
goto next;
map[1] = map[0];
@@ -1544,7 +1564,8 @@ xfs_reflink_clear_inode_flag(
* We didn't find any shared blocks so turn off the reflink flag.
* First, get rid of any leftover CoW mappings.
*/
- error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF, true);
+ error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
+ true);
if (error)
return error;
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 8939e0ea09cd..ba1dbb6c4063 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -17,7 +17,7 @@
#include "xfs_rmap_item.h"
#include "xfs_log.h"
#include "xfs_rmap.h"
-
+#include "xfs_error.h"
kmem_zone_t *xfs_rui_zone;
kmem_zone_t *xfs_rud_zone;
@@ -122,39 +122,6 @@ xfs_rui_item_release(
xfs_rui_release(RUI_ITEM(lip));
}
-static const struct xfs_item_ops xfs_rui_item_ops = {
- .iop_size = xfs_rui_item_size,
- .iop_format = xfs_rui_item_format,
- .iop_unpin = xfs_rui_item_unpin,
- .iop_release = xfs_rui_item_release,
-};
-
-/*
- * Allocate and initialize an rui item with the given number of extents.
- */
-struct xfs_rui_log_item *
-xfs_rui_init(
- struct xfs_mount *mp,
- uint nextents)
-
-{
- struct xfs_rui_log_item *ruip;
-
- ASSERT(nextents > 0);
- if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
- ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
- else
- ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
-
- xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
- ruip->rui_format.rui_nextents = nextents;
- ruip->rui_format.rui_id = (uintptr_t)(void *)ruip;
- atomic_set(&ruip->rui_next_extent, 0);
- atomic_set(&ruip->rui_refcount, 2);
-
- return ruip;
-}
-
/*
* Copy an RUI format buffer from the given buf, and into the destination
* RUI format structure. The RUI/RUD items were designed not to need any
@@ -171,8 +138,10 @@ xfs_rui_copy_format(
src_rui_fmt = buf->i_addr;
len = xfs_rui_log_format_sizeof(src_rui_fmt->rui_nextents);
- if (buf->i_len != len)
+ if (buf->i_len != len) {
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
return -EFSCORRUPTED;
+ }
memcpy(dst_rui_fmt, src_rui_fmt, len);
return 0;
@@ -350,41 +319,16 @@ xfs_rmap_update_diff_items(
XFS_FSB_TO_AGNO(mp, rb->ri_bmap.br_startblock);
}
-/* Get an RUI. */
-STATIC void *
-xfs_rmap_update_create_intent(
- struct xfs_trans *tp,
- unsigned int count)
-{
- struct xfs_rui_log_item *ruip;
-
- ASSERT(tp != NULL);
- ASSERT(count > 0);
-
- ruip = xfs_rui_init(tp->t_mountp, count);
- ASSERT(ruip != NULL);
-
- /*
- * Get a log_item_desc to point at the new item.
- */
- xfs_trans_add_item(tp, &ruip->rui_item);
- return ruip;
-}
-
/* Log rmap updates in the intent item. */
STATIC void
xfs_rmap_update_log_item(
struct xfs_trans *tp,
- void *intent,
- struct list_head *item)
+ struct xfs_rui_log_item *ruip,
+ struct xfs_rmap_intent *rmap)
{
- struct xfs_rui_log_item *ruip = intent;
- struct xfs_rmap_intent *rmap;
uint next_extent;
struct xfs_map_extent *map;
- rmap = container_of(item, struct xfs_rmap_intent, ri_list);
-
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
@@ -404,14 +348,35 @@ xfs_rmap_update_log_item(
rmap->ri_bmap.br_state);
}
+static struct xfs_log_item *
+xfs_rmap_update_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_rui_log_item *ruip = xfs_rui_init(mp, count);
+ struct xfs_rmap_intent *rmap;
+
+ ASSERT(count > 0);
+
+ xfs_trans_add_item(tp, &ruip->rui_item);
+ if (sort)
+ list_sort(mp, items, xfs_rmap_update_diff_items);
+ list_for_each_entry(rmap, items, ri_list)
+ xfs_rmap_update_log_item(tp, ruip, rmap);
+ return &ruip->rui_item;
+}
+
/* Get an RUD so we can process all the deferred rmap updates. */
STATIC void *
xfs_rmap_update_create_done(
struct xfs_trans *tp,
- void *intent,
+ struct xfs_log_item *intent,
unsigned int count)
{
- return xfs_trans_get_rud(tp, intent);
+ return xfs_trans_get_rud(tp, RUI_ITEM(intent));
}
/* Process a deferred rmap update. */
@@ -453,9 +418,9 @@ xfs_rmap_update_finish_cleanup(
/* Abort all pending RUIs. */
STATIC void
xfs_rmap_update_abort_intent(
- void *intent)
+ struct xfs_log_item *intent)
{
- xfs_rui_release(intent);
+ xfs_rui_release(RUI_ITEM(intent));
}
/* Cancel a deferred rmap update. */
@@ -471,10 +436,8 @@ xfs_rmap_update_cancel_item(
const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
.max_items = XFS_RUI_MAX_FAST_EXTENTS,
- .diff_items = xfs_rmap_update_diff_items,
.create_intent = xfs_rmap_update_create_intent,
.abort_intent = xfs_rmap_update_abort_intent,
- .log_item = xfs_rmap_update_log_item,
.create_done = xfs_rmap_update_create_done,
.finish_item = xfs_rmap_update_finish_item,
.finish_cleanup = xfs_rmap_update_finish_cleanup,
@@ -487,9 +450,10 @@ const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
*/
int
xfs_rui_recover(
- struct xfs_mount *mp,
- struct xfs_rui_log_item *ruip)
+ struct xfs_rui_log_item *ruip,
+ struct list_head *capture_list)
{
+ struct xfs_mount *mp = ruip->rui_item.li_mountp;
int i;
int error = 0;
struct xfs_map_extent *rmap;
@@ -539,7 +503,7 @@ xfs_rui_recover(
*/
set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags);
xfs_rui_release(ruip);
- return -EIO;
+ return -EFSCORRUPTED;
}
}
@@ -581,6 +545,7 @@ xfs_rui_recover(
type = XFS_RMAP_FREE;
break;
default:
+ XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, NULL);
error = -EFSCORRUPTED;
goto abort_error;
}
@@ -595,11 +560,70 @@ xfs_rui_recover(
xfs_rmap_finish_one_cleanup(tp, rcur, error);
set_bit(XFS_RUI_RECOVERED, &ruip->rui_flags);
- error = xfs_trans_commit(tp);
- return error;
+ return xfs_defer_ops_capture_and_commit(tp, NULL, capture_list);
abort_error:
xfs_rmap_finish_one_cleanup(tp, rcur, error);
xfs_trans_cancel(tp);
return error;
}
+
+/* Relog an intent item to push the log tail forward. */
+static struct xfs_log_item *
+xfs_rui_item_relog(
+ struct xfs_log_item *intent,
+ struct xfs_trans *tp)
+{
+ struct xfs_rud_log_item *rudp;
+ struct xfs_rui_log_item *ruip;
+ struct xfs_map_extent *extp;
+ unsigned int count;
+
+ count = RUI_ITEM(intent)->rui_format.rui_nextents;
+ extp = RUI_ITEM(intent)->rui_format.rui_extents;
+
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent));
+ set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
+
+ ruip = xfs_rui_init(tp->t_mountp, count);
+ memcpy(ruip->rui_format.rui_extents, extp, count * sizeof(*extp));
+ atomic_set(&ruip->rui_next_extent, count);
+ xfs_trans_add_item(tp, &ruip->rui_item);
+ set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+ return &ruip->rui_item;
+}
+
+static const struct xfs_item_ops xfs_rui_item_ops = {
+ .iop_size = xfs_rui_item_size,
+ .iop_format = xfs_rui_item_format,
+ .iop_unpin = xfs_rui_item_unpin,
+ .iop_release = xfs_rui_item_release,
+ .iop_relog = xfs_rui_item_relog,
+};
+
+/*
+ * Allocate and initialize an rui item with the given number of extents.
+ */
+struct xfs_rui_log_item *
+xfs_rui_init(
+ struct xfs_mount *mp,
+ uint nextents)
+
+{
+ struct xfs_rui_log_item *ruip;
+
+ ASSERT(nextents > 0);
+ if (nextents > XFS_RUI_MAX_FAST_EXTENTS)
+ ruip = kmem_zalloc(xfs_rui_log_item_sizeof(nextents), 0);
+ else
+ ruip = kmem_zone_zalloc(xfs_rui_zone, 0);
+
+ xfs_log_item_init(mp, &ruip->rui_item, XFS_LI_RUI, &xfs_rui_item_ops);
+ ruip->rui_format.rui_nextents = nextents;
+ ruip->rui_format.rui_id = (uintptr_t)(void *)ruip;
+ atomic_set(&ruip->rui_next_extent, 0);
+ atomic_set(&ruip->rui_refcount, 2);
+
+ return ruip;
+}
diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h
index 8708e4a5aa5c..5cf4acb0e915 100644
--- a/fs/xfs/xfs_rmap_item.h
+++ b/fs/xfs/xfs_rmap_item.h
@@ -82,6 +82,7 @@ int xfs_rui_copy_format(struct xfs_log_iovec *buf,
struct xfs_rui_log_format *dst_rui_fmt);
void xfs_rui_item_free(struct xfs_rui_log_item *);
void xfs_rui_release(struct xfs_rui_log_item *);
-int xfs_rui_recover(struct xfs_mount *mp, struct xfs_rui_log_item *ruip);
+int xfs_rui_recover(struct xfs_rui_log_item *ruip,
+ struct list_head *capture_list);
#endif /* __XFS_RMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 113883c4f202..20e0534a772c 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -23,6 +23,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
uint64_t xs_xstrat_bytes = 0;
uint64_t xs_write_bytes = 0;
uint64_t xs_read_bytes = 0;
+ uint64_t defer_relog = 0;
static const struct xstats_entry {
char *desc;
@@ -57,24 +58,27 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
/* Loop over all stats groups */
for (i = j = 0; i < ARRAY_SIZE(xstats); i++) {
- len += snprintf(buf + len, PATH_MAX - len, "%s",
+ len += scnprintf(buf + len, PATH_MAX - len, "%s",
xstats[i].desc);
/* inner loop does each group */
for (; j < xstats[i].endpoint; j++)
- len += snprintf(buf + len, PATH_MAX - len, " %u",
+ len += scnprintf(buf + len, PATH_MAX - len, " %u",
counter_val(stats, j));
- len += snprintf(buf + len, PATH_MAX - len, "\n");
+ len += scnprintf(buf + len, PATH_MAX - len, "\n");
}
/* extra precision counters */
for_each_possible_cpu(i) {
xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes;
xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes;
xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes;
+ defer_relog += per_cpu_ptr(stats, i)->s.defer_relog;
}
- len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
+ len += scnprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n",
xs_xstrat_bytes, xs_write_bytes, xs_read_bytes);
- len += snprintf(buf + len, PATH_MAX-len, "debug %u\n",
+ len += scnprintf(buf + len, PATH_MAX-len, "defer_relog %llu\n",
+ defer_relog);
+ len += scnprintf(buf + len, PATH_MAX-len, "debug %u\n",
#if defined(DEBUG)
1);
#else
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 34d704f703d2..43ffba74f045 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -137,6 +137,7 @@ struct __xfsstats {
uint64_t xs_xstrat_bytes;
uint64_t xs_write_bytes;
uint64_t xs_read_bytes;
+ uint64_t defer_relog;
};
#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t))
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 8d1df9f8be07..e802cbc9daad 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -490,10 +490,12 @@ xfs_showargs(
seq_printf(m, ",swidth=%d",
(int)XFS_FSB_TO_BB(mp, mp->m_swidth));
- if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD))
- seq_puts(m, ",usrquota");
- else if (mp->m_qflags & XFS_UQUOTA_ACCT)
- seq_puts(m, ",uqnoenforce");
+ if (mp->m_qflags & XFS_UQUOTA_ACCT) {
+ if (mp->m_qflags & XFS_UQUOTA_ENFD)
+ seq_puts(m, ",usrquota");
+ else
+ seq_puts(m, ",uqnoenforce");
+ }
if (mp->m_qflags & XFS_PQUOTA_ACCT) {
if (mp->m_qflags & XFS_PQUOTA_ENFD)
@@ -512,32 +514,6 @@ xfs_showargs(
seq_puts(m, ",noquota");
}
-static uint64_t
-xfs_max_file_offset(
- unsigned int blockshift)
-{
- unsigned int pagefactor = 1;
- unsigned int bitshift = BITS_PER_LONG - 1;
-
- /* Figure out maximum filesize, on Linux this can depend on
- * the filesystem blocksize (on 32 bit platforms).
- * __block_write_begin does this in an [unsigned] long long...
- * page->index << (PAGE_SHIFT - bbits)
- * So, for page sized blocks (4K on 32 bit platforms),
- * this wraps at around 8Tb (hence MAX_LFS_FILESIZE which is
- * (((u64)PAGE_SIZE << (BITS_PER_LONG-1))-1)
- * but for smaller blocksizes it is less (bbits = log2 bsize).
- */
-
-#if BITS_PER_LONG == 32
- ASSERT(sizeof(sector_t) == 8);
- pagefactor = PAGE_SIZE;
- bitshift = BITS_PER_LONG;
-#endif
-
- return (((uint64_t)pagefactor) << bitshift) - 1;
-}
-
/*
* Set parameters for inode allocation heuristics, taking into account
* filesystem size and inode32/inode64 mount options; i.e. specifically
@@ -866,6 +842,20 @@ xfs_destroy_mount_workqueues(
destroy_workqueue(mp->m_buf_workqueue);
}
+static void
+xfs_flush_inodes_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(work, struct xfs_mount,
+ m_flush_inodes_work);
+ struct super_block *sb = mp->m_super;
+
+ if (down_read_trylock(&sb->s_umount)) {
+ sync_inodes_sb(sb);
+ up_read(&sb->s_umount);
+ }
+}
+
/*
* Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
* or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
@@ -876,12 +866,15 @@ void
xfs_flush_inodes(
struct xfs_mount *mp)
{
- struct super_block *sb = mp->m_super;
+ /*
+ * If flush_work() returns true then that means we waited for a flush
+ * which was already in progress. Don't bother running another scan.
+ */
+ if (flush_work(&mp->m_flush_inodes_work))
+ return;
- if (down_read_trylock(&sb->s_umount)) {
- sync_inodes_sb(sb);
- up_read(&sb->s_umount);
- }
+ queue_work(mp->m_sync_workqueue, &mp->m_flush_inodes_work);
+ flush_work(&mp->m_flush_inodes_work);
}
/* Catch misguided souls that try to use this interface on XFS */
@@ -1237,6 +1230,10 @@ xfs_fs_remount(
char *p;
int error;
+ /* version 5 superblocks always support version counters. */
+ if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5)
+ *flags |= SB_I_VERSION;
+
/* First, check for complete junk; i.e. invalid options */
error = xfs_test_remount_options(sb, options);
if (error)
@@ -1558,6 +1555,7 @@ xfs_mount_alloc(
spin_lock_init(&mp->m_perag_lock);
mutex_init(&mp->m_growlock);
atomic_set(&mp->m_active_trans, 0);
+ INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker);
INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
@@ -1650,6 +1648,26 @@ xfs_fs_fill_super(
if (error)
goto out_free_sb;
+ /*
+ * XFS block mappings use 54 bits to store the logical block offset.
+ * This should suffice to handle the maximum file size that the VFS
+ * supports (currently 2^63 bytes on 64-bit and ULONG_MAX << PAGE_SHIFT
+ * bytes on 32-bit), but as XFS and VFS have gotten the s_maxbytes
+ * calculation wrong on 32-bit kernels in the past, we'll add a WARN_ON
+ * to check this assertion.
+ *
+ * Avoid integer overflow by comparing the maximum bmbt offset to the
+ * maximum pagecache offset in units of fs blocks.
+ */
+ if (XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE) > XFS_MAX_FILEOFF) {
+ xfs_warn(mp,
+"MAX_LFS_FILESIZE block offset (%llu) exceeds extent map maximum (%llu)!",
+ XFS_B_TO_FSBT(mp, MAX_LFS_FILESIZE),
+ XFS_MAX_FILEOFF);
+ error = -EINVAL;
+ goto out_free_sb;
+ }
+
error = xfs_filestream_mount(mp);
if (error)
goto out_free_sb;
@@ -1661,7 +1679,7 @@ xfs_fs_fill_super(
sb->s_magic = XFS_SUPER_MAGIC;
sb->s_blocksize = mp->m_sb.sb_blocksize;
sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
- sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_max_links = XFS_MAXLINK;
sb->s_time_gran = 1;
sb->s_time_min = S32_MIN;
@@ -1898,13 +1916,13 @@ xfs_init_zones(void)
if (!xfs_buf_item_zone)
goto out_destroy_trans_zone;
- xfs_efd_zone = kmem_zone_init((sizeof(xfs_efd_log_item_t) +
+ xfs_efd_zone = kmem_zone_init((sizeof(struct xfs_efd_log_item) +
((XFS_EFD_MAX_FAST_EXTENTS - 1) *
sizeof(xfs_extent_t))), "xfs_efd_item");
if (!xfs_efd_zone)
goto out_destroy_buf_item_zone;
- xfs_efi_zone = kmem_zone_init((sizeof(xfs_efi_log_item_t) +
+ xfs_efi_zone = kmem_zone_init((sizeof(struct xfs_efi_log_item) +
((XFS_EFI_MAX_FAST_EXTENTS - 1) *
sizeof(xfs_extent_t))), "xfs_efi_item");
if (!xfs_efi_zone)
@@ -1918,8 +1936,8 @@ xfs_init_zones(void)
goto out_destroy_efi_zone;
xfs_ili_zone =
- kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
- KM_ZONE_SPREAD, NULL);
+ kmem_zone_init_flags(sizeof(struct xfs_inode_log_item),
+ "xfs_ili", KM_ZONE_SPREAD, NULL);
if (!xfs_ili_zone)
goto out_destroy_inode_zone;
xfs_icreate_zone = kmem_zone_init(sizeof(struct xfs_icreate_item),
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index ed66fd2de327..a2037e22ebda 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -191,9 +191,7 @@ xfs_symlink(
/*
* Make sure that we have allocated dquot(s) on disk.
*/
- error = xfs_qm_vop_dqalloc(dp,
- xfs_kuid_to_uid(current_fsuid()),
- xfs_kgid_to_gid(current_fsgid()), prid,
+ error = xfs_qm_vop_dqalloc(dp, current_fsuid(), current_fsgid(), prid,
XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
&udqp, &gdqp, &pdqp);
if (error)
@@ -203,7 +201,7 @@ xfs_symlink(
* The symlink will fit into the inode data fork?
* There can't be any attributes so we get the whole variable part.
*/
- if (pathlen <= XFS_LITINO(mp, dp->i_d.di_version))
+ if (pathlen <= XFS_LITINO(mp))
fs_blocks = 0;
else
fs_blocks = xfs_symlink_blocks(mp, pathlen);
@@ -316,6 +314,7 @@ xfs_symlink(
}
ASSERT(pathlen == 0);
}
+ i_size_write(VFS_I(ip), ip->i_d.di_size);
/*
* Create the directory entry for the symlink.
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index eaae275ed430..4b5818395406 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1011,6 +1011,7 @@ DEFINE_LOGGRANT_EVENT(xfs_log_regrant_reserve_sub);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
+DEFINE_LOGGRANT_EVENT(xfs_log_cil_wait);
DECLARE_EVENT_CLASS(xfs_log_item_class,
TP_PROTO(struct xfs_log_item *lip),
@@ -2417,6 +2418,7 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_create_intent);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
@@ -3077,8 +3079,7 @@ DEFINE_EVENT(xfs_inode_irec_class, name, \
DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
-DEFINE_IMAP_EVENT(xfs_reflink_remap_imap);
-TRACE_EVENT(xfs_reflink_remap_blocks_loop,
+TRACE_EVENT(xfs_reflink_remap_blocks,
TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
xfs_filblks_t len, struct xfs_inode *dest,
xfs_fileoff_t doffset),
@@ -3109,59 +3110,14 @@ TRACE_EVENT(xfs_reflink_remap_blocks_loop,
__entry->dest_ino,
__entry->dest_lblk)
);
-TRACE_EVENT(xfs_reflink_punch_range,
- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
- xfs_extlen_t len),
- TP_ARGS(ip, lblk, len),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fileoff_t, lblk)
- __field(xfs_extlen_t, len)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->lblk = lblk;
- __entry->len = len;
- ),
- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->lblk,
- __entry->len)
-);
-TRACE_EVENT(xfs_reflink_remap,
- TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
- xfs_extlen_t len, xfs_fsblock_t new_pblk),
- TP_ARGS(ip, lblk, len, new_pblk),
- TP_STRUCT__entry(
- __field(dev_t, dev)
- __field(xfs_ino_t, ino)
- __field(xfs_fileoff_t, lblk)
- __field(xfs_extlen_t, len)
- __field(xfs_fsblock_t, new_pblk)
- ),
- TP_fast_assign(
- __entry->dev = VFS_I(ip)->i_sb->s_dev;
- __entry->ino = ip->i_ino;
- __entry->lblk = lblk;
- __entry->len = len;
- __entry->new_pblk = new_pblk;
- ),
- TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu",
- MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->ino,
- __entry->lblk,
- __entry->len,
- __entry->new_pblk)
-);
DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_src);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_remap_extent_dest);
/* dedupe tracepoints */
DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
@@ -3609,6 +3565,27 @@ DEFINE_KMEM_EVENT(kmem_alloc_large);
DEFINE_KMEM_EVENT(kmem_realloc);
DEFINE_KMEM_EVENT(kmem_zone_alloc);
+TRACE_EVENT(xfs_check_new_dalign,
+ TP_PROTO(struct xfs_mount *mp, int new_dalign, xfs_ino_t calc_rootino),
+ TP_ARGS(mp, new_dalign, calc_rootino),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(int, new_dalign)
+ __field(xfs_ino_t, sb_rootino)
+ __field(xfs_ino_t, calc_rootino)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->new_dalign = new_dalign;
+ __entry->sb_rootino = mp->m_sb.sb_rootino;
+ __entry->calc_rootino = calc_rootino;
+ ),
+ TP_printk("dev %d:%d new_dalign %d sb_rootino %llu calc_rootino %llu",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->new_dalign, __entry->sb_rootino,
+ __entry->calc_rootino)
+)
+
#endif /* _TRACE_XFS_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index b32a66452d44..47acf4096022 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -107,7 +107,8 @@ xfs_trans_dup(
ntp->t_flags = XFS_TRANS_PERM_LOG_RES |
(tp->t_flags & XFS_TRANS_RESERVE) |
- (tp->t_flags & XFS_TRANS_NO_WRITECOUNT);
+ (tp->t_flags & XFS_TRANS_NO_WRITECOUNT) |
+ (tp->t_flags & XFS_TRANS_RES_FDBLKS);
/* We gave our writer reference to the new transaction */
tp->t_flags |= XFS_TRANS_NO_WRITECOUNT;
ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
@@ -273,6 +274,8 @@ xfs_trans_alloc(
*/
WARN_ON(resp->tr_logres > 0 &&
mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
+ ASSERT(!(flags & XFS_TRANS_RES_FDBLKS) ||
+ xfs_sb_version_haslazysbcount(&mp->m_sb));
atomic_inc(&mp->m_active_trans);
tp->t_magic = XFS_TRANS_HEADER_MAGIC;
@@ -368,6 +371,20 @@ xfs_trans_mod_sb(
tp->t_blk_res_used += (uint)-delta;
if (tp->t_blk_res_used > tp->t_blk_res)
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ } else if (delta > 0 && (tp->t_flags & XFS_TRANS_RES_FDBLKS)) {
+ int64_t blkres_delta;
+
+ /*
+ * Return freed blocks directly to the reservation
+ * instead of the global pool, being careful not to
+ * overflow the trans counter. This is used to preserve
+ * reservation across chains of transaction rolls that
+ * repeatedly free and allocate blocks.
+ */
+ blkres_delta = min_t(int64_t, delta,
+ UINT_MAX - tp->t_blk_res);
+ tp->t_blk_res += blkres_delta;
+ delta -= blkres_delta;
}
tp->t_fdblocks_delta += delta;
if (xfs_sb_version_haslazysbcount(&mp->m_sb))
@@ -532,57 +549,9 @@ xfs_trans_apply_sb_deltas(
sizeof(sbp->sb_frextents) - 1);
}
-STATIC int
-xfs_sb_mod8(
- uint8_t *field,
- int8_t delta)
-{
- int8_t counter = *field;
-
- counter += delta;
- if (counter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- *field = counter;
- return 0;
-}
-
-STATIC int
-xfs_sb_mod32(
- uint32_t *field,
- int32_t delta)
-{
- int32_t counter = *field;
-
- counter += delta;
- if (counter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- *field = counter;
- return 0;
-}
-
-STATIC int
-xfs_sb_mod64(
- uint64_t *field,
- int64_t delta)
-{
- int64_t counter = *field;
-
- counter += delta;
- if (counter < 0) {
- ASSERT(0);
- return -EINVAL;
- }
- *field = counter;
- return 0;
-}
-
/*
- * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations
- * and apply superblock counter changes to the in-core superblock. The
+ * xfs_trans_unreserve_and_mod_sb() is called to release unused reservations and
+ * apply superblock counter changes to the in-core superblock. The
* t_res_fdblocks_delta and t_res_frextents_delta fields are explicitly NOT
* applied to the in-core superblock. The idea is that that has already been
* done.
@@ -627,20 +596,17 @@ xfs_trans_unreserve_and_mod_sb(
/* apply the per-cpu counters */
if (blkdelta) {
error = xfs_mod_fdblocks(mp, blkdelta, rsvd);
- if (error)
- goto out;
+ ASSERT(!error);
}
if (idelta) {
error = xfs_mod_icount(mp, idelta);
- if (error)
- goto out_undo_fdblocks;
+ ASSERT(!error);
}
if (ifreedelta) {
error = xfs_mod_ifree(mp, ifreedelta);
- if (error)
- goto out_undo_icount;
+ ASSERT(!error);
}
if (rtxdelta == 0 && !(tp->t_flags & XFS_TRANS_SB_DIRTY))
@@ -648,95 +614,23 @@ xfs_trans_unreserve_and_mod_sb(
/* apply remaining deltas */
spin_lock(&mp->m_sb_lock);
- if (rtxdelta) {
- error = xfs_sb_mod64(&mp->m_sb.sb_frextents, rtxdelta);
- if (error)
- goto out_undo_ifree;
- }
-
- if (tp->t_dblocks_delta != 0) {
- error = xfs_sb_mod64(&mp->m_sb.sb_dblocks, tp->t_dblocks_delta);
- if (error)
- goto out_undo_frextents;
- }
- if (tp->t_agcount_delta != 0) {
- error = xfs_sb_mod32(&mp->m_sb.sb_agcount, tp->t_agcount_delta);
- if (error)
- goto out_undo_dblocks;
- }
- if (tp->t_imaxpct_delta != 0) {
- error = xfs_sb_mod8(&mp->m_sb.sb_imax_pct, tp->t_imaxpct_delta);
- if (error)
- goto out_undo_agcount;
- }
- if (tp->t_rextsize_delta != 0) {
- error = xfs_sb_mod32(&mp->m_sb.sb_rextsize,
- tp->t_rextsize_delta);
- if (error)
- goto out_undo_imaxpct;
- }
- if (tp->t_rbmblocks_delta != 0) {
- error = xfs_sb_mod32(&mp->m_sb.sb_rbmblocks,
- tp->t_rbmblocks_delta);
- if (error)
- goto out_undo_rextsize;
- }
- if (tp->t_rblocks_delta != 0) {
- error = xfs_sb_mod64(&mp->m_sb.sb_rblocks, tp->t_rblocks_delta);
- if (error)
- goto out_undo_rbmblocks;
- }
- if (tp->t_rextents_delta != 0) {
- error = xfs_sb_mod64(&mp->m_sb.sb_rextents,
- tp->t_rextents_delta);
- if (error)
- goto out_undo_rblocks;
- }
- if (tp->t_rextslog_delta != 0) {
- error = xfs_sb_mod8(&mp->m_sb.sb_rextslog,
- tp->t_rextslog_delta);
- if (error)
- goto out_undo_rextents;
- }
+ mp->m_sb.sb_frextents += rtxdelta;
+ mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
+ mp->m_sb.sb_agcount += tp->t_agcount_delta;
+ mp->m_sb.sb_imax_pct += tp->t_imaxpct_delta;
+ mp->m_sb.sb_rextsize += tp->t_rextsize_delta;
+ mp->m_sb.sb_rbmblocks += tp->t_rbmblocks_delta;
+ mp->m_sb.sb_rblocks += tp->t_rblocks_delta;
+ mp->m_sb.sb_rextents += tp->t_rextents_delta;
+ mp->m_sb.sb_rextslog += tp->t_rextslog_delta;
spin_unlock(&mp->m_sb_lock);
- return;
-out_undo_rextents:
- if (tp->t_rextents_delta)
- xfs_sb_mod64(&mp->m_sb.sb_rextents, -tp->t_rextents_delta);
-out_undo_rblocks:
- if (tp->t_rblocks_delta)
- xfs_sb_mod64(&mp->m_sb.sb_rblocks, -tp->t_rblocks_delta);
-out_undo_rbmblocks:
- if (tp->t_rbmblocks_delta)
- xfs_sb_mod32(&mp->m_sb.sb_rbmblocks, -tp->t_rbmblocks_delta);
-out_undo_rextsize:
- if (tp->t_rextsize_delta)
- xfs_sb_mod32(&mp->m_sb.sb_rextsize, -tp->t_rextsize_delta);
-out_undo_imaxpct:
- if (tp->t_rextsize_delta)
- xfs_sb_mod8(&mp->m_sb.sb_imax_pct, -tp->t_imaxpct_delta);
-out_undo_agcount:
- if (tp->t_agcount_delta)
- xfs_sb_mod32(&mp->m_sb.sb_agcount, -tp->t_agcount_delta);
-out_undo_dblocks:
- if (tp->t_dblocks_delta)
- xfs_sb_mod64(&mp->m_sb.sb_dblocks, -tp->t_dblocks_delta);
-out_undo_frextents:
- if (rtxdelta)
- xfs_sb_mod64(&mp->m_sb.sb_frextents, -rtxdelta);
-out_undo_ifree:
- spin_unlock(&mp->m_sb_lock);
- if (ifreedelta)
- xfs_mod_ifree(mp, -ifreedelta);
-out_undo_icount:
- if (idelta)
- xfs_mod_icount(mp, -idelta);
-out_undo_fdblocks:
- if (blkdelta)
- xfs_mod_fdblocks(mp, -blkdelta, rsvd);
-out:
- ASSERT(error == 0);
+ /*
+ * Debug checks outside of the spinlock so they don't lock up the
+ * machine if they fail.
+ */
+ ASSERT(mp->m_sb.sb_imax_pct >= 0);
+ ASSERT(mp->m_sb.sb_rextslog >= 0);
return;
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 64d7f171ebd3..941647027f00 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -77,6 +77,8 @@ struct xfs_item_ops {
void (*iop_release)(struct xfs_log_item *);
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
void (*iop_error)(struct xfs_log_item *, xfs_buf_t *);
+ struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
+ struct xfs_trans *tp);
};
/*
@@ -244,4 +246,12 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
extern kmem_zone_t *xfs_trans_zone;
+static inline struct xfs_log_item *
+xfs_trans_item_relog(
+ struct xfs_log_item *lip,
+ struct xfs_trans *tp)
+{
+ return lip->li_ops->iop_relog(lip, tp);
+}
+
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 812108f6cc89..a41ba155d3a3 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -108,17 +108,25 @@ xfs_ail_next(
* We need the AIL lock in order to get a coherent read of the lsn of the last
* item in the AIL.
*/
+static xfs_lsn_t
+__xfs_ail_min_lsn(
+ struct xfs_ail *ailp)
+{
+ struct xfs_log_item *lip = xfs_ail_min(ailp);
+
+ if (lip)
+ return lip->li_lsn;
+ return 0;
+}
+
xfs_lsn_t
xfs_ail_min_lsn(
struct xfs_ail *ailp)
{
- xfs_lsn_t lsn = 0;
- struct xfs_log_item *lip;
+ xfs_lsn_t lsn;
spin_lock(&ailp->ail_lock);
- lip = xfs_ail_min(ailp);
- if (lip)
- lsn = lip->li_lsn;
+ lsn = __xfs_ail_min_lsn(ailp);
spin_unlock(&ailp->ail_lock);
return lsn;
@@ -394,16 +402,10 @@ xfsaild_push(
target = ailp->ail_target;
ailp->ail_target_prev = target;
+ /* we're done if the AIL is empty or our push has reached the end */
lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->ail_last_pushed_lsn);
- if (!lip) {
- /*
- * If the AIL is empty or our push has reached the end we are
- * done now.
- */
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
+ if (!lip)
goto out_done;
- }
XFS_STATS_INC(mp, xs_push_ail);
@@ -485,6 +487,8 @@ xfsaild_push(
break;
lsn = lip->li_lsn;
}
+
+out_done:
xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->ail_lock);
@@ -492,7 +496,6 @@ xfsaild_push(
ailp->ail_log_flush++;
if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
-out_done:
/*
* We reached the target or the AIL is empty, so wait a bit
* longer for I/O to complete and remove pushed items from the
@@ -584,7 +587,8 @@ xfsaild(
*/
smp_rmb();
if (!xfs_ail_min(ailp) &&
- ailp->ail_target == ailp->ail_target_prev) {
+ ailp->ail_target == ailp->ail_target_prev &&
+ list_empty(&ailp->ail_buf_list)) {
spin_unlock(&ailp->ail_lock);
freezable_schedule();
tout = 0;
@@ -680,6 +684,28 @@ xfs_ail_push_all_sync(
finish_wait(&ailp->ail_empty, &wait);
}
+void
+xfs_ail_update_finish(
+ struct xfs_ail *ailp,
+ xfs_lsn_t old_lsn) __releases(ailp->ail_lock)
+{
+ struct xfs_mount *mp = ailp->ail_mount;
+
+ /* if the tail lsn hasn't changed, don't do updates or wakeups. */
+ if (!old_lsn || old_lsn == __xfs_ail_min_lsn(ailp)) {
+ spin_unlock(&ailp->ail_lock);
+ return;
+ }
+
+ if (!XFS_FORCED_SHUTDOWN(mp))
+ xlog_assign_tail_lsn_locked(mp);
+
+ if (list_empty(&ailp->ail_head))
+ wake_up_all(&ailp->ail_empty);
+ spin_unlock(&ailp->ail_lock);
+ xfs_log_space_wake(mp);
+}
+
/*
* xfs_trans_ail_update - bulk AIL insertion operation.
*
@@ -711,7 +737,7 @@ xfs_trans_ail_update_bulk(
xfs_lsn_t lsn) __releases(ailp->ail_lock)
{
struct xfs_log_item *mlip;
- int mlip_changed = 0;
+ xfs_lsn_t tail_lsn = 0;
int i;
LIST_HEAD(tmp);
@@ -726,9 +752,10 @@ xfs_trans_ail_update_bulk(
continue;
trace_xfs_ail_move(lip, lip->li_lsn, lsn);
+ if (mlip == lip && !tail_lsn)
+ tail_lsn = lip->li_lsn;
+
xfs_ail_delete(ailp, lip);
- if (mlip == lip)
- mlip_changed = 1;
} else {
trace_xfs_ail_insert(lip, 0, lsn);
}
@@ -739,23 +766,23 @@ xfs_trans_ail_update_bulk(
if (!list_empty(&tmp))
xfs_ail_splice(ailp, cur, &tmp, lsn);
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(ailp->ail_mount))
- xlog_assign_tail_lsn_locked(ailp->ail_mount);
- spin_unlock(&ailp->ail_lock);
-
- xfs_log_space_wake(ailp->ail_mount);
- } else {
- spin_unlock(&ailp->ail_lock);
- }
+ xfs_ail_update_finish(ailp, tail_lsn);
}
-bool
+/*
+ * Delete one log item from the AIL.
+ *
+ * If this item was at the tail of the AIL, return the LSN of the log item so
+ * that we can use it to check if the LSN of the tail of the log has moved
+ * when finishing up the AIL delete process in xfs_ail_update_finish().
+ */
+xfs_lsn_t
xfs_ail_delete_one(
struct xfs_ail *ailp,
struct xfs_log_item *lip)
{
struct xfs_log_item *mlip = xfs_ail_min(ailp);
+ xfs_lsn_t lsn = lip->li_lsn;
trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn);
xfs_ail_delete(ailp, lip);
@@ -763,7 +790,9 @@ xfs_ail_delete_one(
clear_bit(XFS_LI_IN_AIL, &lip->li_flags);
lip->li_lsn = 0;
- return mlip == lip;
+ if (mlip == lip)
+ return lsn;
+ return 0;
}
/**
@@ -791,10 +820,10 @@ void
xfs_trans_ail_delete(
struct xfs_ail *ailp,
struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->ail_lock)
+ int shutdown_type)
{
struct xfs_mount *mp = ailp->ail_mount;
- bool mlip_changed;
+ xfs_lsn_t tail_lsn;
if (!test_bit(XFS_LI_IN_AIL, &lip->li_flags)) {
spin_unlock(&ailp->ail_lock);
@@ -807,17 +836,8 @@ xfs_trans_ail_delete(
return;
}
- mlip_changed = xfs_ail_delete_one(ailp, lip);
- if (mlip_changed) {
- if (!XFS_FORCED_SHUTDOWN(mp))
- xlog_assign_tail_lsn_locked(mp);
- if (list_empty(&ailp->ail_head))
- wake_up_all(&ailp->ail_empty);
- }
-
- spin_unlock(&ailp->ail_lock);
- if (mlip_changed)
- xfs_log_space_wake(ailp->ail_mount);
+ tail_lsn = xfs_ail_delete_one(ailp, lip);
+ xfs_ail_update_finish(ailp, tail_lsn);
}
int
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index 904780dd74aa..4e43d415161d 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -15,6 +15,7 @@
#include "xfs_trans_priv.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
+#include "xfs_error.h"
STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
@@ -25,8 +26,8 @@ STATIC void xfs_trans_alloc_dqinfo(xfs_trans_t *);
*/
void
xfs_trans_dqjoin(
- xfs_trans_t *tp,
- xfs_dquot_t *dqp)
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
{
ASSERT(XFS_DQ_IS_LOCKED(dqp));
ASSERT(dqp->q_logitem.qli_dquot == dqp);
@@ -49,8 +50,8 @@ xfs_trans_dqjoin(
*/
void
xfs_trans_log_dquot(
- xfs_trans_t *tp,
- xfs_dquot_t *dqp)
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp)
{
ASSERT(XFS_DQ_IS_LOCKED(dqp));
@@ -486,12 +487,12 @@ xfs_trans_apply_dquot_deltas(
*/
void
xfs_trans_unreserve_and_mod_dquots(
- xfs_trans_t *tp)
+ struct xfs_trans *tp)
{
int i, j;
- xfs_dquot_t *dqp;
+ struct xfs_dquot *dqp;
struct xfs_dqtrx *qtrx, *qa;
- bool locked;
+ bool locked;
if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY))
return;
@@ -571,21 +572,21 @@ xfs_quota_warn(
*/
STATIC int
xfs_trans_dqresv(
- xfs_trans_t *tp,
- xfs_mount_t *mp,
- xfs_dquot_t *dqp,
- int64_t nblks,
- long ninos,
- uint flags)
+ struct xfs_trans *tp,
+ struct xfs_mount *mp,
+ struct xfs_dquot *dqp,
+ int64_t nblks,
+ long ninos,
+ uint flags)
{
- xfs_qcnt_t hardlimit;
- xfs_qcnt_t softlimit;
- time_t timer;
- xfs_qwarncnt_t warns;
- xfs_qwarncnt_t warnlimit;
- xfs_qcnt_t total_count;
- xfs_qcnt_t *resbcountp;
- xfs_quotainfo_t *q = mp->m_quotainfo;
+ xfs_qcnt_t hardlimit;
+ xfs_qcnt_t softlimit;
+ time_t timer;
+ xfs_qwarncnt_t warns;
+ xfs_qwarncnt_t warnlimit;
+ xfs_qcnt_t total_count;
+ xfs_qcnt_t *resbcountp;
+ xfs_quotainfo_t *q = mp->m_quotainfo;
struct xfs_def_quota *defq;
@@ -700,9 +701,14 @@ xfs_trans_dqresv(
XFS_TRANS_DQ_RES_INOS,
ninos);
}
- ASSERT(dqp->q_res_bcount >= be64_to_cpu(dqp->q_core.d_bcount));
- ASSERT(dqp->q_res_rtbcount >= be64_to_cpu(dqp->q_core.d_rtbcount));
- ASSERT(dqp->q_res_icount >= be64_to_cpu(dqp->q_core.d_icount));
+
+ if (XFS_IS_CORRUPT(mp,
+ dqp->q_res_bcount < be64_to_cpu(dqp->q_core.d_bcount)) ||
+ XFS_IS_CORRUPT(mp,
+ dqp->q_res_rtbcount < be64_to_cpu(dqp->q_core.d_rtbcount)) ||
+ XFS_IS_CORRUPT(mp,
+ dqp->q_res_icount < be64_to_cpu(dqp->q_core.d_icount)))
+ goto error_corrupt;
xfs_dqunlock(dqp);
return 0;
@@ -712,6 +718,10 @@ error_return:
if (flags & XFS_QMOPT_ENOSPC)
return -ENOSPC;
return -EDQUOT;
+error_corrupt:
+ xfs_dqunlock(dqp);
+ xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+ return -EFSCORRUPTED;
}
@@ -756,7 +766,8 @@ xfs_trans_reserve_quota_bydquots(
}
if (gdqp) {
- error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags);
+ error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos,
+ (flags & ~XFS_QMOPT_ENOSPC));
if (error)
goto unwind_usr;
}
@@ -824,13 +835,13 @@ xfs_trans_reserve_quota_nblks(
/*
* This routine is called to allocate a quotaoff log item.
*/
-xfs_qoff_logitem_t *
+struct xfs_qoff_logitem *
xfs_trans_get_qoff_item(
- xfs_trans_t *tp,
- xfs_qoff_logitem_t *startqoff,
+ struct xfs_trans *tp,
+ struct xfs_qoff_logitem *startqoff,
uint flags)
{
- xfs_qoff_logitem_t *q;
+ struct xfs_qoff_logitem *q;
ASSERT(tp != NULL);
@@ -852,8 +863,8 @@ xfs_trans_get_qoff_item(
*/
void
xfs_trans_log_quotaoff_item(
- xfs_trans_t *tp,
- xfs_qoff_logitem_t *qlp)
+ struct xfs_trans *tp,
+ struct xfs_qoff_logitem *qlp)
{
tp->t_flags |= XFS_TRANS_DIRTY;
set_bit(XFS_LI_DIRTY, &qlp->qql_item.li_flags);
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 2e073c1c4614..35655eac01a6 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -91,9 +91,11 @@ xfs_trans_ail_update(
xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn);
}
-bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+xfs_lsn_t xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip);
+void xfs_ail_update_finish(struct xfs_ail *ailp, xfs_lsn_t old_lsn)
+ __releases(ailp->ail_lock);
void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip,
- int shutdown_type) __releases(ailp->ail_lock);
+ int shutdown_type);
static inline void
xfs_trans_ail_remove(