aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs_vfs.h1
-rw-r--r--fs/9p/vfs_addr.c353
-rw-r--r--fs/9p/vfs_file.c89
-rw-r--r--fs/9p/vfs_inode.c16
-rw-r--r--fs/9p/vfs_inode_dotl.c8
-rw-r--r--fs/9p/vfs_super.c14
-rw-r--r--fs/Kconfig27
-rw-r--r--fs/Makefile2
-rw-r--r--fs/adfs/inode.c11
-rw-r--r--fs/affs/affs.h1
-rw-r--r--fs/affs/namei.c3
-rw-r--r--fs/affs/super.c2
-rw-r--r--fs/afs/Makefile2
-rw-r--r--fs/afs/addr_list.c224
-rw-r--r--fs/afs/addr_prefs.c531
-rw-r--r--fs/afs/afs.h3
-rw-r--r--fs/afs/callback.c141
-rw-r--r--fs/afs/cell.c5
-rw-r--r--fs/afs/cmservice.c5
-rw-r--r--fs/afs/dir.c96
-rw-r--r--fs/afs/dir_silly.c2
-rw-r--r--fs/afs/dynroot.c16
-rw-r--r--fs/afs/file.c241
-rw-r--r--fs/afs/fs_operation.c85
-rw-r--r--fs/afs/fs_probe.c323
-rw-r--r--fs/afs/fsclient.c74
-rw-r--r--fs/afs/inode.c232
-rw-r--r--fs/afs/internal.h450
-rw-r--r--fs/afs/main.c4
-rw-r--r--fs/afs/misc.c10
-rw-r--r--fs/afs/proc.c103
-rw-r--r--fs/afs/rotate.c520
-rw-r--r--fs/afs/rxrpc.c107
-rw-r--r--fs/afs/server.c147
-rw-r--r--fs/afs/server_list.c174
-rw-r--r--fs/afs/super.c9
-rw-r--r--fs/afs/validation.c473
-rw-r--r--fs/afs/vl_alias.c69
-rw-r--r--fs/afs/vl_list.c29
-rw-r--r--fs/afs/vl_probe.c60
-rw-r--r--fs/afs/vl_rotate.c215
-rw-r--r--fs/afs/vlclient.c143
-rw-r--r--fs/afs/volume.c65
-rw-r--r--fs/afs/write.c834
-rw-r--r--fs/afs/xattr.c2
-rw-r--r--fs/afs/yfsclient.c25
-rw-r--r--fs/aio.c105
-rw-r--r--fs/anon_inodes.c51
-rw-r--r--fs/attr.c2
-rw-r--r--fs/autofs/expire.c7
-rw-r--r--fs/backing-file.c336
-rw-r--r--fs/bcachefs/Kconfig18
-rw-r--r--fs/bcachefs/Makefile3
-rw-r--r--fs/bcachefs/alloc_background.c573
-rw-r--r--fs/bcachefs/alloc_background.h39
-rw-r--r--fs/bcachefs/alloc_background_format.h92
-rw-r--r--fs/bcachefs/alloc_foreground.c53
-rw-r--r--fs/bcachefs/backpointers.c259
-rw-r--r--fs/bcachefs/backpointers.h28
-rw-r--r--fs/bcachefs/bcachefs.h209
-rw-r--r--fs/bcachefs/bcachefs_format.h1005
-rw-r--r--fs/bcachefs/bcachefs_ioctl.h60
-rw-r--r--fs/bcachefs/bkey.c2
-rw-r--r--fs/bcachefs/bkey_methods.c9
-rw-r--r--fs/bcachefs/bkey_methods.h88
-rw-r--r--fs/bcachefs/bset.c13
-rw-r--r--fs/bcachefs/bset.h3
-rw-r--r--fs/bcachefs/btree_cache.c40
-rw-r--r--fs/bcachefs/btree_cache.h23
-rw-r--r--fs/bcachefs/btree_gc.c363
-rw-r--r--fs/bcachefs/btree_io.c170
-rw-r--r--fs/bcachefs/btree_io.h2
-rw-r--r--fs/bcachefs/btree_iter.c951
-rw-r--r--fs/bcachefs/btree_iter.h412
-rw-r--r--fs/bcachefs/btree_journal_iter.c25
-rw-r--r--fs/bcachefs/btree_key_cache.c63
-rw-r--r--fs/bcachefs/btree_key_cache.h2
-rw-r--r--fs/bcachefs/btree_locking.c149
-rw-r--r--fs/bcachefs/btree_locking.h25
-rw-r--r--fs/bcachefs/btree_trans_commit.c342
-rw-r--r--fs/bcachefs/btree_types.h148
-rw-r--r--fs/bcachefs/btree_update.c245
-rw-r--r--fs/bcachefs/btree_update.h111
-rw-r--r--fs/bcachefs/btree_update_interior.c333
-rw-r--r--fs/bcachefs/btree_update_interior.h53
-rw-r--r--fs/bcachefs/btree_write_buffer.c669
-rw-r--r--fs/bcachefs/btree_write_buffer.h53
-rw-r--r--fs/bcachefs/btree_write_buffer_types.h63
-rw-r--r--fs/bcachefs/buckets.c1509
-rw-r--r--fs/bcachefs/buckets.h60
-rw-r--r--fs/bcachefs/buckets_types.h17
-rw-r--r--fs/bcachefs/chardev.c363
-rw-r--r--fs/bcachefs/checksum.h23
-rw-r--r--fs/bcachefs/clock.c4
-rw-r--r--fs/bcachefs/compress.c4
-rw-r--r--fs/bcachefs/compress.h8
-rw-r--r--fs/bcachefs/darray.h8
-rw-r--r--fs/bcachefs/data_update.c36
-rw-r--r--fs/bcachefs/debug.c157
-rw-r--r--fs/bcachefs/dirent.c51
-rw-r--r--fs/bcachefs/dirent.h7
-rw-r--r--fs/bcachefs/dirent_format.h42
-rw-r--r--fs/bcachefs/disk_groups.c13
-rw-r--r--fs/bcachefs/ec.c406
-rw-r--r--fs/bcachefs/ec.h5
-rw-r--r--fs/bcachefs/ec_format.h19
-rw-r--r--fs/bcachefs/ec_types.h2
-rw-r--r--fs/bcachefs/errcode.h7
-rw-r--r--fs/bcachefs/error.c103
-rw-r--r--fs/bcachefs/extent_update.c2
-rw-r--r--fs/bcachefs/extents.c15
-rw-r--r--fs/bcachefs/extents.h26
-rw-r--r--fs/bcachefs/extents_format.h295
-rw-r--r--fs/bcachefs/eytzinger.h14
-rw-r--r--fs/bcachefs/fs-common.c36
-rw-r--r--fs/bcachefs/fs-io-buffered.c59
-rw-r--r--fs/bcachefs/fs-io-direct.c5
-rw-r--r--fs/bcachefs/fs-io-pagecache.c37
-rw-r--r--fs/bcachefs/fs-io-pagecache.h2
-rw-r--r--fs/bcachefs/fs-io.c29
-rw-r--r--fs/bcachefs/fs-ioctl.c58
-rw-r--r--fs/bcachefs/fs.c111
-rw-r--r--fs/bcachefs/fs.h9
-rw-r--r--fs/bcachefs/fsck.c647
-rw-r--r--fs/bcachefs/inode.c154
-rw-r--r--fs/bcachefs/inode.h15
-rw-r--r--fs/bcachefs/inode_format.h166
-rw-r--r--fs/bcachefs/io_misc.c59
-rw-r--r--fs/bcachefs/io_read.c50
-rw-r--r--fs/bcachefs/io_write.c51
-rw-r--r--fs/bcachefs/journal.c211
-rw-r--r--fs/bcachefs/journal.h4
-rw-r--r--fs/bcachefs/journal_io.c165
-rw-r--r--fs/bcachefs/journal_reclaim.c130
-rw-r--r--fs/bcachefs/journal_reclaim.h16
-rw-r--r--fs/bcachefs/journal_seq_blacklist.c2
-rw-r--r--fs/bcachefs/journal_types.h16
-rw-r--r--fs/bcachefs/keylist.c2
-rw-r--r--fs/bcachefs/keylist.h4
-rw-r--r--fs/bcachefs/logged_ops.c18
-rw-r--r--fs/bcachefs/logged_ops_format.h30
-rw-r--r--fs/bcachefs/lru.c11
-rw-r--r--fs/bcachefs/mean_and_variance.c10
-rw-r--r--fs/bcachefs/mean_and_variance.h5
-rw-r--r--fs/bcachefs/migrate.c9
-rw-r--r--fs/bcachefs/move.c252
-rw-r--r--fs/bcachefs/move.h13
-rw-r--r--fs/bcachefs/movinggc.c49
-rw-r--r--fs/bcachefs/opts.c8
-rw-r--r--fs/bcachefs/opts.h29
-rw-r--r--fs/bcachefs/printbuf.c1
-rw-r--r--fs/bcachefs/quota.c28
-rw-r--r--fs/bcachefs/quota_format.h47
-rw-r--r--fs/bcachefs/rebalance.c47
-rw-r--r--fs/bcachefs/recovery.c304
-rw-r--r--fs/bcachefs/recovery.h1
-rw-r--r--fs/bcachefs/recovery_types.h25
-rw-r--r--fs/bcachefs/reflink.c243
-rw-r--r--fs/bcachefs/reflink.h26
-rw-r--r--fs/bcachefs/reflink_format.h33
-rw-r--r--fs/bcachefs/replicas.c94
-rw-r--r--fs/bcachefs/replicas.h22
-rw-r--r--fs/bcachefs/replicas_types.h6
-rw-r--r--fs/bcachefs/sb-clean.c22
-rw-r--r--fs/bcachefs/sb-counters.c (renamed from fs/bcachefs/counters.c)2
-rw-r--r--fs/bcachefs/sb-counters.h (renamed from fs/bcachefs/counters.h)7
-rw-r--r--fs/bcachefs/sb-counters_format.h98
-rw-r--r--fs/bcachefs/sb-downgrade.c90
-rw-r--r--fs/bcachefs/sb-downgrade.h1
-rw-r--r--fs/bcachefs/sb-errors_types.h4
-rw-r--r--fs/bcachefs/sb-members.c24
-rw-r--r--fs/bcachefs/sb-members.h100
-rw-r--r--fs/bcachefs/six.c117
-rw-r--r--fs/bcachefs/six.h13
-rw-r--r--fs/bcachefs/snapshot.c180
-rw-r--r--fs/bcachefs/snapshot.h8
-rw-r--r--fs/bcachefs/snapshot_format.h36
-rw-r--r--fs/bcachefs/str_hash.h47
-rw-r--r--fs/bcachefs/subvolume.c31
-rw-r--r--fs/bcachefs/subvolume_format.h35
-rw-r--r--fs/bcachefs/subvolume_types.h4
-rw-r--r--fs/bcachefs/super-io.c195
-rw-r--r--fs/bcachefs/super-io.h7
-rw-r--r--fs/bcachefs/super.c398
-rw-r--r--fs/bcachefs/super.h6
-rw-r--r--fs/bcachefs/super_types.h3
-rw-r--r--fs/bcachefs/sysfs.c173
-rw-r--r--fs/bcachefs/tests.c193
-rw-r--r--fs/bcachefs/thread_with_file.c299
-rw-r--r--fs/bcachefs/thread_with_file.h41
-rw-r--r--fs/bcachefs/thread_with_file_types.h16
-rw-r--r--fs/bcachefs/trace.h318
-rw-r--r--fs/bcachefs/util.c211
-rw-r--r--fs/bcachefs/util.h59
-rw-r--r--fs/bcachefs/vstructs.h10
-rw-r--r--fs/bcachefs/xattr.c5
-rw-r--r--fs/bcachefs/xattr_format.h19
-rw-r--r--fs/befs/linuxvfs.c3
-rw-r--r--fs/bfs/dir.c5
-rw-r--r--fs/bfs/file.c9
-rw-r--r--fs/btrfs/accessors.c98
-rw-r--r--fs/btrfs/accessors.h4
-rw-r--r--fs/btrfs/bio.c17
-rw-r--r--fs/btrfs/bio.h4
-rw-r--r--fs/btrfs/block-group.c249
-rw-r--r--fs/btrfs/block-group.h13
-rw-r--r--fs/btrfs/block-rsv.c2
-rw-r--r--fs/btrfs/block-rsv.h32
-rw-r--r--fs/btrfs/btrfs_inode.h10
-rw-r--r--fs/btrfs/compression.c162
-rw-r--r--fs/btrfs/compression.h9
-rw-r--r--fs/btrfs/ctree.c63
-rw-r--r--fs/btrfs/ctree.h17
-rw-r--r--fs/btrfs/defrag.c15
-rw-r--r--fs/btrfs/delalloc-space.c29
-rw-r--r--fs/btrfs/delayed-inode.c109
-rw-r--r--fs/btrfs/dev-replace.c52
-rw-r--r--fs/btrfs/disk-io.c184
-rw-r--r--fs/btrfs/disk-io.h5
-rw-r--r--fs/btrfs/extent-io-tree.c119
-rw-r--r--fs/btrfs/extent-io-tree.h18
-rw-r--r--fs/btrfs/extent-tree.c157
-rw-r--r--fs/btrfs/extent_io.c1273
-rw-r--r--fs/btrfs/extent_io.h80
-rw-r--r--fs/btrfs/extent_map.c195
-rw-r--r--fs/btrfs/extent_map.h77
-rw-r--r--fs/btrfs/file-item.c15
-rw-r--r--fs/btrfs/file.c27
-rw-r--r--fs/btrfs/free-space-cache.c4
-rw-r--r--fs/btrfs/fs.h18
-rw-r--r--fs/btrfs/inode.c225
-rw-r--r--fs/btrfs/ioctl.c26
-rw-r--r--fs/btrfs/lru_cache.c2
-rw-r--r--fs/btrfs/lzo.c38
-rw-r--r--fs/btrfs/messages.c2
-rw-r--r--fs/btrfs/messages.h2
-rw-r--r--fs/btrfs/ordered-data.c5
-rw-r--r--fs/btrfs/ordered-data.h7
-rw-r--r--fs/btrfs/qgroup.c16
-rw-r--r--fs/btrfs/raid56.c7
-rw-r--r--fs/btrfs/raid56.h2
-rw-r--r--fs/btrfs/ref-verify.c6
-rw-r--r--fs/btrfs/reflink.c6
-rw-r--r--fs/btrfs/relocation.c7
-rw-r--r--fs/btrfs/scrub.c99
-rw-r--r--fs/btrfs/send.c23
-rw-r--r--fs/btrfs/space-info.c26
-rw-r--r--fs/btrfs/subpage.c374
-rw-r--r--fs/btrfs/subpage.h82
-rw-r--r--fs/btrfs/super.c2299
-rw-r--r--fs/btrfs/super.h5
-rw-r--r--fs/btrfs/sysfs.c4
-rw-r--r--fs/btrfs/tests/btrfs-tests.c5
-rw-r--r--fs/btrfs/tests/btrfs-tests.h1
-rw-r--r--fs/btrfs/tests/extent-io-tests.c4
-rw-r--r--fs/btrfs/tests/extent-map-tests.c143
-rw-r--r--fs/btrfs/tests/inode-tests.c60
-rw-r--r--fs/btrfs/transaction.c40
-rw-r--r--fs/btrfs/tree-checker.c2
-rw-r--r--fs/btrfs/tree-checker.h2
-rw-r--r--fs/btrfs/tree-log.c17
-rw-r--r--fs/btrfs/volumes.c932
-rw-r--r--fs/btrfs/volumes.h47
-rw-r--r--fs/btrfs/xattr.c55
-rw-r--r--fs/btrfs/zlib.c79
-rw-r--r--fs/btrfs/zoned.c107
-rw-r--r--fs/btrfs/zoned.h14
-rw-r--r--fs/btrfs/zstd.c7
-rw-r--r--fs/buffer.c283
-rw-r--r--fs/cachefiles/Kconfig2
-rw-r--r--fs/cachefiles/cache.c2
-rw-r--r--fs/cachefiles/daemon.c16
-rw-r--r--fs/cachefiles/error_inject.c1
-rw-r--r--fs/cachefiles/interface.c7
-rw-r--r--fs/cachefiles/internal.h61
-rw-r--r--fs/cachefiles/io.c39
-rw-r--r--fs/cachefiles/namei.c2
-rw-r--r--fs/cachefiles/ondemand.c171
-rw-r--r--fs/ceph/Kconfig1
-rw-r--r--fs/ceph/addr.c37
-rw-r--r--fs/ceph/cache.h45
-rw-r--r--fs/ceph/caps.c83
-rw-r--r--fs/ceph/dir.c23
-rw-r--r--fs/ceph/export.c2
-rw-r--r--fs/ceph/file.c21
-rw-r--r--fs/ceph/inode.c6
-rw-r--r--fs/ceph/mds_client.c94
-rw-r--r--fs/ceph/mds_client.h7
-rw-r--r--fs/ceph/mdsmap.c7
-rw-r--r--fs/ceph/mdsmap.h6
-rw-r--r--fs/ceph/quota.c39
-rw-r--r--fs/ceph/super.h16
-rw-r--r--fs/coda/cache.c8
-rw-r--r--fs/coda/file.c2
-rw-r--r--fs/coda/sysctl.c1
-rw-r--r--fs/coredump.c46
-rw-r--r--fs/crypto/Kconfig2
-rw-r--r--fs/crypto/keyring.c6
-rw-r--r--fs/dax.c2
-rw-r--r--fs/dcache.c654
-rw-r--r--fs/debugfs/file.c28
-rw-r--r--fs/devpts/inode.c1
-rw-r--r--fs/direct-io.c2
-rw-r--r--fs/dlm/debug_fs.c6
-rw-r--r--fs/dlm/lowcomms.c14
-rw-r--r--fs/dlm/plock.c20
-rw-r--r--fs/ecryptfs/inode.c10
-rw-r--r--fs/efivarfs/inode.c3
-rw-r--r--fs/efivarfs/internal.h10
-rw-r--r--fs/efivarfs/super.c65
-rw-r--r--fs/efivarfs/vars.c28
-rw-r--r--fs/erofs/Kconfig7
-rw-r--r--fs/erofs/compress.h5
-rw-r--r--fs/erofs/data.c1
-rw-r--r--fs/erofs/decompressor.c126
-rw-r--r--fs/erofs/decompressor_deflate.c21
-rw-r--r--fs/erofs/decompressor_lzma.c17
-rw-r--r--fs/erofs/fscache.c15
-rw-r--r--fs/erofs/inode.c8
-rw-r--r--fs/erofs/namei.c28
-rw-r--r--fs/erofs/super.c10
-rw-r--r--fs/erofs/utils.c2
-rw-r--r--fs/erofs/zdata.c335
-rw-r--r--fs/erofs/zmap.c55
-rw-r--r--fs/eventfd.c46
-rw-r--r--fs/eventpoll.c1
-rw-r--r--fs/exec.c111
-rw-r--r--fs/exfat/balloc.c87
-rw-r--r--fs/exfat/exfat_fs.h6
-rw-r--r--fs/exfat/file.c200
-rw-r--r--fs/exfat/inode.c137
-rw-r--r--fs/exfat/namei.c6
-rw-r--r--fs/exfat/nls.c14
-rw-r--r--fs/exfat/super.c20
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext2/namei.c11
-rw-r--r--fs/ext4/ext4.h8
-rw-r--r--fs/ext4/ext4_jbd2.c5
-rw-r--r--fs/ext4/extents.c130
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/indirect.c2
-rw-r--r--fs/ext4/inline.c3
-rw-r--r--fs/ext4/inode.c125
-rw-r--r--fs/ext4/ioctl.c6
-rw-r--r--fs/ext4/mballoc.c205
-rw-r--r--fs/ext4/mballoc.h1
-rw-r--r--fs/ext4/move_extent.c10
-rw-r--r--fs/ext4/namei.c23
-rw-r--r--fs/ext4/page-io.c2
-rw-r--r--fs/ext4/resize.c49
-rw-r--r--fs/ext4/super.c21
-rw-r--r--fs/ext4/symlink.c8
-rw-r--r--fs/f2fs/compress.c8
-rw-r--r--fs/f2fs/data.c50
-rw-r--r--fs/f2fs/f2fs.h46
-rw-r--r--fs/f2fs/file.c70
-rw-r--r--fs/f2fs/gc.c16
-rw-r--r--fs/f2fs/inode.c59
-rw-r--r--fs/f2fs/namei.c36
-rw-r--r--fs/f2fs/node.c6
-rw-r--r--fs/f2fs/recovery.c25
-rw-r--r--fs/f2fs/segment.c138
-rw-r--r--fs/f2fs/super.c47
-rw-r--r--fs/f2fs/sysfs.c50
-rw-r--r--fs/f2fs/xattr.c17
-rw-r--r--fs/file.c97
-rw-r--r--fs/file_table.c28
-rw-r--r--fs/freevxfs/vxfs_bmap.c8
-rw-r--r--fs/freevxfs/vxfs_immed.c2
-rw-r--r--fs/freevxfs/vxfs_lookup.c3
-rw-r--r--fs/fs-writeback.c10
-rw-r--r--fs/fscache/Kconfig40
-rw-r--r--fs/fscache/Makefile16
-rw-r--r--fs/fscache/internal.h277
-rw-r--r--fs/fuse/cuse.c3
-rw-r--r--fs/fuse/file.c5
-rw-r--r--fs/fuse/fuse_i.h1
-rw-r--r--fs/fuse/inode.c15
-rw-r--r--fs/gfs2/aops.c49
-rw-r--r--fs/gfs2/export.c3
-rw-r--r--fs/gfs2/file.c2
-rw-r--r--fs/gfs2/glock.c49
-rw-r--r--fs/gfs2/glock.h1
-rw-r--r--fs/gfs2/glops.c4
-rw-r--r--fs/gfs2/lock_dlm.c8
-rw-r--r--fs/gfs2/log.c63
-rw-r--r--fs/gfs2/lops.c21
-rw-r--r--fs/gfs2/meta_io.c9
-rw-r--r--fs/gfs2/ops_fstype.c4
-rw-r--r--fs/gfs2/quota.c22
-rw-r--r--fs/gfs2/recovery.c2
-rw-r--r--fs/gfs2/rgrp.c12
-rw-r--r--fs/gfs2/super.c88
-rw-r--r--fs/gfs2/sys.c2
-rw-r--r--fs/gfs2/trans.c2
-rw-r--r--fs/gfs2/util.c4
-rw-r--r--fs/gfs2/util.h15
-rw-r--r--fs/hfs/inode.c8
-rw-r--r--fs/hfsplus/hfsplus_fs.h1
-rw-r--r--fs/hfsplus/inode.c8
-rw-r--r--fs/hfsplus/super.c12
-rw-r--r--fs/hfsplus/wrapper.c5
-rw-r--r--fs/hostfs/hostfs_kern.c8
-rw-r--r--fs/hugetlbfs/inode.c31
-rw-r--r--fs/inode.c76
-rw-r--r--fs/internal.h20
-rw-r--r--fs/ioctl.c3
-rw-r--r--fs/iomap/buffered-io.c14
-rw-r--r--fs/jbd2/checkpoint.c11
-rw-r--r--fs/jbd2/journal.c11
-rw-r--r--fs/jbd2/recovery.c7
-rw-r--r--fs/jbd2/transaction.c14
-rw-r--r--fs/jffs2/debug.c2
-rw-r--r--fs/jfs/jfs_dmap.c57
-rw-r--r--fs/jfs/jfs_dtree.c7
-rw-r--r--fs/jfs/jfs_imap.c3
-rw-r--r--fs/jfs/jfs_mount.c6
-rw-r--r--fs/jfs/jfs_txnmgr.c2
-rw-r--r--fs/kernfs/dir.c62
-rw-r--r--fs/kernfs/file.c2
-rw-r--r--fs/kernfs/mount.c3
-rw-r--r--fs/libfs.c62
-rw-r--r--fs/lockd/svc.c11
-rw-r--r--fs/locks.c1
-rw-r--r--fs/minix/dir.c83
-rw-r--r--fs/minix/inode.c9
-rw-r--r--fs/minix/namei.c12
-rw-r--r--fs/mnt_idmapping.c159
-rw-r--r--fs/mount.h27
-rw-r--r--fs/mpage.c62
-rw-r--r--fs/namei.c141
-rw-r--r--fs/namespace.c659
-rw-r--r--fs/netfs/Kconfig39
-rw-r--r--fs/netfs/Makefile22
-rw-r--r--fs/netfs/buffered_read.c237
-rw-r--r--fs/netfs/buffered_write.c1257
-rw-r--r--fs/netfs/direct_read.c125
-rw-r--r--fs/netfs/direct_write.c174
-rw-r--r--fs/netfs/fscache_cache.c (renamed from fs/fscache/cache.c)3
-rw-r--r--fs/netfs/fscache_cookie.c (renamed from fs/fscache/cookie.c)0
-rw-r--r--fs/netfs/fscache_internal.h14
-rw-r--r--fs/netfs/fscache_io.c (renamed from fs/fscache/io.c)42
-rw-r--r--fs/netfs/fscache_main.c (renamed from fs/fscache/main.c)25
-rw-r--r--fs/netfs/fscache_proc.c (renamed from fs/fscache/proc.c)23
-rw-r--r--fs/netfs/fscache_stats.c (renamed from fs/fscache/stats.c)13
-rw-r--r--fs/netfs/fscache_volume.c (renamed from fs/fscache/volume.c)0
-rw-r--r--fs/netfs/internal.h284
-rw-r--r--fs/netfs/io.c217
-rw-r--r--fs/netfs/iterator.c97
-rw-r--r--fs/netfs/locking.c216
-rw-r--r--fs/netfs/main.c109
-rw-r--r--fs/netfs/misc.c260
-rw-r--r--fs/netfs/objects.c59
-rw-r--r--fs/netfs/output.c478
-rw-r--r--fs/netfs/stats.c42
-rw-r--r--fs/nfs/Kconfig4
-rw-r--r--fs/nfs/blocklayout/blocklayout.c7
-rw-r--r--fs/nfs/blocklayout/dev.c3
-rw-r--r--fs/nfs/blocklayout/rpc_pipefs.c2
-rw-r--r--fs/nfs/callback.c13
-rw-r--r--fs/nfs/callback.h24
-rw-r--r--fs/nfs/callback_proc.c55
-rw-r--r--fs/nfs/callback_xdr.c5
-rw-r--r--fs/nfs/client.c13
-rw-r--r--fs/nfs/dir.c8
-rw-r--r--fs/nfs/direct.c11
-rw-r--r--fs/nfs/file.c3
-rw-r--r--fs/nfs/fscache.c7
-rw-r--r--fs/nfs/fscache.h2
-rw-r--r--fs/nfs/internal.h3
-rw-r--r--fs/nfs/nfs42xattr.c8
-rw-r--r--fs/nfs/nfs4file.c5
-rw-r--r--fs/nfs/nfs4proc.c3
-rw-r--r--fs/nfs/nfs4sysctl.c1
-rw-r--r--fs/nfs/nfs4xdr.c23
-rw-r--r--fs/nfs/nfstrace.h22
-rw-r--r--fs/nfs/pnfs.c3
-rw-r--r--fs/nfs/sysctl.c1
-rw-r--r--fs/nfs/unlink.c2
-rw-r--r--fs/nfs/write.c23
-rw-r--r--fs/nfsd/Kconfig16
-rw-r--r--fs/nfsd/filecache.c6
-rw-r--r--fs/nfsd/netns.h11
-rw-r--r--fs/nfsd/nfs4callback.c26
-rw-r--r--fs/nfsd/nfs4proc.c7
-rw-r--r--fs/nfsd/nfs4recover.c97
-rw-r--r--fs/nfsd/nfs4state.c39
-rw-r--r--fs/nfsd/nfs4xdr.c13
-rw-r--r--fs/nfsd/nfscache.c6
-rw-r--r--fs/nfsd/nfsctl.c98
-rw-r--r--fs/nfsd/nfsd.h2
-rw-r--r--fs/nfsd/nfssvc.c69
-rw-r--r--fs/nfsd/trace.h22
-rw-r--r--fs/nfsd/vfs.c57
-rw-r--r--fs/nfsd/vfs.h1
-rw-r--r--fs/nfsd/xdr4.h1
-rw-r--r--fs/nilfs2/btnode.c62
-rw-r--r--fs/nilfs2/cpfile.c28
-rw-r--r--fs/nilfs2/dir.c244
-rw-r--r--fs/nilfs2/file.c34
-rw-r--r--fs/nilfs2/gcinode.c4
-rw-r--r--fs/nilfs2/inode.c15
-rw-r--r--fs/nilfs2/ioctl.c10
-rw-r--r--fs/nilfs2/mdt.c23
-rw-r--r--fs/nilfs2/namei.c45
-rw-r--r--fs/nilfs2/nilfs.h20
-rw-r--r--fs/nilfs2/page.c93
-rw-r--r--fs/nilfs2/page.h12
-rw-r--r--fs/nilfs2/recovery.c7
-rw-r--r--fs/nilfs2/segment.c166
-rw-r--r--fs/nilfs2/sufile.c9
-rw-r--r--fs/nilfs2/super.c8
-rw-r--r--fs/notify/dnotify/dnotify.c1
-rw-r--r--fs/notify/fanotify/fanotify.c34
-rw-r--r--fs/notify/fanotify/fanotify.h16
-rw-r--r--fs/notify/fanotify/fanotify_user.c125
-rw-r--r--fs/notify/fsnotify.c2
-rw-r--r--fs/notify/inotify/inotify_user.c1
-rw-r--r--fs/notify/mark.c52
-rw-r--r--fs/nsfs.c7
-rw-r--r--fs/ntfs/aops.c20
-rw-r--r--fs/ntfs/dir.c3
-rw-r--r--fs/ntfs/sysctl.c1
-rw-r--r--fs/ntfs3/attrib.c45
-rw-r--r--fs/ntfs3/attrlist.c12
-rw-r--r--fs/ntfs3/bitmap.c4
-rw-r--r--fs/ntfs3/dir.c48
-rw-r--r--fs/ntfs3/file.c76
-rw-r--r--fs/ntfs3/frecord.c19
-rw-r--r--fs/ntfs3/fslog.c232
-rw-r--r--fs/ntfs3/fsntfs.c29
-rw-r--r--fs/ntfs3/index.c8
-rw-r--r--fs/ntfs3/inode.c32
-rw-r--r--fs/ntfs3/namei.c12
-rw-r--r--fs/ntfs3/ntfs.h4
-rw-r--r--fs/ntfs3/ntfs_fs.h29
-rw-r--r--fs/ntfs3/record.c18
-rw-r--r--fs/ntfs3/super.c54
-rw-r--r--fs/ntfs3/xattr.c6
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/aops.c17
-rw-r--r--fs/ocfs2/dcache.c7
-rw-r--r--fs/ocfs2/dir.c9
-rw-r--r--fs/ocfs2/export.c1
-rw-r--r--fs/ocfs2/file.c2
-rw-r--r--fs/ocfs2/namei.c8
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
-rw-r--r--fs/ocfs2/stackglue.c1
-rw-r--r--fs/open.c47
-rw-r--r--fs/orangefs/dir.c32
-rw-r--r--fs/overlayfs/Kconfig1
-rw-r--r--fs/overlayfs/copy_up.c51
-rw-r--r--fs/overlayfs/dir.c4
-rw-r--r--fs/overlayfs/export.c27
-rw-r--r--fs/overlayfs/file.c247
-rw-r--r--fs/overlayfs/namei.c47
-rw-r--r--fs/overlayfs/overlayfs.h31
-rw-r--r--fs/overlayfs/ovl_entry.h9
-rw-r--r--fs/overlayfs/params.c2
-rw-r--r--fs/overlayfs/readdir.c9
-rw-r--r--fs/overlayfs/super.c52
-rw-r--r--fs/overlayfs/util.c62
-rw-r--r--fs/pipe.c25
-rw-r--r--fs/pnode.c2
-rw-r--r--fs/posix_acl.c4
-rw-r--r--fs/proc/array.c66
-rw-r--r--fs/proc/base.c31
-rw-r--r--fs/proc/inode.c19
-rw-r--r--fs/proc/internal.h2
-rw-r--r--fs/proc/proc_sysctl.c24
-rw-r--r--fs/proc/root.c2
-rw-r--r--fs/proc/task_mmu.c47
-rw-r--r--fs/proc_namespace.c13
-rw-r--r--fs/pstore/inode.c109
-rw-r--r--fs/pstore/ram.c1
-rw-r--r--fs/pstore/ram_core.c2
-rw-r--r--fs/qnx4/dir.c52
-rw-r--r--fs/qnx4/namei.c29
-rw-r--r--fs/qnx4/qnx4.h60
-rw-r--r--fs/quota/dquot.c7
-rw-r--r--fs/ramfs/file-nommu.c2
-rw-r--r--fs/read_write.c235
-rw-r--r--fs/readdir.c4
-rw-r--r--fs/reiserfs/namei.c61
-rw-r--r--fs/reiserfs/stree.c2
-rw-r--r--fs/remap_range.c52
-rw-r--r--fs/smb/client/cached_dir.c25
-rw-r--r--fs/smb/client/cifs_debug.c6
-rw-r--r--fs/smb/client/cifsencrypt.c2
-rw-r--r--fs/smb/client/cifsfs.c36
-rw-r--r--fs/smb/client/cifsfs.h4
-rw-r--r--fs/smb/client/cifsglob.h71
-rw-r--r--fs/smb/client/cifsproto.h32
-rw-r--r--fs/smb/client/cifssmb.c31
-rw-r--r--fs/smb/client/connect.c54
-rw-r--r--fs/smb/client/dfs.c7
-rw-r--r--fs/smb/client/dir.c7
-rw-r--r--fs/smb/client/file.c56
-rw-r--r--fs/smb/client/fs_context.c19
-rw-r--r--fs/smb/client/fs_context.h2
-rw-r--r--fs/smb/client/fscache.c2
-rw-r--r--fs/smb/client/inode.c145
-rw-r--r--fs/smb/client/link.c29
-rw-r--r--fs/smb/client/misc.c1
-rw-r--r--fs/smb/client/namespace.c16
-rw-r--r--fs/smb/client/readdir.c154
-rw-r--r--fs/smb/client/sess.c60
-rw-r--r--fs/smb/client/smb2glob.h26
-rw-r--r--fs/smb/client/smb2inode.c1176
-rw-r--r--fs/smb/client/smb2maperror.c2
-rw-r--r--fs/smb/client/smb2ops.c511
-rw-r--r--fs/smb/client/smb2pdu.c428
-rw-r--r--fs/smb/client/smb2proto.h40
-rw-r--r--fs/smb/client/smb2status.h2
-rw-r--r--fs/smb/client/smbdirect.c4
-rw-r--r--fs/smb/client/smbencrypt.c7
-rw-r--r--fs/smb/client/trace.h7
-rw-r--r--fs/smb/client/transport.c18
-rw-r--r--fs/smb/server/asn1.c5
-rw-r--r--fs/smb/server/auth.c14
-rw-r--r--fs/smb/server/connection.c7
-rw-r--r--fs/smb/server/connection.h2
-rw-r--r--fs/smb/server/ksmbd_netlink.h3
-rw-r--r--fs/smb/server/mgmt/ksmbd_ida.c21
-rw-r--r--fs/smb/server/misc.c1
-rw-r--r--fs/smb/server/oplock.c22
-rw-r--r--fs/smb/server/smb2pdu.c61
-rw-r--r--fs/smb/server/smb_common.c6
-rw-r--r--fs/smb/server/smbacl.c11
-rw-r--r--fs/smb/server/transport_ipc.c4
-rw-r--r--fs/smb/server/transport_rdma.c11
-rw-r--r--fs/smb/server/transport_tcp.c15
-rw-r--r--fs/smb/server/vfs.c33
-rw-r--r--fs/splice.c243
-rw-r--r--fs/squashfs/file.c3
-rw-r--r--fs/squashfs/file_direct.c6
-rw-r--r--fs/stat.c11
-rw-r--r--fs/super.c525
-rw-r--r--fs/sysctls.c1
-rw-r--r--fs/sysfs/dir.c2
-rw-r--r--fs/sysv/itree.c9
-rw-r--r--fs/tracefs/event_inode.c905
-rw-r--r--fs/tracefs/inode.c286
-rw-r--r--fs/tracefs/internal.h48
-rw-r--r--fs/ubifs/auth.c21
-rw-r--r--fs/ubifs/commit.c13
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/file.c30
-rw-r--r--fs/ubifs/replay.c2
-rw-r--r--fs/udf/namei.c18
-rw-r--r--fs/ufs/inode.c11
-rw-r--r--fs/userfaultfd.c78
-rw-r--r--fs/vboxsf/vboxsf_wrappers.c2
-rw-r--r--fs/verity/fsverity_private.h10
-rw-r--r--fs/verity/init.c2
-rw-r--r--fs/verity/measure.c84
-rw-r--r--fs/xfs/Makefile21
-rw-r--r--fs/xfs/libxfs/xfs_ag.c38
-rw-r--r--fs/xfs/libxfs/xfs_ag.h12
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c2
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c116
-rw-r--r--fs/xfs/libxfs/xfs_alloc.h24
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c13
-rw-r--r--fs/xfs/libxfs/xfs_attr.c131
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c238
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.h8
-rw-r--r--fs/xfs/libxfs/xfs_attr_sf.h24
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c201
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h9
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c123
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_btree.c28
-rw-r--r--fs/xfs/libxfs/xfs_btree.h5
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.c89
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.h33
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c69
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.h2
-rw-r--r--fs/xfs/libxfs/xfs_da_format.h33
-rw-r--r--fs/xfs/libxfs/xfs_defer.c453
-rw-r--r--fs/xfs/libxfs/xfs_defer.h59
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c6
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c91
-rw-r--r--fs/xfs/libxfs/xfs_format.h19
-rw-r--r--fs/xfs/libxfs/xfs_health.h10
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c36
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.h3
-rw-r--r--fs/xfs/libxfs/xfs_ialloc_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c59
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c78
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h13
-rw-r--r--fs/xfs/libxfs/xfs_log_recover.h8
-rw-r--r--fs/xfs/libxfs/xfs_ondisk.h (renamed from fs/xfs/xfs_ondisk.h)22
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c57
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h12
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c15
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c106
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.h4
-rw-r--r--fs/xfs/libxfs/xfs_sb.c20
-rw-r--r--fs/xfs/libxfs/xfs_sb.h2
-rw-r--r--fs/xfs/libxfs/xfs_shared.h2
-rw-r--r--fs/xfs/libxfs/xfs_symlink_remote.c12
-rw-r--r--fs/xfs/libxfs/xfs_types.h20
-rw-r--r--fs/xfs/scrub/agb_bitmap.c103
-rw-r--r--fs/xfs/scrub/agb_bitmap.h68
-rw-r--r--fs/xfs/scrub/agheader_repair.c19
-rw-r--r--fs/xfs/scrub/alloc.c52
-rw-r--r--fs/xfs/scrub/alloc_repair.c934
-rw-r--r--fs/xfs/scrub/attr.c17
-rw-r--r--fs/xfs/scrub/bitmap.c467
-rw-r--r--fs/xfs/scrub/bitmap.h111
-rw-r--r--fs/xfs/scrub/bmap.c162
-rw-r--r--fs/xfs/scrub/bmap_repair.c867
-rw-r--r--fs/xfs/scrub/common.c35
-rw-r--r--fs/xfs/scrub/common.h56
-rw-r--r--fs/xfs/scrub/cow_repair.c614
-rw-r--r--fs/xfs/scrub/dir.c42
-rw-r--r--fs/xfs/scrub/dqiterate.c211
-rw-r--r--fs/xfs/scrub/fsb_bitmap.h37
-rw-r--r--fs/xfs/scrub/health.c34
-rw-r--r--fs/xfs/scrub/health.h2
-rw-r--r--fs/xfs/scrub/ialloc.c39
-rw-r--r--fs/xfs/scrub/ialloc_repair.c884
-rw-r--r--fs/xfs/scrub/inode.c20
-rw-r--r--fs/xfs/scrub/inode_repair.c1525
-rw-r--r--fs/xfs/scrub/newbt.c559
-rw-r--r--fs/xfs/scrub/newbt.h68
-rw-r--r--fs/xfs/scrub/off_bitmap.h37
-rw-r--r--fs/xfs/scrub/parent.c17
-rw-r--r--fs/xfs/scrub/quota.c107
-rw-r--r--fs/xfs/scrub/quota.h36
-rw-r--r--fs/xfs/scrub/quota_repair.c575
-rw-r--r--fs/xfs/scrub/readdir.c6
-rw-r--r--fs/xfs/scrub/reap.c168
-rw-r--r--fs/xfs/scrub/reap.h5
-rw-r--r--fs/xfs/scrub/refcount.c2
-rw-r--r--fs/xfs/scrub/refcount_repair.c794
-rw-r--r--fs/xfs/scrub/repair.c391
-rw-r--r--fs/xfs/scrub/repair.h99
-rw-r--r--fs/xfs/scrub/rmap.c1
-rw-r--r--fs/xfs/scrub/rtbitmap.c108
-rw-r--r--fs/xfs/scrub/rtbitmap.h22
-rw-r--r--fs/xfs/scrub/rtbitmap_repair.c202
-rw-r--r--fs/xfs/scrub/rtsummary.c144
-rw-r--r--fs/xfs/scrub/scrub.c62
-rw-r--r--fs/xfs/scrub/scrub.h15
-rw-r--r--fs/xfs/scrub/symlink.c22
-rw-r--r--fs/xfs/scrub/trace.c3
-rw-r--r--fs/xfs/scrub/trace.h488
-rw-r--r--fs/xfs/scrub/xfarray.h22
-rw-r--r--fs/xfs/xfs_aops.c2
-rw-r--r--fs/xfs/xfs_attr_item.c295
-rw-r--r--fs/xfs/xfs_attr_list.c13
-rw-r--r--fs/xfs/xfs_bmap_item.c200
-rw-r--r--fs/xfs/xfs_bmap_util.c141
-rw-r--r--fs/xfs/xfs_bmap_util.h2
-rw-r--r--fs/xfs/xfs_buf.c50
-rw-r--r--fs/xfs/xfs_buf.h1
-rw-r--r--fs/xfs/xfs_dir2_readdir.c9
-rw-r--r--fs/xfs/xfs_dquot.c39
-rw-r--r--fs/xfs/xfs_dquot.h8
-rw-r--r--fs/xfs/xfs_extent_busy.c13
-rw-r--r--fs/xfs/xfs_extent_busy.h2
-rw-r--r--fs/xfs/xfs_extfree_item.c332
-rw-r--r--fs/xfs/xfs_fsops.c63
-rw-r--r--fs/xfs/xfs_fsops.h14
-rw-r--r--fs/xfs/xfs_globals.c12
-rw-r--r--fs/xfs/xfs_health.c8
-rw-r--r--fs/xfs/xfs_inode.c65
-rw-r--r--fs/xfs/xfs_inode.h2
-rw-r--r--fs/xfs/xfs_inode_item.c13
-rw-r--r--fs/xfs/xfs_ioctl.c115
-rw-r--r--fs/xfs/xfs_log.c1
-rw-r--r--fs/xfs/xfs_log_priv.h1
-rw-r--r--fs/xfs/xfs_log_recover.c131
-rw-r--r--fs/xfs/xfs_mount.c8
-rw-r--r--fs/xfs/xfs_notify_failure.c108
-rw-r--r--fs/xfs/xfs_qm.c2
-rw-r--r--fs/xfs/xfs_quota.h5
-rw-r--r--fs/xfs/xfs_refcount_item.c234
-rw-r--r--fs/xfs/xfs_reflink.c2
-rw-r--r--fs/xfs/xfs_rmap_item.c257
-rw-r--r--fs/xfs/xfs_rtalloc.c659
-rw-r--r--fs/xfs/xfs_rtalloc.h37
-rw-r--r--fs/xfs/xfs_super.c58
-rw-r--r--fs/xfs/xfs_symlink.c7
-rw-r--r--fs/xfs/xfs_sysctl.c2
-rw-r--r--fs/xfs/xfs_sysctl.h2
-rw-r--r--fs/xfs/xfs_sysfs.c63
-rw-r--r--fs/xfs/xfs_trace.h42
-rw-r--r--fs/xfs/xfs_trans.c62
-rw-r--r--fs/xfs/xfs_trans.h16
-rw-r--r--fs/xfs/xfs_xattr.c6
-rw-r--r--fs/zonefs/file.c44
-rw-r--r--fs/zonefs/super.c68
797 files changed, 41885 insertions, 24172 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index 731e3d14b67d..0e8418066a48 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -42,6 +42,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb);
void v9fs_free_inode(struct inode *inode);
struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode,
dev_t rdev);
+void v9fs_set_netfs_context(struct inode *inode);
int v9fs_init_inode(struct v9fs_session_info *v9ses,
struct inode *inode, umode_t mode, dev_t rdev);
void v9fs_evict_inode(struct inode *inode);
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 8a635999a7d6..047855033d32 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -19,12 +19,45 @@
#include <linux/netfs.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
+#include <trace/events/netfs.h>
#include "v9fs.h"
#include "v9fs_vfs.h"
#include "cache.h"
#include "fid.h"
+static void v9fs_upload_to_server(struct netfs_io_subrequest *subreq)
+{
+ struct p9_fid *fid = subreq->rreq->netfs_priv;
+ int err, len;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err);
+ netfs_write_subrequest_terminated(subreq, len ?: err, false);
+}
+
+static void v9fs_upload_to_server_worker(struct work_struct *work)
+{
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
+
+ v9fs_upload_to_server(subreq);
+}
+
+/*
+ * Set up write requests for a writeback slice. We need to add a write request
+ * for each write we want to make.
+ */
+static void v9fs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER,
+ start, len, v9fs_upload_to_server_worker);
+ if (subreq)
+ netfs_queue_write_request(subreq);
+}
+
/**
* v9fs_issue_read - Issue a read from 9P
* @subreq: The read to make
@@ -33,14 +66,10 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
{
struct netfs_io_request *rreq = subreq->rreq;
struct p9_fid *fid = rreq->netfs_priv;
- struct iov_iter to;
- loff_t pos = subreq->start + subreq->transferred;
- size_t len = subreq->len - subreq->transferred;
int total, err;
- iov_iter_xarray(&to, ITER_DEST, &rreq->mapping->i_pages, pos, len);
-
- total = p9_client_read(fid, pos, &to, &err);
+ total = p9_client_read(fid, subreq->start + subreq->transferred,
+ &subreq->io_iter, &err);
/* if we just extended the file size, any portion not in
* cache won't be on server and is zeroes */
@@ -50,25 +79,42 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq)
}
/**
- * v9fs_init_request - Initialise a read request
+ * v9fs_init_request - Initialise a request
* @rreq: The read request
* @file: The file being read from
*/
static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file)
{
- struct p9_fid *fid = file->private_data;
-
- BUG_ON(!fid);
+ struct p9_fid *fid;
+ bool writing = (rreq->origin == NETFS_READ_FOR_WRITE ||
+ rreq->origin == NETFS_WRITEBACK ||
+ rreq->origin == NETFS_WRITETHROUGH ||
+ rreq->origin == NETFS_LAUNDER_WRITE ||
+ rreq->origin == NETFS_UNBUFFERED_WRITE ||
+ rreq->origin == NETFS_DIO_WRITE);
+
+ if (file) {
+ fid = file->private_data;
+ if (!fid)
+ goto no_fid;
+ p9_fid_get(fid);
+ } else {
+ fid = v9fs_fid_find_inode(rreq->inode, writing, INVALID_UID, true);
+ if (!fid)
+ goto no_fid;
+ }
/* we might need to read from a fid that was opened write-only
* for read-modify-write of page cache, use the writeback fid
* for that */
- WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE &&
- !(fid->mode & P9_ORDWR));
-
- p9_fid_get(fid);
+ WARN_ON(rreq->origin == NETFS_READ_FOR_WRITE && !(fid->mode & P9_ORDWR));
rreq->netfs_priv = fid;
return 0;
+
+no_fid:
+ WARN_ONCE(1, "folio expected an open fid inode->i_ino=%lx\n",
+ rreq->inode->i_ino);
+ return -EINVAL;
}
/**
@@ -82,281 +128,20 @@ static void v9fs_free_request(struct netfs_io_request *rreq)
p9_fid_put(fid);
}
-/**
- * v9fs_begin_cache_operation - Begin a cache operation for a read
- * @rreq: The read request
- */
-static int v9fs_begin_cache_operation(struct netfs_io_request *rreq)
-{
-#ifdef CONFIG_9P_FSCACHE
- struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode));
-
- return fscache_begin_read_operation(&rreq->cache_resources, cookie);
-#else
- return -ENOBUFS;
-#endif
-}
-
const struct netfs_request_ops v9fs_req_ops = {
.init_request = v9fs_init_request,
.free_request = v9fs_free_request,
- .begin_cache_operation = v9fs_begin_cache_operation,
.issue_read = v9fs_issue_read,
+ .create_write_requests = v9fs_create_write_requests,
};
-/**
- * v9fs_release_folio - release the private state associated with a folio
- * @folio: The folio to be released
- * @gfp: The caller's allocation restrictions
- *
- * Returns true if the page can be released, false otherwise.
- */
-
-static bool v9fs_release_folio(struct folio *folio, gfp_t gfp)
-{
- if (folio_test_private(folio))
- return false;
-#ifdef CONFIG_9P_FSCACHE
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- fscache_note_page_release(v9fs_inode_cookie(V9FS_I(folio_inode(folio))));
-#endif
- return true;
-}
-
-static void v9fs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length)
-{
- folio_wait_fscache(folio);
-}
-
-#ifdef CONFIG_9P_FSCACHE
-static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct v9fs_inode *v9inode = priv;
- __le32 version;
-
- if (IS_ERR_VALUE(transferred_or_error) &&
- transferred_or_error != -ENOBUFS) {
- version = cpu_to_le32(v9inode->qid.version);
- fscache_invalidate(v9fs_inode_cookie(v9inode), &version,
- i_size_read(&v9inode->netfs.inode), 0);
- }
-}
-#endif
-
-static int v9fs_vfs_write_folio_locked(struct folio *folio)
-{
- struct inode *inode = folio_inode(folio);
- loff_t start = folio_pos(folio);
- loff_t i_size = i_size_read(inode);
- struct iov_iter from;
- size_t len = folio_size(folio);
- struct p9_fid *writeback_fid;
- int err;
- struct v9fs_inode __maybe_unused *v9inode = V9FS_I(inode);
- struct fscache_cookie __maybe_unused *cookie = v9fs_inode_cookie(v9inode);
-
- if (start >= i_size)
- return 0; /* Simultaneous truncation occurred */
-
- len = min_t(loff_t, i_size - start, len);
-
- iov_iter_xarray(&from, ITER_SOURCE, &folio_mapping(folio)->i_pages, start, len);
-
- writeback_fid = v9fs_fid_find_inode(inode, true, INVALID_UID, true);
- if (!writeback_fid) {
- WARN_ONCE(1, "folio expected an open fid inode->i_private=%p\n",
- inode->i_private);
- return -EINVAL;
- }
-
- folio_wait_fscache(folio);
- folio_start_writeback(folio);
-
- p9_client_write(writeback_fid, start, &from, &err);
-
-#ifdef CONFIG_9P_FSCACHE
- if (err == 0 &&
- fscache_cookie_enabled(cookie) &&
- test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) {
- folio_start_fscache(folio);
- fscache_write_to_cache(v9fs_inode_cookie(v9inode),
- folio_mapping(folio), start, len, i_size,
- v9fs_write_to_cache_done, v9inode,
- true);
- }
-#endif
-
- folio_end_writeback(folio);
- p9_fid_put(writeback_fid);
-
- return err;
-}
-
-static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(page);
- int retval;
-
- p9_debug(P9_DEBUG_VFS, "folio %p\n", folio);
-
- retval = v9fs_vfs_write_folio_locked(folio);
- if (retval < 0) {
- if (retval == -EAGAIN) {
- folio_redirty_for_writepage(wbc, folio);
- retval = 0;
- } else {
- mapping_set_error(folio_mapping(folio), retval);
- }
- } else
- retval = 0;
-
- folio_unlock(folio);
- return retval;
-}
-
-static int v9fs_launder_folio(struct folio *folio)
-{
- int retval;
-
- if (folio_clear_dirty_for_io(folio)) {
- retval = v9fs_vfs_write_folio_locked(folio);
- if (retval)
- return retval;
- }
- folio_wait_fscache(folio);
- return 0;
-}
-
-/**
- * v9fs_direct_IO - 9P address space operation for direct I/O
- * @iocb: target I/O control block
- * @iter: The data/buffer to use
- *
- * The presence of v9fs_direct_IO() in the address space ops vector
- * allowes open() O_DIRECT flags which would have failed otherwise.
- *
- * In the non-cached mode, we shunt off direct read and write requests before
- * the VFS gets them, so this method should never be called.
- *
- * Direct IO is not 'yet' supported in the cached mode. Hence when
- * this routine is called through generic_file_aio_read(), the read/write fails
- * with an error.
- *
- */
-static ssize_t
-v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
-{
- struct file *file = iocb->ki_filp;
- loff_t pos = iocb->ki_pos;
- ssize_t n;
- int err = 0;
-
- if (iov_iter_rw(iter) == WRITE) {
- n = p9_client_write(file->private_data, pos, iter, &err);
- if (n) {
- struct inode *inode = file_inode(file);
- loff_t i_size = i_size_read(inode);
-
- if (pos + n > i_size)
- inode_add_bytes(inode, pos + n - i_size);
- }
- } else {
- n = p9_client_read(file->private_data, pos, iter, &err);
- }
- return n ? n : err;
-}
-
-static int v9fs_write_begin(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned int len,
- struct page **subpagep, void **fsdata)
-{
- int retval;
- struct folio *folio;
- struct v9fs_inode *v9inode = V9FS_I(mapping->host);
-
- p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
-
- /* Prefetch area to be written into the cache if we're caching this
- * file. We need to do this before we get a lock on the page in case
- * there's more than one writer competing for the same cache block.
- */
- retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata);
- if (retval < 0)
- return retval;
-
- *subpagep = &folio->page;
- return retval;
-}
-
-static int v9fs_write_end(struct file *filp, struct address_space *mapping,
- loff_t pos, unsigned int len, unsigned int copied,
- struct page *subpage, void *fsdata)
-{
- loff_t last_pos = pos + copied;
- struct folio *folio = page_folio(subpage);
- struct inode *inode = mapping->host;
-
- p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
-
- if (!folio_test_uptodate(folio)) {
- if (unlikely(copied < len)) {
- copied = 0;
- goto out;
- }
-
- folio_mark_uptodate(folio);
- }
-
- /*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hold the i_mutex.
- */
- if (last_pos > inode->i_size) {
- inode_add_bytes(inode, last_pos - inode->i_size);
- i_size_write(inode, last_pos);
-#ifdef CONFIG_9P_FSCACHE
- fscache_update_cookie(v9fs_inode_cookie(V9FS_I(inode)), NULL,
- &last_pos);
-#endif
- }
- folio_mark_dirty(folio);
-out:
- folio_unlock(folio);
- folio_put(folio);
-
- return copied;
-}
-
-#ifdef CONFIG_9P_FSCACHE
-/*
- * Mark a page as having been made dirty and thus needing writeback. We also
- * need to pin the cache object to write back to.
- */
-static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
- struct v9fs_inode *v9inode = V9FS_I(mapping->host);
-
- return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode));
-}
-#else
-#define v9fs_dirty_folio filemap_dirty_folio
-#endif
-
const struct address_space_operations v9fs_addr_operations = {
- .read_folio = netfs_read_folio,
- .readahead = netfs_readahead,
- .dirty_folio = v9fs_dirty_folio,
- .writepage = v9fs_vfs_writepage,
- .write_begin = v9fs_write_begin,
- .write_end = v9fs_write_end,
- .release_folio = v9fs_release_folio,
- .invalidate_folio = v9fs_invalidate_folio,
- .launder_folio = v9fs_launder_folio,
- .direct_IO = v9fs_direct_IO,
+ .read_folio = netfs_read_folio,
+ .readahead = netfs_readahead,
+ .dirty_folio = netfs_dirty_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
+ .launder_folio = netfs_launder_folio,
+ .direct_IO = noop_direct_IO,
+ .writepages = netfs_writepages,
};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 11cd8d23f6f2..bae330c2f0cf 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -353,25 +353,15 @@ static ssize_t
v9fs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct p9_fid *fid = iocb->ki_filp->private_data;
- int ret, err = 0;
p9_debug(P9_DEBUG_VFS, "fid %d count %zu offset %lld\n",
fid->fid, iov_iter_count(to), iocb->ki_pos);
- if (!(fid->mode & P9L_DIRECT)) {
- p9_debug(P9_DEBUG_VFS, "(cached)\n");
- return generic_file_read_iter(iocb, to);
- }
-
- if (iocb->ki_filp->f_flags & O_NONBLOCK)
- ret = p9_client_read_once(fid, iocb->ki_pos, to, &err);
- else
- ret = p9_client_read(fid, iocb->ki_pos, to, &err);
- if (!ret)
- return err;
+ if (fid->mode & P9L_DIRECT)
+ return netfs_unbuffered_read_iter(iocb, to);
- iocb->ki_pos += ret;
- return ret;
+ p9_debug(P9_DEBUG_VFS, "(cached)\n");
+ return netfs_file_read_iter(iocb, to);
}
/*
@@ -407,46 +397,14 @@ v9fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct p9_fid *fid = file->private_data;
- ssize_t retval;
- loff_t origin;
- int err = 0;
p9_debug(P9_DEBUG_VFS, "fid %d\n", fid->fid);
- if (!(fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))) {
- p9_debug(P9_DEBUG_CACHE, "(cached)\n");
- return generic_file_write_iter(iocb, from);
- }
+ if (fid->mode & (P9L_DIRECT | P9L_NOWRITECACHE))
+ return netfs_unbuffered_write_iter(iocb, from);
- retval = generic_write_checks(iocb, from);
- if (retval <= 0)
- return retval;
-
- origin = iocb->ki_pos;
- retval = p9_client_write(file->private_data, iocb->ki_pos, from, &err);
- if (retval > 0) {
- struct inode *inode = file_inode(file);
- loff_t i_size;
- unsigned long pg_start, pg_end;
-
- pg_start = origin >> PAGE_SHIFT;
- pg_end = (origin + retval - 1) >> PAGE_SHIFT;
- if (inode->i_mapping && inode->i_mapping->nrpages)
- invalidate_inode_pages2_range(inode->i_mapping,
- pg_start, pg_end);
- iocb->ki_pos += retval;
- i_size = i_size_read(inode);
- if (iocb->ki_pos > i_size) {
- inode_add_bytes(inode, iocb->ki_pos - i_size);
- /*
- * Need to serialize against i_size_write() in
- * v9fs_stat2inode()
- */
- v9fs_i_size_write(inode, iocb->ki_pos);
- }
- return retval;
- }
- return err;
+ p9_debug(P9_DEBUG_CACHE, "(cached)\n");
+ return netfs_file_write_iter(iocb, from);
}
static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
@@ -519,36 +477,7 @@ v9fs_file_mmap(struct file *filp, struct vm_area_struct *vma)
static vm_fault_t
v9fs_vm_page_mkwrite(struct vm_fault *vmf)
{
- struct folio *folio = page_folio(vmf->page);
- struct file *filp = vmf->vma->vm_file;
- struct inode *inode = file_inode(filp);
-
-
- p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n",
- folio, (unsigned long)filp->private_data);
-
- /* Wait for the page to be written to the cache before we allow it to
- * be modified. We then assume the entire page will need writing back.
- */
-#ifdef CONFIG_9P_FSCACHE
- if (folio_test_fscache(folio) &&
- folio_wait_fscache_killable(folio) < 0)
- return VM_FAULT_NOPAGE;
-#endif
-
- /* Update file times before taking page lock */
- file_update_time(filp);
-
- if (folio_lock_killable(folio) < 0)
- return VM_FAULT_RETRY;
- if (folio_mapping(folio) != inode->i_mapping)
- goto out_unlock;
- folio_wait_stable(folio);
-
- return VM_FAULT_LOCKED;
-out_unlock:
- folio_unlock(folio);
- return VM_FAULT_NOPAGE;
+ return netfs_page_mkwrite(vmf, NULL);
}
static void v9fs_mmap_vm_close(struct vm_area_struct *vma)
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index b845ee18a80b..32572982f72e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -246,10 +246,10 @@ void v9fs_free_inode(struct inode *inode)
/*
* Set parameters for the netfs library
*/
-static void v9fs_set_netfs_context(struct inode *inode)
+void v9fs_set_netfs_context(struct inode *inode)
{
struct v9fs_inode *v9inode = V9FS_I(inode);
- netfs_inode_init(&v9inode->netfs, &v9fs_req_ops);
+ netfs_inode_init(&v9inode->netfs, &v9fs_req_ops, true);
}
int v9fs_init_inode(struct v9fs_session_info *v9ses,
@@ -326,8 +326,6 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses,
err = -EINVAL;
goto error;
}
-
- v9fs_set_netfs_context(inode);
error:
return err;
@@ -359,6 +357,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev)
iput(inode);
return ERR_PTR(err);
}
+ v9fs_set_netfs_context(inode);
return inode;
}
@@ -374,11 +373,8 @@ void v9fs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
-#ifdef CONFIG_9P_FSCACHE
version = cpu_to_le32(v9inode->qid.version);
- fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode,
- &version);
-#endif
+ netfs_clear_inode_writeback(inode, &version);
clear_inode(inode);
filemap_fdatawrite(&inode->i_data);
@@ -464,6 +460,7 @@ static struct inode *v9fs_qid_iget(struct super_block *sb,
goto error;
v9fs_stat2inode(st, inode, sb, 0);
+ v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
unlock_new_inode(inode);
return inode;
@@ -1113,7 +1110,7 @@ static int v9fs_vfs_setattr(struct mnt_idmap *idmap,
if ((iattr->ia_valid & ATTR_SIZE) &&
iattr->ia_size != i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
- truncate_pagecache(inode, iattr->ia_size);
+ netfs_resize_file(netfs_inode(inode), iattr->ia_size, true);
#ifdef CONFIG_9P_FSCACHE
if (v9ses->cache & CACHE_FSCACHE) {
@@ -1181,6 +1178,7 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
+ v9inode->netfs.remote_i_size = stat->length;
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
v9fs_i_size_write(inode, stat->length);
/* not real number of blocks, but 512 byte ones ... */
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index c7319af2f471..3505227e1704 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -128,6 +128,7 @@ static struct inode *v9fs_qid_iget_dotl(struct super_block *sb,
goto error;
v9fs_stat2inode_dotl(st, inode, 0);
+ v9fs_set_netfs_context(inode);
v9fs_cache_inode_get_cookie(inode);
retval = v9fs_get_acl(inode, fid);
if (retval)
@@ -598,7 +599,7 @@ int v9fs_vfs_setattr_dotl(struct mnt_idmap *idmap,
if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size !=
i_size_read(inode)) {
truncate_setsize(inode, iattr->ia_size);
- truncate_pagecache(inode, iattr->ia_size);
+ netfs_resize_file(netfs_inode(inode), iattr->ia_size, true);
#ifdef CONFIG_9P_FSCACHE
if (v9ses->cache & CACHE_FSCACHE)
@@ -655,6 +656,7 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
mode |= inode->i_mode & ~S_IALLUGO;
inode->i_mode = mode;
+ v9inode->netfs.remote_i_size = stat->st_size;
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE))
v9fs_i_size_write(inode, stat->st_size);
inode->i_blocks = stat->st_blocks;
@@ -683,8 +685,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
inode->i_mode = mode;
}
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
- stat->st_result_mask & P9_STATS_SIZE)
+ stat->st_result_mask & P9_STATS_SIZE) {
+ v9inode->netfs.remote_i_size = stat->st_size;
v9fs_i_size_write(inode, stat->st_size);
+ }
if (stat->st_result_mask & P9_STATS_BLOCKS)
inode->i_blocks = stat->st_blocks;
}
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 73db55c050bf..941f7d0e0bfa 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -289,31 +289,21 @@ static int v9fs_drop_inode(struct inode *inode)
static int v9fs_write_inode(struct inode *inode,
struct writeback_control *wbc)
{
- struct v9fs_inode *v9inode;
-
/*
* send an fsync request to server irrespective of
* wbc->sync_mode.
*/
p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
-
- v9inode = V9FS_I(inode);
- fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
-
- return 0;
+ return netfs_unpin_writeback(inode, wbc);
}
static int v9fs_write_inode_dotl(struct inode *inode,
struct writeback_control *wbc)
{
- struct v9fs_inode *v9inode;
- v9inode = V9FS_I(inode);
p9_debug(P9_DEBUG_VFS, "%s: inode %p\n", __func__, inode);
- fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode));
-
- return 0;
+ return netfs_unpin_writeback(inode, wbc);
}
static const struct super_operations v9fs_super_ops = {
diff --git a/fs/Kconfig b/fs/Kconfig
index 42837617a55b..89fdbefd1075 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -18,6 +18,10 @@ config VALIDATE_FS_PARSER
config FS_IOMAP
bool
+# Stackable filesystems
+config FS_STACK
+ bool
+
config BUFFER_HEAD
bool
@@ -140,7 +144,6 @@ source "fs/overlayfs/Kconfig"
menu "Caches"
source "fs/netfs/Kconfig"
-source "fs/fscache/Kconfig"
source "fs/cachefiles/Kconfig"
endmenu
@@ -254,7 +257,7 @@ config TMPFS_QUOTA
config ARCH_SUPPORTS_HUGETLBFS
def_bool n
-config HUGETLBFS
+menuconfig HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || SPARC64 || ARCH_SUPPORTS_HUGETLBFS || BROKEN
depends on (SYSFS || SYSCTL)
@@ -266,6 +269,17 @@ config HUGETLBFS
If unsure, say N.
+if HUGETLBFS
+config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
+ bool "HugeTLB Vmemmap Optimization (HVO) defaults to on"
+ default n
+ depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
+ help
+ The HugeTLB Vmemmap Optimization (HVO) defaults to off. Say Y here to
+ enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
+ (boot command line) or hugetlb_optimize_vmemmap (sysctl).
+endif # HUGETLBFS
+
config HUGETLB_PAGE
def_bool HUGETLBFS
select XARRAY_MULTI
@@ -275,15 +289,6 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
depends on SPARSEMEM_VMEMMAP
-config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
- bool "HugeTLB Vmemmap Optimization (HVO) defaults to on"
- default n
- depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
- help
- The HugeTLB VmemmapvOptimization (HVO) defaults to off. Say Y here to
- enable HVO by default. It can be disabled via hugetlb_free_vmemmap=off
- (boot command line) or hugetlb_optimize_vmemmap (sysctl).
-
config ARCH_HAS_GIGANTIC_PAGE
bool
diff --git a/fs/Makefile b/fs/Makefile
index 75522f88e763..c09016257f05 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -39,6 +39,7 @@ obj-$(CONFIG_COMPAT_BINFMT_ELF) += compat_binfmt_elf.o
obj-$(CONFIG_BINFMT_ELF_FDPIC) += binfmt_elf_fdpic.o
obj-$(CONFIG_BINFMT_FLAT) += binfmt_flat.o
+obj-$(CONFIG_FS_STACK) += backing-file.o
obj-$(CONFIG_FS_MBCACHE) += mbcache.o
obj-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
obj-$(CONFIG_NFS_COMMON) += nfs_common/
@@ -60,7 +61,6 @@ obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
obj-$(CONFIG_NETFS_SUPPORT) += netfs/
-obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
obj-$(CONFIG_EXT4_FS) += ext4/
# We place ext4 before ext2 so that clean ext3 root fs's do NOT mount using the
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 3081edb09e46..a183e213a4a5 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -5,6 +5,7 @@
* Copyright (C) 1997-1999 Russell King
*/
#include <linux/buffer_head.h>
+#include <linux/mpage.h>
#include <linux/writeback.h>
#include "adfs.h"
@@ -33,9 +34,10 @@ abort_toobig:
return 0;
}
-static int adfs_writepage(struct page *page, struct writeback_control *wbc)
+static int adfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page, adfs_get_block, wbc);
+ return mpage_writepages(mapping, wbc, adfs_get_block);
}
static int adfs_read_folio(struct file *file, struct folio *folio)
@@ -76,10 +78,11 @@ static const struct address_space_operations adfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = adfs_read_folio,
- .writepage = adfs_writepage,
+ .writepages = adfs_writepages,
.write_begin = adfs_write_begin,
.write_end = generic_write_end,
- .bmap = _adfs_bmap
+ .migrate_folio = buffer_migrate_folio,
+ .bmap = _adfs_bmap,
};
/*
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 60685ec76d98..2e612834329a 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -105,6 +105,7 @@ struct affs_sb_info {
int work_queued; /* non-zero delayed work is queued */
struct delayed_work sb_work; /* superblock flush delayed work */
spinlock_t work_lock; /* protects sb_work and work_queued */
+ struct rcu_head rcu;
};
#define AFFS_MOUNT_SF_INTL 0x0001 /* International filesystem. */
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d6b9758ee23d..8c154490a2d6 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -532,9 +532,6 @@ static struct dentry *affs_get_parent(struct dentry *child)
parent = affs_iget(child->d_sb,
be32_to_cpu(AFFS_TAIL(child->d_sb, bh)->parent));
brelse(bh);
- if (IS_ERR(parent))
- return ERR_CAST(parent);
-
return d_obtain_alias(parent);
}
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 58b391446ae1..b56a95cf414a 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -640,7 +640,7 @@ static void affs_kill_sb(struct super_block *sb)
affs_brelse(sbi->s_root_bh);
kfree(sbi->s_prefix);
mutex_destroy(&sbi->s_bmlock);
- kfree(sbi);
+ kfree_rcu(sbi, rcu);
}
}
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index e8956b65d7ff..dcdc0f1bb76f 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -5,6 +5,7 @@
kafs-y := \
addr_list.o \
+ addr_prefs.o \
callback.o \
cell.o \
cmservice.o \
@@ -27,6 +28,7 @@ kafs-y := \
server.o \
server_list.o \
super.o \
+ validation.o \
vlclient.o \
vl_alias.o \
vl_list.o \
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index de1ae0bead3b..6d42f85c6be5 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -13,26 +13,55 @@
#include "internal.h"
#include "afs_fs.h"
+static void afs_free_addrlist(struct rcu_head *rcu)
+{
+ struct afs_addr_list *alist = container_of(rcu, struct afs_addr_list, rcu);
+ unsigned int i;
+
+ for (i = 0; i < alist->nr_addrs; i++)
+ rxrpc_kernel_put_peer(alist->addrs[i].peer);
+ trace_afs_alist(alist->debug_id, refcount_read(&alist->usage), afs_alist_trace_free);
+ kfree(alist);
+}
+
/*
* Release an address list.
*/
-void afs_put_addrlist(struct afs_addr_list *alist)
+void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason)
+{
+ unsigned int debug_id;
+ bool dead;
+ int r;
+
+ if (!alist)
+ return;
+ debug_id = alist->debug_id;
+ dead = __refcount_dec_and_test(&alist->usage, &r);
+ trace_afs_alist(debug_id, r - 1, reason);
+ if (dead)
+ call_rcu(&alist->rcu, afs_free_addrlist);
+}
+
+struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason)
{
- if (alist && refcount_dec_and_test(&alist->usage))
- kfree_rcu(alist, rcu);
+ int r;
+
+ if (alist) {
+ __refcount_inc(&alist->usage, &r);
+ trace_afs_alist(alist->debug_id, r + 1, reason);
+ }
+ return alist;
}
/*
* Allocate an address list.
*/
-struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
- unsigned short service,
- unsigned short port)
+struct afs_addr_list *afs_alloc_addrlist(unsigned int nr)
{
struct afs_addr_list *alist;
- unsigned int i;
+ static atomic_t debug_id;
- _enter("%u,%u,%u", nr, service, port);
+ _enter("%u", nr);
if (nr > AFS_MAX_ADDRESSES)
nr = AFS_MAX_ADDRESSES;
@@ -43,17 +72,8 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
refcount_set(&alist->usage, 1);
alist->max_addrs = nr;
-
- for (i = 0; i < nr; i++) {
- struct sockaddr_rxrpc *srx = &alist->addrs[i];
- srx->srx_family = AF_RXRPC;
- srx->srx_service = service;
- srx->transport_type = SOCK_DGRAM;
- srx->transport_len = sizeof(srx->transport.sin6);
- srx->transport.sin6.sin6_family = AF_INET6;
- srx->transport.sin6.sin6_port = htons(port);
- }
-
+ alist->debug_id = atomic_inc_return(&debug_id);
+ trace_afs_alist(alist->debug_id, 1, afs_alist_trace_alloc);
return alist;
}
@@ -126,7 +146,7 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net,
if (!vllist->servers[0].server)
goto error_vl;
- alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT);
+ alist = afs_alloc_addrlist(nr);
if (!alist)
goto error;
@@ -197,9 +217,11 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net,
}
if (family == AF_INET)
- afs_merge_fs_addr4(alist, x[0], xport);
+ ret = afs_merge_fs_addr4(net, alist, x[0], xport);
else
- afs_merge_fs_addr6(alist, x, xport);
+ ret = afs_merge_fs_addr6(net, alist, x, xport);
+ if (ret < 0)
+ goto error;
} while (p < end);
@@ -216,26 +238,13 @@ bad_address:
problem, p - text, (int)len, (int)len, text);
ret = -EINVAL;
error:
- afs_put_addrlist(alist);
+ afs_put_addrlist(alist, afs_alist_trace_put_parse_error);
error_vl:
afs_put_vlserverlist(net, vllist);
return ERR_PTR(ret);
}
/*
- * Compare old and new address lists to see if there's been any change.
- * - How to do this in better than O(Nlog(N)) time?
- * - We don't really want to sort the address list, but would rather take the
- * list as we got it so as not to undo record rotation by the DNS server.
- */
-#if 0
-static int afs_cmp_addr_list(const struct afs_addr_list *a1,
- const struct afs_addr_list *a2)
-{
-}
-#endif
-
-/*
* Perform a DNS query for VL servers and build a up an address list.
*/
struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
@@ -271,25 +280,33 @@ struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry
/*
* Merge an IPv4 entry into a fileserver address list.
*/
-void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
+int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *alist,
+ __be32 xdr, u16 port)
{
- struct sockaddr_rxrpc *srx;
- u32 addr = ntohl(xdr);
+ struct sockaddr_rxrpc srx;
+ struct rxrpc_peer *peer;
int i;
if (alist->nr_addrs >= alist->max_addrs)
- return;
+ return 0;
- for (i = 0; i < alist->nr_ipv4; i++) {
- struct sockaddr_in *a = &alist->addrs[i].transport.sin;
- u32 a_addr = ntohl(a->sin_addr.s_addr);
- u16 a_port = ntohs(a->sin_port);
+ srx.srx_family = AF_RXRPC;
+ srx.transport_type = SOCK_DGRAM;
+ srx.transport_len = sizeof(srx.transport.sin);
+ srx.transport.sin.sin_family = AF_INET;
+ srx.transport.sin.sin_port = htons(port);
+ srx.transport.sin.sin_addr.s_addr = xdr;
- if (addr == a_addr && port == a_port)
- return;
- if (addr == a_addr && port < a_port)
- break;
- if (addr < a_addr)
+ peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL);
+ if (!peer)
+ return -ENOMEM;
+
+ for (i = 0; i < alist->nr_ipv4; i++) {
+ if (peer == alist->addrs[i].peer) {
+ rxrpc_kernel_put_peer(peer);
+ return 0;
+ }
+ if (peer <= alist->addrs[i].peer)
break;
}
@@ -298,38 +315,42 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
alist->addrs + i,
sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
- srx = &alist->addrs[i];
- srx->srx_family = AF_RXRPC;
- srx->transport_type = SOCK_DGRAM;
- srx->transport_len = sizeof(srx->transport.sin);
- srx->transport.sin.sin_family = AF_INET;
- srx->transport.sin.sin_port = htons(port);
- srx->transport.sin.sin_addr.s_addr = xdr;
+ alist->addrs[i].peer = peer;
alist->nr_ipv4++;
alist->nr_addrs++;
+ return 0;
}
/*
* Merge an IPv6 entry into a fileserver address list.
*/
-void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
+int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist,
+ __be32 *xdr, u16 port)
{
- struct sockaddr_rxrpc *srx;
- int i, diff;
+ struct sockaddr_rxrpc srx;
+ struct rxrpc_peer *peer;
+ int i;
if (alist->nr_addrs >= alist->max_addrs)
- return;
+ return 0;
- for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
- struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6;
- u16 a_port = ntohs(a->sin6_port);
+ srx.srx_family = AF_RXRPC;
+ srx.transport_type = SOCK_DGRAM;
+ srx.transport_len = sizeof(srx.transport.sin6);
+ srx.transport.sin6.sin6_family = AF_INET6;
+ srx.transport.sin6.sin6_port = htons(port);
+ memcpy(&srx.transport.sin6.sin6_addr, xdr, 16);
- diff = memcmp(xdr, &a->sin6_addr, 16);
- if (diff == 0 && port == a_port)
- return;
- if (diff == 0 && port < a_port)
- break;
- if (diff < 0)
+ peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL);
+ if (!peer)
+ return -ENOMEM;
+
+ for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
+ if (peer == alist->addrs[i].peer) {
+ rxrpc_kernel_put_peer(peer);
+ return 0;
+ }
+ if (peer <= alist->addrs[i].peer)
break;
}
@@ -337,68 +358,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
memmove(alist->addrs + i + 1,
alist->addrs + i,
sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
-
- srx = &alist->addrs[i];
- srx->srx_family = AF_RXRPC;
- srx->transport_type = SOCK_DGRAM;
- srx->transport_len = sizeof(srx->transport.sin6);
- srx->transport.sin6.sin6_family = AF_INET6;
- srx->transport.sin6.sin6_port = htons(port);
- memcpy(&srx->transport.sin6.sin6_addr, xdr, 16);
+ alist->addrs[i].peer = peer;
alist->nr_addrs++;
-}
-
-/*
- * Get an address to try.
- */
-bool afs_iterate_addresses(struct afs_addr_cursor *ac)
-{
- unsigned long set, failed;
- int index;
-
- if (!ac->alist)
- return false;
-
- set = ac->alist->responded;
- failed = ac->alist->failed;
- _enter("%lx-%lx-%lx,%d", set, failed, ac->tried, ac->index);
-
- ac->nr_iterations++;
-
- set &= ~(failed | ac->tried);
-
- if (!set)
- return false;
-
- index = READ_ONCE(ac->alist->preferred);
- if (test_bit(index, &set))
- goto selected;
-
- index = __ffs(set);
-
-selected:
- ac->index = index;
- set_bit(index, &ac->tried);
- ac->responded = false;
- return true;
-}
-
-/*
- * Release an address list cursor.
- */
-int afs_end_cursor(struct afs_addr_cursor *ac)
-{
- struct afs_addr_list *alist;
-
- alist = ac->alist;
- if (alist) {
- if (ac->responded &&
- ac->index != alist->preferred &&
- test_bit(ac->alist->preferred, &ac->tried))
- WRITE_ONCE(alist->preferred, ac->index);
- afs_put_addrlist(alist);
- ac->alist = NULL;
- }
-
- return ac->error;
+ return 0;
}
diff --git a/fs/afs/addr_prefs.c b/fs/afs/addr_prefs.c
new file mode 100644
index 000000000000..a189ff8a5034
--- /dev/null
+++ b/fs/afs/addr_prefs.c
@@ -0,0 +1,531 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Address preferences management
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": addr_prefs: " fmt
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
+#include <linux/seq_file.h>
+#include <keys/rxrpc-type.h>
+#include "internal.h"
+
+static inline struct afs_net *afs_seq2net_single(struct seq_file *m)
+{
+ return afs_net(seq_file_single_net(m));
+}
+
+/*
+ * Split a NUL-terminated string up to the first newline around spaces. The
+ * source string will be modified to have NUL-terminations inserted.
+ */
+static int afs_split_string(char **pbuf, char *strv[], unsigned int maxstrv)
+{
+ unsigned int count = 0;
+ char *p = *pbuf;
+
+ maxstrv--; /* Allow for terminal NULL */
+ for (;;) {
+ /* Skip over spaces */
+ while (isspace(*p)) {
+ if (*p == '\n') {
+ p++;
+ break;
+ }
+ p++;
+ }
+ if (!*p)
+ break;
+
+ /* Mark start of word */
+ if (count >= maxstrv) {
+ pr_warn("Too many elements in string\n");
+ return -EINVAL;
+ }
+ strv[count++] = p;
+
+ /* Skip over word */
+ while (!isspace(*p))
+ p++;
+ if (!*p)
+ break;
+
+ /* Mark end of word */
+ if (*p == '\n') {
+ *p++ = 0;
+ break;
+ }
+ *p++ = 0;
+ }
+
+ *pbuf = p;
+ strv[count] = NULL;
+ return count;
+}
+
+/*
+ * Parse an address with an optional subnet mask.
+ */
+static int afs_parse_address(char *p, struct afs_addr_preference *pref)
+{
+ const char *stop;
+ unsigned long mask, tmp;
+ char *end = p + strlen(p);
+ bool bracket = false;
+
+ if (*p == '[') {
+ p++;
+ bracket = true;
+ }
+
+#if 0
+ if (*p == '[') {
+ p++;
+ q = memchr(p, ']', end - p);
+ if (!q) {
+ pr_warn("Can't find closing ']'\n");
+ return -EINVAL;
+ }
+ } else {
+ for (q = p; q < end; q++)
+ if (*q == '/')
+ break;
+ }
+#endif
+
+ if (in4_pton(p, end - p, (u8 *)&pref->ipv4_addr, -1, &stop)) {
+ pref->family = AF_INET;
+ mask = 32;
+ } else if (in6_pton(p, end - p, (u8 *)&pref->ipv6_addr, -1, &stop)) {
+ pref->family = AF_INET6;
+ mask = 128;
+ } else {
+ pr_warn("Can't determine address family\n");
+ return -EINVAL;
+ }
+
+ p = (char *)stop;
+ if (bracket) {
+ if (*p != ']') {
+ pr_warn("Can't find closing ']'\n");
+ return -EINVAL;
+ }
+ p++;
+ }
+
+ if (*p == '/') {
+ p++;
+ tmp = simple_strtoul(p, &p, 10);
+ if (tmp > mask) {
+ pr_warn("Subnet mask too large\n");
+ return -EINVAL;
+ }
+ if (tmp == 0) {
+ pr_warn("Subnet mask too small\n");
+ return -EINVAL;
+ }
+ mask = tmp;
+ }
+
+ if (*p) {
+ pr_warn("Invalid address\n");
+ return -EINVAL;
+ }
+
+ pref->subnet_mask = mask;
+ return 0;
+}
+
+enum cmp_ret {
+ CONTINUE_SEARCH,
+ INSERT_HERE,
+ EXACT_MATCH,
+ SUBNET_MATCH,
+};
+
+/*
+ * See if a candidate address matches a listed address.
+ */
+static enum cmp_ret afs_cmp_address_pref(const struct afs_addr_preference *a,
+ const struct afs_addr_preference *b)
+{
+ int subnet = min(a->subnet_mask, b->subnet_mask);
+ const __be32 *pa, *pb;
+ u32 mask, na, nb;
+ int diff;
+
+ if (a->family != b->family)
+ return INSERT_HERE;
+
+ switch (a->family) {
+ case AF_INET6:
+ pa = a->ipv6_addr.s6_addr32;
+ pb = b->ipv6_addr.s6_addr32;
+ break;
+ case AF_INET:
+ pa = &a->ipv4_addr.s_addr;
+ pb = &b->ipv4_addr.s_addr;
+ break;
+ }
+
+ while (subnet > 32) {
+ diff = ntohl(*pa++) - ntohl(*pb++);
+ if (diff < 0)
+ return INSERT_HERE; /* a<b */
+ if (diff > 0)
+ return CONTINUE_SEARCH; /* a>b */
+ subnet -= 32;
+ }
+
+ if (subnet == 0)
+ return EXACT_MATCH;
+
+ mask = 0xffffffffU << (32 - subnet);
+ na = ntohl(*pa);
+ nb = ntohl(*pb);
+ diff = (na & mask) - (nb & mask);
+ //kdebug("diff %08x %08x %08x %d", na, nb, mask, diff);
+ if (diff < 0)
+ return INSERT_HERE; /* a<b */
+ if (diff > 0)
+ return CONTINUE_SEARCH; /* a>b */
+ if (a->subnet_mask == b->subnet_mask)
+ return EXACT_MATCH;
+ if (a->subnet_mask > b->subnet_mask)
+ return SUBNET_MATCH; /* a binds tighter than b */
+ return CONTINUE_SEARCH; /* b binds tighter than a */
+}
+
+/*
+ * Insert an address preference.
+ */
+static int afs_insert_address_pref(struct afs_addr_preference_list **_preflist,
+ struct afs_addr_preference *pref,
+ int index)
+{
+ struct afs_addr_preference_list *preflist = *_preflist, *old = preflist;
+ size_t size, max_prefs;
+
+ _enter("{%u/%u/%u},%u", preflist->ipv6_off, preflist->nr, preflist->max_prefs, index);
+
+ if (preflist->nr == 255)
+ return -ENOSPC;
+ if (preflist->nr >= preflist->max_prefs) {
+ max_prefs = preflist->max_prefs + 1;
+ size = struct_size(preflist, prefs, max_prefs);
+ size = roundup_pow_of_two(size);
+ max_prefs = min_t(size_t, (size - sizeof(*preflist)) / sizeof(*pref), 255);
+ preflist = kmalloc(size, GFP_KERNEL);
+ if (!preflist)
+ return -ENOMEM;
+ *preflist = **_preflist;
+ preflist->max_prefs = max_prefs;
+ *_preflist = preflist;
+
+ if (index < preflist->nr)
+ memcpy(preflist->prefs + index + 1, old->prefs + index,
+ sizeof(*pref) * (preflist->nr - index));
+ if (index > 0)
+ memcpy(preflist->prefs, old->prefs, sizeof(*pref) * index);
+ } else {
+ if (index < preflist->nr)
+ memmove(preflist->prefs + index + 1, preflist->prefs + index,
+ sizeof(*pref) * (preflist->nr - index));
+ }
+
+ preflist->prefs[index] = *pref;
+ preflist->nr++;
+ if (pref->family == AF_INET)
+ preflist->ipv6_off++;
+ return 0;
+}
+
+/*
+ * Add an address preference.
+ * echo "add <proto> <IP>[/<mask>] <prior>" >/proc/fs/afs/addr_prefs
+ */
+static int afs_add_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist,
+ int argc, char **argv)
+{
+ struct afs_addr_preference_list *preflist = *_preflist;
+ struct afs_addr_preference pref;
+ enum cmp_ret cmp;
+ int ret, i, stop;
+
+ if (argc != 3) {
+ pr_warn("Wrong number of params\n");
+ return -EINVAL;
+ }
+
+ if (strcmp(argv[0], "udp") != 0) {
+ pr_warn("Unsupported protocol\n");
+ return -EINVAL;
+ }
+
+ ret = afs_parse_address(argv[1], &pref);
+ if (ret < 0)
+ return ret;
+
+ ret = kstrtou16(argv[2], 10, &pref.prio);
+ if (ret < 0) {
+ pr_warn("Invalid priority\n");
+ return ret;
+ }
+
+ if (pref.family == AF_INET) {
+ i = 0;
+ stop = preflist->ipv6_off;
+ } else {
+ i = preflist->ipv6_off;
+ stop = preflist->nr;
+ }
+
+ for (; i < stop; i++) {
+ cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]);
+ switch (cmp) {
+ case CONTINUE_SEARCH:
+ continue;
+ case INSERT_HERE:
+ case SUBNET_MATCH:
+ return afs_insert_address_pref(_preflist, &pref, i);
+ case EXACT_MATCH:
+ preflist->prefs[i].prio = pref.prio;
+ return 0;
+ }
+ }
+
+ return afs_insert_address_pref(_preflist, &pref, i);
+}
+
+/*
+ * Delete an address preference.
+ */
+static int afs_delete_address_pref(struct afs_addr_preference_list **_preflist,
+ int index)
+{
+ struct afs_addr_preference_list *preflist = *_preflist;
+
+ _enter("{%u/%u/%u},%u", preflist->ipv6_off, preflist->nr, preflist->max_prefs, index);
+
+ if (preflist->nr == 0)
+ return -ENOENT;
+
+ if (index < preflist->nr - 1)
+ memmove(preflist->prefs + index, preflist->prefs + index + 1,
+ sizeof(preflist->prefs[0]) * (preflist->nr - index - 1));
+
+ if (index < preflist->ipv6_off)
+ preflist->ipv6_off--;
+ preflist->nr--;
+ return 0;
+}
+
+/*
+ * Delete an address preference.
+ * echo "del <proto> <IP>[/<mask>]" >/proc/fs/afs/addr_prefs
+ */
+static int afs_del_address_pref(struct afs_net *net, struct afs_addr_preference_list **_preflist,
+ int argc, char **argv)
+{
+ struct afs_addr_preference_list *preflist = *_preflist;
+ struct afs_addr_preference pref;
+ enum cmp_ret cmp;
+ int ret, i, stop;
+
+ if (argc != 2) {
+ pr_warn("Wrong number of params\n");
+ return -EINVAL;
+ }
+
+ if (strcmp(argv[0], "udp") != 0) {
+ pr_warn("Unsupported protocol\n");
+ return -EINVAL;
+ }
+
+ ret = afs_parse_address(argv[1], &pref);
+ if (ret < 0)
+ return ret;
+
+ if (pref.family == AF_INET) {
+ i = 0;
+ stop = preflist->ipv6_off;
+ } else {
+ i = preflist->ipv6_off;
+ stop = preflist->nr;
+ }
+
+ for (; i < stop; i++) {
+ cmp = afs_cmp_address_pref(&pref, &preflist->prefs[i]);
+ switch (cmp) {
+ case CONTINUE_SEARCH:
+ continue;
+ case INSERT_HERE:
+ case SUBNET_MATCH:
+ return 0;
+ case EXACT_MATCH:
+ return afs_delete_address_pref(_preflist, i);
+ }
+ }
+
+ return -ENOANO;
+}
+
+/*
+ * Handle writes to /proc/fs/afs/addr_prefs
+ */
+int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size)
+{
+ struct afs_addr_preference_list *preflist, *old;
+ struct seq_file *m = file->private_data;
+ struct afs_net *net = afs_seq2net_single(m);
+ size_t psize;
+ char *argv[5];
+ int ret, argc, max_prefs;
+
+ inode_lock(file_inode(file));
+
+ /* Allocate a candidate new list and initialise it from the old. */
+ old = rcu_dereference_protected(net->address_prefs,
+ lockdep_is_held(&file_inode(file)->i_rwsem));
+
+ if (old)
+ max_prefs = old->nr + 1;
+ else
+ max_prefs = 1;
+
+ psize = struct_size(old, prefs, max_prefs);
+ psize = roundup_pow_of_two(psize);
+ max_prefs = min_t(size_t, (psize - sizeof(*old)) / sizeof(old->prefs[0]), 255);
+
+ ret = -ENOMEM;
+ preflist = kmalloc(struct_size(preflist, prefs, max_prefs), GFP_KERNEL);
+ if (!preflist)
+ goto done;
+
+ if (old)
+ memcpy(preflist, old, struct_size(preflist, prefs, old->nr));
+ else
+ memset(preflist, 0, sizeof(*preflist));
+ preflist->max_prefs = max_prefs;
+
+ do {
+ argc = afs_split_string(&buf, argv, ARRAY_SIZE(argv));
+ if (argc < 0)
+ return argc;
+ if (argc < 2)
+ goto inval;
+
+ if (strcmp(argv[0], "add") == 0)
+ ret = afs_add_address_pref(net, &preflist, argc - 1, argv + 1);
+ else if (strcmp(argv[0], "del") == 0)
+ ret = afs_del_address_pref(net, &preflist, argc - 1, argv + 1);
+ else
+ goto inval;
+ if (ret < 0)
+ goto done;
+ } while (*buf);
+
+ preflist->version++;
+ rcu_assign_pointer(net->address_prefs, preflist);
+ /* Store prefs before version */
+ smp_store_release(&net->address_pref_version, preflist->version);
+ kfree_rcu(old, rcu);
+ preflist = NULL;
+ ret = 0;
+
+done:
+ kfree(preflist);
+ inode_unlock(file_inode(file));
+ _leave(" = %d", ret);
+ return ret;
+
+inval:
+ pr_warn("Invalid Command\n");
+ ret = -EINVAL;
+ goto done;
+}
+
+/*
+ * Mark the priorities on an address list if the address preferences table has
+ * changed. The caller must hold the RCU read lock.
+ */
+void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist)
+{
+ const struct afs_addr_preference_list *preflist =
+ rcu_dereference(net->address_prefs);
+ const struct sockaddr_in6 *sin6;
+ const struct sockaddr_in *sin;
+ const struct sockaddr *sa;
+ struct afs_addr_preference test;
+ enum cmp_ret cmp;
+ int i, j;
+
+ if (!preflist || !preflist->nr || !alist->nr_addrs ||
+ smp_load_acquire(&alist->addr_pref_version) == preflist->version)
+ return;
+
+ test.family = AF_INET;
+ test.subnet_mask = 32;
+ test.prio = 0;
+ for (i = 0; i < alist->nr_ipv4; i++) {
+ sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer);
+ sin = (const struct sockaddr_in *)sa;
+ test.ipv4_addr = sin->sin_addr;
+ for (j = 0; j < preflist->ipv6_off; j++) {
+ cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]);
+ switch (cmp) {
+ case CONTINUE_SEARCH:
+ continue;
+ case INSERT_HERE:
+ break;
+ case EXACT_MATCH:
+ case SUBNET_MATCH:
+ WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio);
+ break;
+ }
+ }
+ }
+
+ test.family = AF_INET6;
+ test.subnet_mask = 128;
+ test.prio = 0;
+ for (; i < alist->nr_addrs; i++) {
+ sa = rxrpc_kernel_remote_addr(alist->addrs[i].peer);
+ sin6 = (const struct sockaddr_in6 *)sa;
+ test.ipv6_addr = sin6->sin6_addr;
+ for (j = preflist->ipv6_off; j < preflist->nr; j++) {
+ cmp = afs_cmp_address_pref(&test, &preflist->prefs[j]);
+ switch (cmp) {
+ case CONTINUE_SEARCH:
+ continue;
+ case INSERT_HERE:
+ break;
+ case EXACT_MATCH:
+ case SUBNET_MATCH:
+ WRITE_ONCE(alist->addrs[i].prio, preflist->prefs[j].prio);
+ break;
+ }
+ }
+ }
+
+ smp_store_release(&alist->addr_pref_version, preflist->version);
+}
+
+/*
+ * Mark the priorities on an address list if the address preferences table has
+ * changed. Avoid taking the RCU read lock if we can.
+ */
+void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist)
+{
+ if (!net->address_prefs ||
+ /* Load version before prefs */
+ smp_load_acquire(&net->address_pref_version) == alist->addr_pref_version)
+ return;
+
+ rcu_read_lock();
+ afs_get_address_preferences_rcu(net, alist);
+ rcu_read_unlock();
+}
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
index 81815724db6c..b488072aee87 100644
--- a/fs/afs/afs.h
+++ b/fs/afs/afs.h
@@ -165,7 +165,8 @@ struct afs_status_cb {
* AFS volume synchronisation information
*/
struct afs_volsync {
- time64_t creation; /* volume creation time */
+ time64_t creation; /* Volume creation time (or TIME64_MIN) */
+ time64_t update; /* Volume update time (or TIME64_MIN) */
};
/*
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index a484fa642808..99b2c8172021 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -33,22 +33,20 @@ void afs_invalidate_mmap_work(struct work_struct *work)
unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
}
-void afs_server_init_callback_work(struct work_struct *work)
+static void afs_volume_init_callback(struct afs_volume *volume)
{
- struct afs_server *server = container_of(work, struct afs_server, initcb_work);
struct afs_vnode *vnode;
- struct afs_cell *cell = server->cell;
- down_read(&cell->fs_open_mmaps_lock);
+ down_read(&volume->open_mmaps_lock);
- list_for_each_entry(vnode, &cell->fs_open_mmaps, cb_mmap_link) {
- if (vnode->cb_server == server) {
- clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ list_for_each_entry(vnode, &volume->open_mmaps, cb_mmap_link) {
+ if (vnode->cb_v_check != atomic_read(&volume->cb_v_break)) {
+ atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
queue_work(system_unbound_wq, &vnode->cb_work);
}
}
- up_read(&cell->fs_open_mmaps_lock);
+ up_read(&volume->open_mmaps_lock);
}
/*
@@ -57,15 +55,20 @@ void afs_server_init_callback_work(struct work_struct *work)
*/
void afs_init_callback_state(struct afs_server *server)
{
- rcu_read_lock();
- do {
- server->cb_s_break++;
- atomic_inc(&server->cell->fs_s_break);
- if (!list_empty(&server->cell->fs_open_mmaps))
- queue_work(system_unbound_wq, &server->initcb_work);
+ struct afs_server_entry *se;
- } while ((server = rcu_dereference(server->uuid_next)));
- rcu_read_unlock();
+ down_read(&server->cell->vs_lock);
+
+ list_for_each_entry(se, &server->volumes, slink) {
+ se->cb_expires_at = AFS_NO_CB_PROMISE;
+ se->volume->cb_expires_at = AFS_NO_CB_PROMISE;
+ trace_afs_cb_v_break(se->volume->vid, atomic_read(&se->volume->cb_v_break),
+ afs_cb_break_for_s_reinit);
+ if (!list_empty(&se->volume->open_mmaps))
+ afs_volume_init_callback(se->volume);
+ }
+
+ up_read(&server->cell->vs_lock);
}
/*
@@ -76,9 +79,9 @@ void __afs_break_callback(struct afs_vnode *vnode, enum afs_cb_break_reason reas
_enter("");
clear_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
- if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+ if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE) {
vnode->cb_break++;
- vnode->cb_v_break = vnode->volume->cb_v_break;
+ vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
afs_clear_permits(vnode);
if (vnode->lock_state == AFS_VNODE_LOCK_WAITING_FOR_CB)
@@ -110,13 +113,14 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell,
{
struct afs_volume *volume = NULL;
struct rb_node *p;
- int seq = 0;
+ int seq = 1;
- do {
+ for (;;) {
/* Unfortunately, rbtree walking doesn't give reliable results
* under just the RCU read lock, so we have to check for
* changes.
*/
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&cell->volume_lock, &seq);
p = rcu_dereference_raw(cell->volumes.rb_node);
@@ -132,35 +136,63 @@ static struct afs_volume *afs_lookup_volume_rcu(struct afs_cell *cell,
volume = NULL;
}
- } while (need_seqretry(&cell->volume_lock, seq));
+ if (volume && afs_try_get_volume(volume, afs_volume_trace_get_callback))
+ break;
+ if (!need_seqretry(&cell->volume_lock, seq))
+ break;
+ seq |= 1; /* Want a lock next time */
+ }
done_seqretry(&cell->volume_lock, seq);
return volume;
}
/*
+ * Allow the fileserver to break callbacks at the volume-level. This is
+ * typically done when, for example, a R/W volume is snapshotted to a R/O
+ * volume (the only way to change an R/O volume). It may also, however, happen
+ * when a volserver takes control of a volume (offlining it, moving it, etc.).
+ *
+ * Every file in that volume will need to be reevaluated.
+ */
+static void afs_break_volume_callback(struct afs_server *server,
+ struct afs_volume *volume)
+ __releases(RCU)
+{
+ struct afs_server_list *slist = rcu_dereference(volume->servers);
+ unsigned int i, cb_v_break;
+
+ write_lock(&volume->cb_v_break_lock);
+
+ for (i = 0; i < slist->nr_servers; i++)
+ if (slist->servers[i].server == server)
+ slist->servers[i].cb_expires_at = AFS_NO_CB_PROMISE;
+ volume->cb_expires_at = AFS_NO_CB_PROMISE;
+
+ cb_v_break = atomic_inc_return_release(&volume->cb_v_break);
+ trace_afs_cb_v_break(volume->vid, cb_v_break, afs_cb_break_for_volume_callback);
+
+ write_unlock(&volume->cb_v_break_lock);
+ rcu_read_unlock();
+
+ if (!list_empty(&volume->open_mmaps))
+ afs_volume_init_callback(volume);
+}
+
+/*
* allow the fileserver to explicitly break one callback
* - happens when
* - the backing file is changed
* - a lock is released
*/
-static void afs_break_one_callback(struct afs_volume *volume,
+static void afs_break_one_callback(struct afs_server *server,
+ struct afs_volume *volume,
struct afs_fid *fid)
{
struct super_block *sb;
struct afs_vnode *vnode;
struct inode *inode;
- if (fid->vnode == 0 && fid->unique == 0) {
- /* The callback break applies to an entire volume. */
- write_lock(&volume->cb_v_break_lock);
- volume->cb_v_break++;
- trace_afs_cb_break(fid, volume->cb_v_break,
- afs_cb_break_for_volume_callback, false);
- write_unlock(&volume->cb_v_break_lock);
- return;
- }
-
/* See if we can find a matching inode - even an I_NEW inode needs to
* be marked as it can have its callback broken before we finish
* setting up the local inode.
@@ -187,25 +219,35 @@ static void afs_break_some_callbacks(struct afs_server *server,
afs_volid_t vid = cbb->fid.vid;
size_t i;
+ rcu_read_lock();
volume = afs_lookup_volume_rcu(server->cell, vid);
+ if (cbb->fid.vnode == 0 && cbb->fid.unique == 0) {
+ afs_break_volume_callback(server, volume);
+ *_count -= 1;
+ if (*_count)
+ memmove(cbb, cbb + 1, sizeof(*cbb) * *_count);
+ } else {
+ /* TODO: Find all matching volumes if we couldn't match the server and
+ * break them anyway.
+ */
- /* TODO: Find all matching volumes if we couldn't match the server and
- * break them anyway.
- */
-
- for (i = *_count; i > 0; cbb++, i--) {
- if (cbb->fid.vid == vid) {
- _debug("- Fid { vl=%08llx n=%llu u=%u }",
- cbb->fid.vid,
- cbb->fid.vnode,
- cbb->fid.unique);
- --*_count;
- if (volume)
- afs_break_one_callback(volume, &cbb->fid);
- } else {
- *residue++ = *cbb;
+ for (i = *_count; i > 0; cbb++, i--) {
+ if (cbb->fid.vid == vid) {
+ _debug("- Fid { vl=%08llx n=%llu u=%u }",
+ cbb->fid.vid,
+ cbb->fid.vnode,
+ cbb->fid.unique);
+ --*_count;
+ if (volume)
+ afs_break_one_callback(server, volume, &cbb->fid);
+ } else {
+ *residue++ = *cbb;
+ }
}
+ rcu_read_unlock();
}
+
+ afs_put_volume(volume, afs_volume_trace_put_callback);
}
/*
@@ -218,11 +260,6 @@ void afs_break_callbacks(struct afs_server *server, size_t count,
ASSERT(server != NULL);
- rcu_read_lock();
-
while (count > 0)
afs_break_some_callbacks(server, callbacks, &count);
-
- rcu_read_unlock();
- return;
}
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 926cb1188eba..caa09875f520 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -161,13 +161,12 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
refcount_set(&cell->ref, 1);
atomic_set(&cell->active, 0);
INIT_WORK(&cell->manager, afs_manage_cell_work);
+ init_rwsem(&cell->vs_lock);
cell->volumes = RB_ROOT;
INIT_HLIST_HEAD(&cell->proc_volumes);
seqlock_init(&cell->volume_lock);
cell->fs_servers = RB_ROOT;
seqlock_init(&cell->fs_lock);
- INIT_LIST_HEAD(&cell->fs_open_mmaps);
- init_rwsem(&cell->fs_open_mmaps_lock);
rwlock_init(&cell->vl_servers_lock);
cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS);
@@ -817,7 +816,7 @@ done:
final_destruction:
/* The root volume is pinning the cell */
- afs_put_volume(cell->net, cell->root_volume, afs_volume_trace_put_cell_root);
+ afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root);
cell->root_volume = NULL;
afs_put_cell(cell, afs_cell_trace_put_destroy);
}
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index d4ddb20d6732..99a3f20bc786 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -146,10 +146,11 @@ static int afs_find_cm_server_by_peer(struct afs_call *call)
{
struct sockaddr_rxrpc srx;
struct afs_server *server;
+ struct rxrpc_peer *peer;
- rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
+ peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall);
- server = afs_find_server(call->net, &srx);
+ server = afs_find_server(call->net, peer);
if (!server) {
trace_afs_cm_no_server(call, &srx);
return 0;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 5219182e52e1..8a67fc427e74 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -124,7 +124,7 @@ static void afs_dir_read_cleanup(struct afs_read *req)
if (xas_retry(&xas, folio))
continue;
BUG_ON(xa_is_value(folio));
- ASSERTCMP(folio_file_mapping(folio), ==, mapping);
+ ASSERTCMP(folio->mapping, ==, mapping);
folio_put(folio);
}
@@ -202,12 +202,12 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
if (xas_retry(&xas, folio))
continue;
- BUG_ON(folio_file_mapping(folio) != mapping);
+ BUG_ON(folio->mapping != mapping);
size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio));
for (offset = 0; offset < size; offset += sizeof(*block)) {
block = kmap_local_folio(folio, offset);
- pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block);
+ pr_warn("[%02lx] %32phN\n", folio->index + offset, block);
kunmap_local(block);
}
}
@@ -233,7 +233,7 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
if (xas_retry(&xas, folio))
continue;
- BUG_ON(folio_file_mapping(folio) != mapping);
+ BUG_ON(folio->mapping != mapping);
if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) {
afs_dir_dump(dvnode, req);
@@ -474,6 +474,16 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
continue;
}
+ /* Don't expose silly rename entries to userspace. */
+ if (nlen > 6 &&
+ dire->u.name[0] == '.' &&
+ ctx->actor != afs_lookup_filldir &&
+ ctx->actor != afs_lookup_one_filldir &&
+ memcmp(dire->u.name, ".__afs", 6) == 0) {
+ ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
+ continue;
+ }
+
/* found the next entry */
if (!dir_emit(ctx, dire->u.name, nlen,
ntohl(dire->u.vnode),
@@ -693,8 +703,9 @@ static void afs_do_lookup_success(struct afs_operation *op)
vp = &op->file[0];
abort_code = vp->scb.status.abort_code;
if (abort_code != 0) {
- op->ac.abort_code = abort_code;
- op->error = afs_abort_to_error(abort_code);
+ op->call_abort_code = abort_code;
+ afs_op_set_error(op, afs_abort_to_error(abort_code));
+ op->cumul_error.abort_code = abort_code;
}
break;
@@ -707,6 +718,8 @@ static void afs_do_lookup_success(struct afs_operation *op)
break;
}
+ if (vp->scb.status.abort_code)
+ trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code);
if (!vp->scb.have_status && !vp->scb.have_error)
continue;
@@ -806,8 +819,8 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
cookie->fids[i].vid = dvnode->fid.vid;
cookie->ctx.actor = afs_lookup_filldir;
cookie->name = dentry->d_name;
- cookie->nr_fids = 2; /* slot 0 is saved for the fid we actually want
- * and slot 1 for the directory */
+ cookie->nr_fids = 2; /* slot 1 is saved for the fid we actually want
+ * and slot 0 for the directory */
if (!afs_server_supports_ibulk(dvnode))
cookie->one_only = true;
@@ -846,13 +859,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
_debug("nr_files %u", op->nr_files);
/* Need space for examining all the selected files */
- op->error = -ENOMEM;
if (op->nr_files > 2) {
op->more_files = kvcalloc(op->nr_files - 2,
sizeof(struct afs_vnode_param),
GFP_KERNEL);
- if (!op->more_files)
+ if (!op->more_files) {
+ afs_op_nomem(op);
goto out_op;
+ }
for (i = 2; i < op->nr_files; i++) {
vp = &op->more_files[i - 2];
@@ -878,14 +892,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
* lookups contained therein are stored in the reply without aborting
* the whole operation.
*/
- op->error = -ENOTSUPP;
+ afs_op_set_error(op, -ENOTSUPP);
if (!cookie->one_only) {
op->ops = &afs_inline_bulk_status_operation;
afs_begin_vnode_operation(op);
afs_wait_for_operation(op);
}
- if (op->error == -ENOTSUPP) {
+ if (afs_op_error(op) == -ENOTSUPP) {
/* We could try FS.BulkStatus next, but this aborts the entire
* op if any of the lookups fails - so, for the moment, revert
* to FS.FetchStatus for op->file[1].
@@ -895,12 +909,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
afs_begin_vnode_operation(op);
afs_wait_for_operation(op);
}
- inode = ERR_PTR(op->error);
out_op:
- if (op->error == 0) {
- inode = &op->file[1].vnode->netfs.inode;
- op->file[1].vnode = NULL;
+ if (!afs_op_error(op)) {
+ if (op->file[1].scb.status.abort_code) {
+ afs_op_accumulate_error(op, -ECONNABORTED,
+ op->file[1].scb.status.abort_code);
+ } else {
+ inode = &op->file[1].vnode->netfs.inode;
+ op->file[1].vnode = NULL;
+ }
}
if (op->file[0].scb.have_status)
@@ -1116,7 +1134,12 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags)
dir = AFS_FS_I(d_inode(parent));
/* validate the parent directory */
- afs_validate(dir, key);
+ ret = afs_validate(dir, key);
+ if (ret == -ERESTARTSYS) {
+ dput(parent);
+ key_put(key);
+ return ret;
+ }
if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
_debug("%pd: parent dir deleted", dentry);
@@ -1255,9 +1278,10 @@ void afs_check_for_remote_deletion(struct afs_operation *op)
{
struct afs_vnode *vnode = op->file[0].vnode;
- switch (op->ac.abort_code) {
+ switch (afs_op_abort_code(op)) {
case VNOVNODE:
set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ clear_nlink(&vnode->netfs.inode);
afs_break_callback(vnode, afs_cb_break_for_deleted);
}
}
@@ -1273,20 +1297,20 @@ static void afs_vnode_new_inode(struct afs_operation *op)
_enter("");
- ASSERTCMP(op->error, ==, 0);
+ ASSERTCMP(afs_op_error(op), ==, 0);
inode = afs_iget(op, vp);
if (IS_ERR(inode)) {
/* ENOMEM or EINTR at a really inconvenient time - just abandon
* the new directory on the server.
*/
- op->error = PTR_ERR(inode);
+ afs_op_accumulate_error(op, PTR_ERR(inode), 0);
return;
}
vnode = AFS_FS_I(inode);
set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
- if (!op->error)
+ if (!afs_op_error(op))
afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb);
d_instantiate(op->dentry, inode);
}
@@ -1320,7 +1344,7 @@ static void afs_create_put(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- if (op->error)
+ if (afs_op_error(op))
d_drop(op->dentry);
}
@@ -1373,7 +1397,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry)
clear_nlink(&vnode->netfs.inode);
set_bit(AFS_VNODE_DELETED, &vnode->flags);
- clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags);
}
}
@@ -1480,7 +1504,7 @@ static void afs_dir_remove_link(struct afs_operation *op)
struct dentry *dentry = op->dentry;
int ret;
- if (op->error != 0 ||
+ if (afs_op_error(op) ||
(op->file[1].scb.have_status && op->file[1].scb.have_error))
return;
if (d_really_is_positive(dentry))
@@ -1504,10 +1528,10 @@ static void afs_dir_remove_link(struct afs_operation *op)
ret = afs_validate(vnode, op->key);
if (ret != -ESTALE)
- op->error = ret;
+ afs_op_set_error(op, ret);
}
- _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error);
+ _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, afs_op_error(op));
}
static void afs_unlink_success(struct afs_operation *op)
@@ -1538,7 +1562,7 @@ static void afs_unlink_edit_dir(struct afs_operation *op)
static void afs_unlink_put(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- if (op->unlink.need_rehash && op->error < 0 && op->error != -ENOENT)
+ if (op->unlink.need_rehash && afs_op_error(op) < 0 && afs_op_error(op) != -ENOENT)
d_rehash(op->dentry);
}
@@ -1579,7 +1603,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
/* Try to make sure we have a callback promise on the victim. */
ret = afs_validate(vnode, op->key);
if (ret < 0) {
- op->error = ret;
+ afs_op_set_error(op, ret);
goto error;
}
@@ -1588,7 +1612,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
spin_unlock(&dentry->d_lock);
/* Start asynchronous writeout of the inode */
write_inode_now(d_inode(dentry), 0);
- op->error = afs_sillyrename(dvnode, vnode, dentry, op->key);
+ afs_op_set_error(op, afs_sillyrename(dvnode, vnode, dentry, op->key));
goto error;
}
if (!d_unhashed(dentry)) {
@@ -1609,7 +1633,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
/* If there was a conflict with a third party, check the status of the
* unlinked vnode.
*/
- if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ if (afs_op_error(op) == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
op->file[1].update_ctime = false;
op->fetch_status.which = 1;
op->ops = &afs_fetch_status_operation;
@@ -1691,7 +1715,7 @@ static void afs_link_success(struct afs_operation *op)
static void afs_link_put(struct afs_operation *op)
{
_enter("op=%08x", op->debug_id);
- if (op->error)
+ if (afs_op_error(op))
d_drop(op->dentry);
}
@@ -1889,7 +1913,7 @@ static void afs_rename_put(struct afs_operation *op)
if (op->rename.rehash)
d_rehash(op->rename.rehash);
dput(op->rename.tmp);
- if (op->error)
+ if (afs_op_error(op))
d_rehash(op->dentry);
}
@@ -1934,7 +1958,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
return PTR_ERR(op);
ret = afs_validate(vnode, op->key);
- op->error = ret;
+ afs_op_set_error(op, ret);
if (ret < 0)
goto error;
@@ -1971,7 +1995,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
op->rename.tmp = d_alloc(new_dentry->d_parent,
&new_dentry->d_name);
if (!op->rename.tmp) {
- op->error = -ENOMEM;
+ afs_op_nomem(op);
goto error;
}
@@ -1979,7 +2003,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
AFS_FS_I(d_inode(new_dentry)),
new_dentry, op->key);
if (ret) {
- op->error = ret;
+ afs_op_set_error(op, ret);
goto error;
}
@@ -2014,7 +2038,7 @@ static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags)
{
struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio));
- _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio));
+ _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio->index);
folio_detach_private(folio);
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index bb5807e87fa4..a1e581946b93 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -218,7 +218,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
/* If there was a conflict with a third party, check the status of the
* unlinked vnode.
*/
- if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ if (op->cumul_error.error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
op->file[1].update_ctime = false;
op->fetch_status.which = 1;
op->ops = &afs_fetch_status_operation;
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 1f656005018e..c4d2711e20ad 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -76,7 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
/* there shouldn't be an existing inode */
BUG_ON(!(inode->i_state & I_NEW));
- netfs_inode_init(&vnode->netfs, NULL);
+ netfs_inode_init(&vnode->netfs, NULL, false);
inode->i_size = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
if (root) {
@@ -258,16 +258,7 @@ const struct inode_operations afs_dynroot_inode_operations = {
.lookup = afs_dynroot_lookup,
};
-/*
- * Dirs in the dynamic root don't need revalidation.
- */
-static int afs_dynroot_d_revalidate(struct dentry *dentry, unsigned int flags)
-{
- return 1;
-}
-
const struct dentry_operations afs_dynroot_dentry_operations = {
- .d_revalidate = afs_dynroot_d_revalidate,
.d_delete = always_delete_dentry,
.d_release = afs_d_release,
.d_automount = afs_d_automount,
@@ -373,7 +364,7 @@ error:
void afs_dynroot_depopulate(struct super_block *sb)
{
struct afs_net *net = afs_sb2net(sb);
- struct dentry *root = sb->s_root, *subdir, *tmp;
+ struct dentry *root = sb->s_root, *subdir;
/* Prevent more subdirs from being created */
mutex_lock(&net->proc_cells_lock);
@@ -382,10 +373,11 @@ void afs_dynroot_depopulate(struct super_block *sb)
mutex_unlock(&net->proc_cells_lock);
if (root) {
+ struct hlist_node *n;
inode_lock(root->d_inode);
/* Remove all the pins for dirs created for manually added cells */
- list_for_each_entry_safe(subdir, tmp, &root->d_subdirs, d_child) {
+ hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) {
if (subdir->d_fsdata) {
subdir->d_fsdata = NULL;
dput(subdir);
diff --git a/fs/afs/file.c b/fs/afs/file.c
index d37dd201752b..ef2cc8f565d2 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -20,9 +20,6 @@
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
static int afs_symlink_read_folio(struct file *file, struct folio *folio);
-static void afs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length);
-static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags);
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter);
static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
@@ -37,7 +34,7 @@ const struct file_operations afs_file_operations = {
.release = afs_release,
.llseek = generic_file_llseek,
.read_iter = afs_file_read_iter,
- .write_iter = afs_file_write,
+ .write_iter = netfs_file_write_iter,
.mmap = afs_file_mmap,
.splice_read = afs_file_splice_read,
.splice_write = iter_file_splice_write,
@@ -53,22 +50,21 @@ const struct inode_operations afs_file_inode_operations = {
};
const struct address_space_operations afs_file_aops = {
+ .direct_IO = noop_direct_IO,
.read_folio = netfs_read_folio,
.readahead = netfs_readahead,
- .dirty_folio = afs_dirty_folio,
- .launder_folio = afs_launder_folio,
- .release_folio = afs_release_folio,
- .invalidate_folio = afs_invalidate_folio,
- .write_begin = afs_write_begin,
- .write_end = afs_write_end,
- .writepages = afs_writepages,
+ .dirty_folio = netfs_dirty_folio,
+ .launder_folio = netfs_launder_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
+ .writepages = afs_writepages,
};
const struct address_space_operations afs_symlink_aops = {
.read_folio = afs_symlink_read_folio,
- .release_folio = afs_release_folio,
- .invalidate_folio = afs_invalidate_folio,
+ .release_folio = netfs_release_folio,
+ .invalidate_folio = netfs_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
};
@@ -243,12 +239,9 @@ static void afs_fetch_data_notify(struct afs_operation *op)
{
struct afs_read *req = op->fetch.req;
struct netfs_io_subrequest *subreq = req->subreq;
- int error = op->error;
+ int error = afs_op_error(op);
- if (error == -ECONNABORTED)
- error = afs_abort_to_error(op->ac.abort_code);
req->error = error;
-
if (subreq) {
__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
netfs_subreq_terminated(subreq, error ?: req->actual_len, false);
@@ -271,7 +264,7 @@ static void afs_fetch_data_success(struct afs_operation *op)
static void afs_fetch_data_put(struct afs_operation *op)
{
- op->fetch.req->error = op->error;
+ op->fetch.req->error = afs_op_error(op);
afs_put_read(op->fetch.req);
}
@@ -326,11 +319,7 @@ static void afs_issue_read(struct netfs_io_subrequest *subreq)
fsreq->len = subreq->len - subreq->transferred;
fsreq->key = key_get(subreq->rreq->netfs_priv);
fsreq->vnode = vnode;
- fsreq->iter = &fsreq->def_iter;
-
- iov_iter_xarray(&fsreq->def_iter, ITER_DEST,
- &fsreq->vnode->netfs.inode.i_mapping->i_pages,
- fsreq->pos, fsreq->len);
+ fsreq->iter = &subreq->io_iter;
afs_fetch_data(fsreq->vnode, fsreq);
afs_put_read(fsreq);
@@ -362,22 +351,13 @@ static int afs_symlink_read_folio(struct file *file, struct folio *folio)
static int afs_init_request(struct netfs_io_request *rreq, struct file *file)
{
- rreq->netfs_priv = key_get(afs_file_key(file));
+ if (file)
+ rreq->netfs_priv = key_get(afs_file_key(file));
+ rreq->rsize = 256 * 1024;
+ rreq->wsize = 256 * 1024;
return 0;
}
-static int afs_begin_cache_operation(struct netfs_io_request *rreq)
-{
-#ifdef CONFIG_AFS_FSCACHE
- struct afs_vnode *vnode = AFS_FS_I(rreq->inode);
-
- return fscache_begin_read_operation(&rreq->cache_resources,
- afs_vnode_cache(vnode));
-#else
- return -ENOBUFS;
-#endif
-}
-
static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
struct folio **foliop, void **_fsdata)
{
@@ -391,153 +371,65 @@ static void afs_free_request(struct netfs_io_request *rreq)
key_put(rreq->netfs_priv);
}
-const struct netfs_request_ops afs_req_ops = {
- .init_request = afs_init_request,
- .free_request = afs_free_request,
- .begin_cache_operation = afs_begin_cache_operation,
- .check_write_begin = afs_check_write_begin,
- .issue_read = afs_issue_read,
-};
-
-int afs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode)));
- return 0;
-}
-
-/*
- * Adjust the dirty region of the page on truncation or full invalidation,
- * getting rid of the markers altogether if the region is entirely invalidated.
- */
-static void afs_invalidate_dirty(struct folio *folio, size_t offset,
- size_t length)
+static void afs_update_i_size(struct inode *inode, loff_t new_i_size)
{
- struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
- unsigned long priv;
- unsigned int f, t, end = offset + length;
-
- priv = (unsigned long)folio_get_private(folio);
-
- /* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == folio_size(folio))
- goto full_invalidate;
-
- /* If the page was dirtied by page_mkwrite(), the PTE stays writable
- * and we don't get another notification to tell us to expand it
- * again.
- */
- if (afs_is_folio_dirty_mmapped(priv))
- return;
-
- /* We may need to shorten the dirty region */
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
-
- if (t <= offset || f >= end)
- return; /* Doesn't overlap */
-
- if (f < offset && t > end)
- return; /* Splits the dirty region - just absorb it */
-
- if (f >= offset && t <= end)
- goto undirty;
+ struct afs_vnode *vnode = AFS_FS_I(inode);
+ loff_t i_size;
- if (f < offset)
- t = offset;
- else
- f = end;
- if (f == t)
- goto undirty;
-
- priv = afs_folio_dirty(folio, f, t);
- folio_change_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("trunc"), folio);
- return;
-
-undirty:
- trace_afs_folio_dirty(vnode, tracepoint_string("undirty"), folio);
- folio_clear_dirty_for_io(folio);
-full_invalidate:
- trace_afs_folio_dirty(vnode, tracepoint_string("inval"), folio);
- folio_detach_private(folio);
+ write_seqlock(&vnode->cb_lock);
+ i_size = i_size_read(&vnode->netfs.inode);
+ if (new_i_size > i_size) {
+ i_size_write(&vnode->netfs.inode, new_i_size);
+ inode_set_bytes(&vnode->netfs.inode, new_i_size);
+ }
+ write_sequnlock(&vnode->cb_lock);
+ fscache_update_cookie(afs_vnode_cache(vnode), NULL, &new_i_size);
}
-/*
- * invalidate part or all of a page
- * - release a page and clean up its private data if offset is 0 (indicating
- * the entire page)
- */
-static void afs_invalidate_folio(struct folio *folio, size_t offset,
- size_t length)
+static void afs_netfs_invalidate_cache(struct netfs_io_request *wreq)
{
- _enter("{%lu},%zu,%zu", folio->index, offset, length);
-
- BUG_ON(!folio_test_locked(folio));
+ struct afs_vnode *vnode = AFS_FS_I(wreq->inode);
- if (folio_get_private(folio))
- afs_invalidate_dirty(folio, offset, length);
-
- folio_wait_fscache(folio);
- _leave("");
+ afs_invalidate_cache(vnode, 0);
}
-/*
- * release a page and clean up its private state if it's not busy
- * - return true if the page can now be released, false if not
- */
-static bool afs_release_folio(struct folio *folio, gfp_t gfp)
-{
- struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
-
- _enter("{{%llx:%llu}[%lu],%lx},%x",
- vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags,
- gfp);
-
- /* deny if folio is being written to the cache and the caller hasn't
- * elected to wait */
-#ifdef CONFIG_AFS_FSCACHE
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- fscache_note_page_release(afs_vnode_cache(vnode));
-#endif
-
- if (folio_test_private(folio)) {
- trace_afs_folio_dirty(vnode, tracepoint_string("rel"), folio);
- folio_detach_private(folio);
- }
-
- /* Indicate that the folio can be released */
- _leave(" = T");
- return true;
-}
+const struct netfs_request_ops afs_req_ops = {
+ .init_request = afs_init_request,
+ .free_request = afs_free_request,
+ .check_write_begin = afs_check_write_begin,
+ .issue_read = afs_issue_read,
+ .update_i_size = afs_update_i_size,
+ .invalidate_cache = afs_netfs_invalidate_cache,
+ .create_write_requests = afs_create_write_requests,
+};
static void afs_add_open_mmap(struct afs_vnode *vnode)
{
if (atomic_inc_return(&vnode->cb_nr_mmap) == 1) {
- down_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ down_write(&vnode->volume->open_mmaps_lock);
if (list_empty(&vnode->cb_mmap_link))
- list_add_tail(&vnode->cb_mmap_link,
- &vnode->volume->cell->fs_open_mmaps);
+ list_add_tail(&vnode->cb_mmap_link, &vnode->volume->open_mmaps);
- up_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ up_write(&vnode->volume->open_mmaps_lock);
}
}
static void afs_drop_open_mmap(struct afs_vnode *vnode)
{
- if (!atomic_dec_and_test(&vnode->cb_nr_mmap))
+ if (atomic_add_unless(&vnode->cb_nr_mmap, -1, 1))
return;
- down_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ down_write(&vnode->volume->open_mmaps_lock);
- if (atomic_read(&vnode->cb_nr_mmap) == 0)
+ read_seqlock_excl(&vnode->cb_lock);
+ // the only place where ->cb_nr_mmap may hit 0
+ // see __afs_break_callback() for the other side...
+ if (atomic_dec_and_test(&vnode->cb_nr_mmap))
list_del_init(&vnode->cb_mmap_link);
+ read_sequnlock_excl(&vnode->cb_lock);
- up_write(&vnode->volume->cell->fs_open_mmaps_lock);
+ up_write(&vnode->volume->open_mmaps_lock);
flush_work(&vnode->cb_work);
}
@@ -573,35 +465,46 @@ static vm_fault_t afs_vm_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pg
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(vmf->vma->vm_file));
- if (afs_pagecache_valid(vnode))
+ if (afs_check_validity(vnode))
return filemap_map_pages(vmf, start_pgoff, end_pgoff);
return 0;
}
static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_file *af = iocb->ki_filp->private_data;
- int ret;
+ ssize_t ret;
- ret = afs_validate(vnode, af->key);
+ if (iocb->ki_flags & IOCB_DIRECT)
+ return netfs_unbuffered_read_iter(iocb, iter);
+
+ ret = netfs_start_io_read(inode);
if (ret < 0)
return ret;
-
- return generic_file_read_iter(iocb, iter);
+ ret = afs_validate(vnode, af->key);
+ if (ret == 0)
+ ret = filemap_read(iocb, iter, 0);
+ netfs_end_io_read(inode);
+ return ret;
}
static ssize_t afs_file_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe,
size_t len, unsigned int flags)
{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(in));
+ struct inode *inode = file_inode(in);
+ struct afs_vnode *vnode = AFS_FS_I(inode);
struct afs_file *af = in->private_data;
- int ret;
+ ssize_t ret;
- ret = afs_validate(vnode, af->key);
+ ret = netfs_start_io_read(inode);
if (ret < 0)
return ret;
-
- return filemap_splice_read(in, ppos, pipe, len, flags);
+ ret = afs_validate(vnode, af->key);
+ if (ret == 0)
+ ret = filemap_splice_read(in, ppos, pipe, len, flags);
+ netfs_end_io_read(inode);
+ return ret;
}
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 7a3803ce3a22..3546b087e791 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -35,13 +35,15 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo
key_get(key);
}
- op->key = key;
- op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op);
- op->net = volume->cell->net;
- op->cb_v_break = volume->cb_v_break;
- op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
- op->error = -EDESTADDRREQ;
- op->ac.error = SHRT_MAX;
+ op->key = key;
+ op->volume = afs_get_volume(volume, afs_volume_trace_get_new_op);
+ op->net = volume->cell->net;
+ op->cb_v_break = atomic_read(&volume->cb_v_break);
+ op->pre_volsync.creation = volume->creation_time;
+ op->pre_volsync.update = volume->update_time;
+ op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
+ op->nr_iterations = -1;
+ afs_op_set_error(op, -EDESTADDRREQ);
_leave(" = [op=%08x]", op->debug_id);
return op;
@@ -71,7 +73,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
swap(vnode, vnode2);
if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
- op->error = -ERESTARTSYS;
+ afs_op_set_error(op, -ERESTARTSYS);
op->flags |= AFS_OPERATION_STOP;
_leave(" = f [I 0]");
return false;
@@ -80,7 +82,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
if (vnode2) {
if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) {
- op->error = -ERESTARTSYS;
+ afs_op_set_error(op, -ERESTARTSYS);
op->flags |= AFS_OPERATION_STOP;
mutex_unlock(&vnode->io_lock);
op->flags &= ~AFS_OPERATION_LOCK_0;
@@ -147,7 +149,7 @@ bool afs_begin_vnode_operation(struct afs_operation *op)
afs_prepare_vnode(op, &op->file[0], 0);
afs_prepare_vnode(op, &op->file[1], 1);
- op->cb_v_break = op->volume->cb_v_break;
+ op->cb_v_break = atomic_read(&op->volume->cb_v_break);
_leave(" = true");
return true;
}
@@ -159,16 +161,16 @@ static void afs_end_vnode_operation(struct afs_operation *op)
{
_enter("");
- if (op->error == -EDESTADDRREQ ||
- op->error == -EADDRNOTAVAIL ||
- op->error == -ENETUNREACH ||
- op->error == -EHOSTUNREACH)
+ switch (afs_op_error(op)) {
+ case -EDESTADDRREQ:
+ case -EADDRNOTAVAIL:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
afs_dump_edestaddrreq(op);
+ break;
+ }
afs_drop_io_locks(op);
-
- if (op->error == -ECONNABORTED)
- op->error = afs_abort_to_error(op->ac.abort_code);
}
/*
@@ -179,37 +181,43 @@ void afs_wait_for_operation(struct afs_operation *op)
_enter("");
while (afs_select_fileserver(op)) {
- op->cb_s_break = op->server->cb_s_break;
+ op->call_responded = false;
+ op->call_error = 0;
+ op->call_abort_code = 0;
if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) &&
op->ops->issue_yfs_rpc)
op->ops->issue_yfs_rpc(op);
else if (op->ops->issue_afs_rpc)
op->ops->issue_afs_rpc(op);
else
- op->ac.error = -ENOTSUPP;
-
- if (op->call)
- op->error = afs_wait_for_call_to_complete(op->call, &op->ac);
+ op->call_error = -ENOTSUPP;
+
+ if (op->call) {
+ afs_wait_for_call_to_complete(op->call);
+ op->call_abort_code = op->call->abort_code;
+ op->call_error = op->call->error;
+ op->call_responded = op->call->responded;
+ afs_put_call(op->call);
+ }
}
- switch (op->error) {
- case 0:
+ if (op->call_responded)
+ set_bit(AFS_SERVER_FL_RESPONDING, &op->server->flags);
+
+ if (!afs_op_error(op)) {
_debug("success");
op->ops->success(op);
- break;
- case -ECONNABORTED:
+ } else if (op->cumul_error.aborted) {
if (op->ops->aborted)
op->ops->aborted(op);
- fallthrough;
- default:
+ } else {
if (op->ops->failed)
op->ops->failed(op);
- break;
}
afs_end_vnode_operation(op);
- if (op->error == 0 && op->ops->edit_dir) {
+ if (!afs_op_error(op) && op->ops->edit_dir) {
_debug("edit_dir");
op->ops->edit_dir(op);
}
@@ -221,7 +229,8 @@ void afs_wait_for_operation(struct afs_operation *op)
*/
int afs_put_operation(struct afs_operation *op)
{
- int i, ret = op->error;
+ struct afs_addr_list *alist;
+ int i, ret = afs_op_error(op);
_enter("op=%08x,%d", op->debug_id, ret);
@@ -243,9 +252,19 @@ int afs_put_operation(struct afs_operation *op)
kfree(op->more_files);
}
- afs_end_cursor(&op->ac);
+ if (op->estate) {
+ alist = op->estate->addresses;
+ if (alist) {
+ if (op->call_responded &&
+ op->addr_index != alist->preferred &&
+ test_bit(alist->preferred, &op->addr_tried))
+ WRITE_ONCE(alist->preferred, op->addr_index);
+ }
+ }
+
+ afs_clear_server_states(op);
afs_put_serverlist(op->net, op->server_list);
- afs_put_volume(op->net, op->volume, afs_volume_trace_put_put_op);
+ afs_put_volume(op->volume, afs_volume_trace_put_put_op);
key_put(op->key);
kfree(op);
return ret;
diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
index daaf3810cc92..580de4adaaf6 100644
--- a/fs/afs/fs_probe.c
+++ b/fs/afs/fs_probe.c
@@ -15,6 +15,42 @@
static unsigned int afs_fs_probe_fast_poll_interval = 30 * HZ;
static unsigned int afs_fs_probe_slow_poll_interval = 5 * 60 * HZ;
+struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate,
+ enum afs_estate_trace where)
+{
+ if (estate) {
+ int r;
+
+ __refcount_inc(&estate->ref, &r);
+ trace_afs_estate(estate->server_id, estate->probe_seq, r, where);
+ }
+ return estate;
+}
+
+static void afs_endpoint_state_rcu(struct rcu_head *rcu)
+{
+ struct afs_endpoint_state *estate = container_of(rcu, struct afs_endpoint_state, rcu);
+
+ trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
+ afs_estate_trace_free);
+ afs_put_addrlist(estate->addresses, afs_alist_trace_put_estate);
+ kfree(estate);
+}
+
+void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where)
+{
+ if (estate) {
+ unsigned int server_id = estate->server_id, probe_seq = estate->probe_seq;
+ bool dead;
+ int r;
+
+ dead = __refcount_dec_and_test(&estate->ref, &r);
+ trace_afs_estate(server_id, probe_seq, r, where);
+ if (dead)
+ call_rcu(&estate->rcu, afs_endpoint_state_rcu);
+ }
+}
+
/*
* Start the probe polling timer. We have to supply it with an inc on the
* outstanding server count.
@@ -38,9 +74,10 @@ static void afs_schedule_fs_probe(struct afs_net *net,
/*
* Handle the completion of a set of probes.
*/
-static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server)
+static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server,
+ struct afs_endpoint_state *estate)
{
- bool responded = server->probe.responded;
+ bool responded = test_bit(AFS_ESTATE_RESPONDED, &estate->flags);
write_seqlock(&net->fs_lock);
if (responded) {
@@ -50,6 +87,7 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server
clear_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
list_add_tail(&server->probe_link, &net->fs_probe_fast);
}
+
write_sequnlock(&net->fs_lock);
afs_schedule_fs_probe(net, server, !responded);
@@ -58,12 +96,13 @@ static void afs_finished_fs_probe(struct afs_net *net, struct afs_server *server
/*
* Handle the completion of a probe.
*/
-static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server)
+static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server,
+ struct afs_endpoint_state *estate)
{
_enter("");
- if (atomic_dec_and_test(&server->probe_outstanding))
- afs_finished_fs_probe(net, server);
+ if (atomic_dec_and_test(&estate->nr_probing))
+ afs_finished_fs_probe(net, server, estate);
wake_up_all(&server->probe_wq);
}
@@ -74,24 +113,22 @@ static void afs_done_one_fs_probe(struct afs_net *net, struct afs_server *server
*/
static void afs_fs_probe_not_done(struct afs_net *net,
struct afs_server *server,
- struct afs_addr_cursor *ac)
+ struct afs_endpoint_state *estate,
+ int index)
{
- struct afs_addr_list *alist = ac->alist;
- unsigned int index = ac->index;
-
_enter("");
trace_afs_io_error(0, -ENOMEM, afs_io_error_fs_probe_fail);
spin_lock(&server->probe_lock);
- server->probe.local_failure = true;
- if (server->probe.error == 0)
- server->probe.error = -ENOMEM;
+ set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags);
+ if (estate->error == 0)
+ estate->error = -ENOMEM;
- set_bit(index, &alist->failed);
+ set_bit(index, &estate->failed_set);
spin_unlock(&server->probe_lock);
- return afs_done_one_fs_probe(net, server);
+ return afs_done_one_fs_probe(net, server, estate);
}
/*
@@ -100,30 +137,34 @@ static void afs_fs_probe_not_done(struct afs_net *net,
*/
void afs_fileserver_probe_result(struct afs_call *call)
{
- struct afs_addr_list *alist = call->alist;
+ struct afs_endpoint_state *estate = call->probe;
+ struct afs_addr_list *alist = estate->addresses;
+ struct afs_address *addr = &alist->addrs[call->probe_index];
struct afs_server *server = call->server;
- unsigned int index = call->addr_ix;
- unsigned int rtt_us = 0, cap0;
+ unsigned int index = call->probe_index;
+ unsigned int rtt_us = -1, cap0;
int ret = call->error;
_enter("%pU,%u", &server->uuid, index);
+ WRITE_ONCE(addr->last_error, ret);
+
spin_lock(&server->probe_lock);
switch (ret) {
case 0:
- server->probe.error = 0;
+ estate->error = 0;
goto responded;
case -ECONNABORTED:
- if (!server->probe.responded) {
- server->probe.abort_code = call->abort_code;
- server->probe.error = ret;
+ if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags)) {
+ estate->abort_code = call->abort_code;
+ estate->error = ret;
}
goto responded;
case -ENOMEM:
case -ENONET:
- clear_bit(index, &alist->responded);
- server->probe.local_failure = true;
+ clear_bit(index, &estate->responsive_set);
+ set_bit(AFS_ESTATE_LOCAL_FAILURE, &estate->flags);
trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
goto out;
case -ECONNRESET: /* Responded, but call expired. */
@@ -136,29 +177,29 @@ void afs_fileserver_probe_result(struct afs_call *call)
case -ETIMEDOUT:
case -ETIME:
default:
- clear_bit(index, &alist->responded);
- set_bit(index, &alist->failed);
- if (!server->probe.responded &&
- (server->probe.error == 0 ||
- server->probe.error == -ETIMEDOUT ||
- server->probe.error == -ETIME))
- server->probe.error = ret;
+ clear_bit(index, &estate->responsive_set);
+ set_bit(index, &estate->failed_set);
+ if (!test_bit(AFS_ESTATE_RESPONDED, &estate->flags) &&
+ (estate->error == 0 ||
+ estate->error == -ETIMEDOUT ||
+ estate->error == -ETIME))
+ estate->error = ret;
trace_afs_io_error(call->debug_id, ret, afs_io_error_fs_probe_fail);
goto out;
}
responded:
- clear_bit(index, &alist->failed);
+ clear_bit(index, &estate->failed_set);
if (call->service_id == YFS_FS_SERVICE) {
- server->probe.is_yfs = true;
+ set_bit(AFS_ESTATE_IS_YFS, &estate->flags);
set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
- alist->addrs[index].srx_service = call->service_id;
+ server->service_id = call->service_id;
} else {
- server->probe.not_yfs = true;
- if (!server->probe.is_yfs) {
+ set_bit(AFS_ESTATE_NOT_YFS, &estate->flags);
+ if (!test_bit(AFS_ESTATE_IS_YFS, &estate->flags)) {
clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
- alist->addrs[index].srx_service = call->service_id;
+ server->service_id = call->service_id;
}
cap0 = ntohl(call->tmp);
if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES)
@@ -167,116 +208,136 @@ responded:
clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
}
- rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
- if (rtt_us < server->probe.rtt) {
- server->probe.rtt = rtt_us;
+ rtt_us = rxrpc_kernel_get_srtt(addr->peer);
+ if (rtt_us < estate->rtt) {
+ estate->rtt = rtt_us;
server->rtt = rtt_us;
alist->preferred = index;
}
smp_wmb(); /* Set rtt before responded. */
- server->probe.responded = true;
- set_bit(index, &alist->responded);
+ set_bit(AFS_ESTATE_RESPONDED, &estate->flags);
+ set_bit(index, &estate->responsive_set);
set_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
out:
spin_unlock(&server->probe_lock);
- _debug("probe %pU [%u] %pISpc rtt=%u ret=%d",
- &server->uuid, index, &alist->addrs[index].transport,
+ trace_afs_fs_probe(server, false, estate, index, call->error, call->abort_code, rtt_us);
+ _debug("probe[%x] %pU [%u] %pISpc rtt=%d ret=%d",
+ estate->probe_seq, &server->uuid, index,
+ rxrpc_kernel_remote_addr(alist->addrs[index].peer),
rtt_us, ret);
- return afs_done_one_fs_probe(call->net, server);
+ return afs_done_one_fs_probe(call->net, server, estate);
}
/*
- * Probe one or all of a fileserver's addresses to find out the best route and
- * to query its capabilities.
+ * Probe all of a fileserver's addresses to find out the best route and to
+ * query its capabilities.
*/
void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
- struct key *key, bool all)
+ struct afs_addr_list *new_alist, struct key *key)
{
- struct afs_addr_cursor ac = {
- .index = 0,
- };
+ struct afs_endpoint_state *estate, *old;
+ struct afs_addr_list *alist;
+ unsigned long unprobed;
_enter("%pU", &server->uuid);
- read_lock(&server->fs_lock);
- ac.alist = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&server->fs_lock));
- afs_get_addrlist(ac.alist);
- read_unlock(&server->fs_lock);
+ estate = kzalloc(sizeof(*estate), GFP_KERNEL);
+ if (!estate)
+ return;
+
+ refcount_set(&estate->ref, 1);
+ estate->server_id = server->debug_id;
+ estate->rtt = UINT_MAX;
+
+ write_lock(&server->fs_lock);
+
+ old = rcu_dereference_protected(server->endpoint_state,
+ lockdep_is_held(&server->fs_lock));
+ estate->responsive_set = old->responsive_set;
+ estate->addresses = afs_get_addrlist(new_alist ?: old->addresses,
+ afs_alist_trace_get_estate);
+ alist = estate->addresses;
+ estate->probe_seq = ++server->probe_counter;
+ atomic_set(&estate->nr_probing, alist->nr_addrs);
+
+ rcu_assign_pointer(server->endpoint_state, estate);
+ set_bit(AFS_ESTATE_SUPERSEDED, &old->flags);
+ write_unlock(&server->fs_lock);
+
+ trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
+ afs_estate_trace_alloc_probe);
+
+ afs_get_address_preferences(net, alist);
server->probed_at = jiffies;
- atomic_set(&server->probe_outstanding, all ? ac.alist->nr_addrs : 1);
- memset(&server->probe, 0, sizeof(server->probe));
- server->probe.rtt = UINT_MAX;
-
- ac.index = ac.alist->preferred;
- if (ac.index < 0 || ac.index >= ac.alist->nr_addrs)
- all = true;
-
- if (all) {
- for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++)
- if (!afs_fs_get_capabilities(net, server, &ac, key))
- afs_fs_probe_not_done(net, server, &ac);
- } else {
- if (!afs_fs_get_capabilities(net, server, &ac, key))
- afs_fs_probe_not_done(net, server, &ac);
+ unprobed = (1UL << alist->nr_addrs) - 1;
+ while (unprobed) {
+ unsigned int index = 0, i;
+ int best_prio = -1;
+
+ for (i = 0; i < alist->nr_addrs; i++) {
+ if (test_bit(i, &unprobed) &&
+ alist->addrs[i].prio > best_prio) {
+ index = i;
+ best_prio = alist->addrs[i].prio;
+ }
+ }
+ __clear_bit(index, &unprobed);
+
+ trace_afs_fs_probe(server, true, estate, index, 0, 0, 0);
+ if (!afs_fs_get_capabilities(net, server, estate, index, key))
+ afs_fs_probe_not_done(net, server, estate, index);
}
- afs_put_addrlist(ac.alist);
+ afs_put_endpoint_state(old, afs_estate_trace_put_probe);
}
/*
- * Wait for the first as-yet untried fileserver to respond.
+ * Wait for the first as-yet untried fileserver to respond, for the probe state
+ * to be superseded or for all probes to finish.
*/
-int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
+int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr)
{
- struct wait_queue_entry *waits;
- struct afs_server *server;
- unsigned int rtt = UINT_MAX, rtt_s;
- bool have_responders = false;
- int pref = -1, i;
+ struct afs_endpoint_state *estate;
+ struct afs_server_list *slist = op->server_list;
+ bool still_probing = true;
+ int ret = 0, i;
- _enter("%u,%lx", slist->nr_servers, untried);
+ _enter("%u", slist->nr_servers);
- /* Only wait for servers that have a probe outstanding. */
for (i = 0; i < slist->nr_servers; i++) {
- if (test_bit(i, &untried)) {
- server = slist->servers[i].server;
- if (!atomic_read(&server->probe_outstanding))
- __clear_bit(i, &untried);
- if (server->probe.responded)
- have_responders = true;
- }
+ estate = states[i].endpoint_state;
+ if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags))
+ return 2;
+ if (atomic_read(&estate->nr_probing))
+ still_probing = true;
+ if (estate->responsive_set & states[i].untried_addrs)
+ return 1;
}
- if (have_responders || !untried)
+ if (!still_probing)
return 0;
- waits = kmalloc(array_size(slist->nr_servers, sizeof(*waits)), GFP_KERNEL);
- if (!waits)
- return -ENOMEM;
-
- for (i = 0; i < slist->nr_servers; i++) {
- if (test_bit(i, &untried)) {
- server = slist->servers[i].server;
- init_waitqueue_entry(&waits[i], current);
- add_wait_queue(&server->probe_wq, &waits[i]);
- }
- }
+ for (i = 0; i < slist->nr_servers; i++)
+ add_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter);
for (;;) {
- bool still_probing = false;
+ still_probing = false;
- set_current_state(TASK_INTERRUPTIBLE);
+ set_current_state(intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
for (i = 0; i < slist->nr_servers; i++) {
- if (test_bit(i, &untried)) {
- server = slist->servers[i].server;
- if (server->probe.responded)
- goto stop;
- if (atomic_read(&server->probe_outstanding))
- still_probing = true;
+ estate = states[i].endpoint_state;
+ if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags)) {
+ ret = 2;
+ goto stop;
+ }
+ if (atomic_read(&estate->nr_probing))
+ still_probing = true;
+ if (estate->responsive_set & states[i].untried_addrs) {
+ ret = 1;
+ goto stop;
}
}
@@ -288,28 +349,12 @@ int afs_wait_for_fs_probes(struct afs_server_list *slist, unsigned long untried)
stop:
set_current_state(TASK_RUNNING);
- for (i = 0; i < slist->nr_servers; i++) {
- if (test_bit(i, &untried)) {
- server = slist->servers[i].server;
- rtt_s = READ_ONCE(server->rtt);
- if (test_bit(AFS_SERVER_FL_RESPONDING, &server->flags) &&
- rtt_s < rtt) {
- pref = i;
- rtt = rtt_s;
- }
-
- remove_wait_queue(&server->probe_wq, &waits[i]);
- }
- }
-
- kfree(waits);
-
- if (pref == -1 && signal_pending(current))
- return -ERESTARTSYS;
+ for (i = 0; i < slist->nr_servers; i++)
+ remove_wait_queue(&slist->servers[i].server->probe_wq, &states[i].probe_waiter);
- if (pref >= 0)
- slist->preferred = pref;
- return 0;
+ if (!ret && signal_pending(current))
+ ret = -ERESTARTSYS;
+ return ret;
}
/*
@@ -327,7 +372,7 @@ void afs_fs_probe_timer(struct timer_list *timer)
/*
* Dispatch a probe to a server.
*/
-static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server, bool all)
+static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server)
__releases(&net->fs_lock)
{
struct key *key = NULL;
@@ -340,7 +385,7 @@ static void afs_dispatch_fs_probe(struct afs_net *net, struct afs_server *server
afs_get_server(server, afs_server_trace_get_probe);
write_sequnlock(&net->fs_lock);
- afs_fs_probe_fileserver(net, server, key, all);
+ afs_fs_probe_fileserver(net, server, NULL, key);
afs_put_server(net, server, afs_server_trace_put_probe);
}
@@ -352,7 +397,7 @@ void afs_probe_fileserver(struct afs_net *net, struct afs_server *server)
{
write_seqlock(&net->fs_lock);
if (!list_empty(&server->probe_link))
- return afs_dispatch_fs_probe(net, server, true);
+ return afs_dispatch_fs_probe(net, server);
write_sequnlock(&net->fs_lock);
}
@@ -412,7 +457,7 @@ again:
_debug("probe %pU", &server->uuid);
if (server && (first_pass || !need_resched())) {
- afs_dispatch_fs_probe(net, server, server == fast);
+ afs_dispatch_fs_probe(net, server);
first_pass = false;
goto again;
}
@@ -436,12 +481,13 @@ again:
/*
* Wait for a probe on a particular fileserver to complete for 2s.
*/
-int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr)
+int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate,
+ unsigned long exclude, bool is_intr)
{
struct wait_queue_entry wait;
unsigned long timo = 2 * HZ;
- if (atomic_read(&server->probe_outstanding) == 0)
+ if (atomic_read(&estate->nr_probing) == 0)
goto dont_wait;
init_wait_entry(&wait, 0);
@@ -449,8 +495,9 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr)
prepare_to_wait_event(&server->probe_wq, &wait,
is_intr ? TASK_INTERRUPTIBLE : TASK_UNINTERRUPTIBLE);
if (timo == 0 ||
- server->probe.responded ||
- atomic_read(&server->probe_outstanding) == 0 ||
+ test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags) ||
+ (estate->responsive_set & ~exclude) ||
+ atomic_read(&estate->nr_probing) == 0 ||
(is_intr && signal_pending(current)))
break;
timo = schedule_timeout(timo);
@@ -459,7 +506,9 @@ int afs_wait_for_one_fs_probe(struct afs_server *server, bool is_intr)
finish_wait(&server->probe_wq, &wait);
dont_wait:
- if (server->probe.responded)
+ if (estate->responsive_set & ~exclude)
+ return 1;
+ if (test_bit(AFS_ESTATE_SUPERSEDED, &estate->flags))
return 0;
if (is_intr && signal_pending(current))
return -ERESTARTSYS;
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 7d37f63ef0f0..79cd30775b7a 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -290,6 +290,7 @@ void afs_fs_fetch_status(struct afs_operation *op)
bp[2] = htonl(vp->fid.vnode);
bp[3] = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -442,6 +443,7 @@ static void afs_fs_fetch_data64(struct afs_operation *op)
bp[6] = 0;
bp[7] = htonl(lower_32_bits(req->len));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -476,6 +478,7 @@ void afs_fs_fetch_data(struct afs_operation *op)
bp[4] = htonl(lower_32_bits(req->pos));
bp[5] = htonl(lower_32_bits(req->len));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -559,6 +562,7 @@ void afs_fs_create_file(struct afs_operation *op)
*bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */
*bp++ = 0; /* segment size */
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -612,6 +616,7 @@ void afs_fs_make_dir(struct afs_operation *op)
*bp++ = htonl(op->create.mode & S_IALLUGO); /* unix mode */
*bp++ = 0; /* segment size */
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -685,6 +690,7 @@ void afs_fs_remove_file(struct afs_operation *op)
bp = (void *) bp + padsz;
}
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -732,6 +738,7 @@ void afs_fs_remove_dir(struct afs_operation *op)
bp = (void *) bp + padsz;
}
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -812,6 +819,7 @@ void afs_fs_link(struct afs_operation *op)
*bp++ = htonl(vp->fid.vnode);
*bp++ = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call1(call, &vp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -907,6 +915,7 @@ void afs_fs_symlink(struct afs_operation *op)
*bp++ = htonl(S_IRWXUGO); /* unix mode */
*bp++ = 0; /* segment size */
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1003,6 +1012,7 @@ void afs_fs_rename(struct afs_operation *op)
bp = (void *) bp + n_padsz;
}
+ call->fid = orig_dvp->fid;
trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1090,6 +1100,7 @@ static void afs_fs_store_data64(struct afs_operation *op)
*bp++ = htonl(upper_32_bits(op->store.i_size));
*bp++ = htonl(lower_32_bits(op->store.i_size));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1140,6 +1151,7 @@ void afs_fs_store_data(struct afs_operation *op)
*bp++ = htonl(lower_32_bits(op->store.size));
*bp++ = htonl(lower_32_bits(op->store.i_size));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1206,6 +1218,7 @@ static void afs_fs_setattr_size64(struct afs_operation *op)
*bp++ = htonl(upper_32_bits(attr->ia_size)); /* new file length */
*bp++ = htonl(lower_32_bits(attr->ia_size));
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1247,6 +1260,7 @@ static void afs_fs_setattr_size(struct afs_operation *op)
*bp++ = 0; /* size of write */
*bp++ = htonl(attr->ia_size); /* new file length */
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1283,6 +1297,7 @@ void afs_fs_setattr(struct afs_operation *op)
xdr_encode_AFS_StoreStatus(&bp, op->setattr.attr);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1446,6 +1461,7 @@ void afs_fs_get_volume_status(struct afs_operation *op)
bp[0] = htonl(FSGETVOLUMESTATUS);
bp[1] = htonl(vp->fid.vid);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1528,6 +1544,7 @@ void afs_fs_set_lock(struct afs_operation *op)
*bp++ = htonl(vp->fid.unique);
*bp++ = htonl(op->lock.type);
+ call->fid = vp->fid;
trace_afs_make_fs_calli(call, &vp->fid, op->lock.type);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1554,6 +1571,7 @@ void afs_fs_extend_lock(struct afs_operation *op)
*bp++ = htonl(vp->fid.vnode);
*bp++ = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1580,6 +1598,7 @@ void afs_fs_release_lock(struct afs_operation *op)
*bp++ = htonl(vp->fid.vnode);
*bp++ = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1605,13 +1624,12 @@ static const struct afs_call_type afs_RXFSGiveUpAllCallBacks = {
/*
* Flush all the callbacks we have on a server.
*/
-int afs_fs_give_up_all_callbacks(struct afs_net *net,
- struct afs_server *server,
- struct afs_addr_cursor *ac,
- struct key *key)
+int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server,
+ struct afs_address *addr, struct key *key)
{
struct afs_call *call;
__be32 *bp;
+ int ret;
_enter("");
@@ -1619,15 +1637,22 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net,
if (!call)
return -ENOMEM;
- call->key = key;
+ call->key = key;
+ call->peer = rxrpc_kernel_get_peer(addr->peer);
+ call->service_id = server->service_id;
/* marshall the parameters */
bp = call->request;
*bp++ = htonl(FSGIVEUPALLCALLBACKS);
call->server = afs_use_server(server, afs_server_trace_give_up_cb);
- afs_make_call(ac, call, GFP_NOFS);
- return afs_wait_for_call_to_complete(call, ac);
+ afs_make_call(call, GFP_NOFS);
+ afs_wait_for_call_to_complete(call);
+ ret = call->error;
+ if (call->responded)
+ set_bit(AFS_SERVER_FL_RESPONDING, &server->flags);
+ afs_put_call(call);
+ return ret;
}
/*
@@ -1689,6 +1714,12 @@ static int afs_deliver_fs_get_capabilities(struct afs_call *call)
return 0;
}
+static void afs_fs_get_capabilities_destructor(struct afs_call *call)
+{
+ afs_put_endpoint_state(call->probe, afs_estate_trace_put_getcaps);
+ afs_flat_call_destructor(call);
+}
+
/*
* FS.GetCapabilities operation type
*/
@@ -1697,7 +1728,7 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
.op = afs_FS_GetCapabilities,
.deliver = afs_deliver_fs_get_capabilities,
.done = afs_fileserver_probe_result,
- .destructor = afs_flat_call_destructor,
+ .destructor = afs_fs_get_capabilities_destructor,
};
/*
@@ -1707,7 +1738,8 @@ static const struct afs_call_type afs_RXFSGetCapabilities = {
* ->done() - otherwise we return false to indicate we didn't even try.
*/
bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
- struct afs_addr_cursor *ac, struct key *key)
+ struct afs_endpoint_state *estate, unsigned int addr_index,
+ struct key *key)
{
struct afs_call *call;
__be32 *bp;
@@ -1718,10 +1750,14 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
if (!call)
return false;
- call->key = key;
- call->server = afs_use_server(server, afs_server_trace_get_caps);
- call->upgrade = true;
- call->async = true;
+ call->key = key;
+ call->server = afs_use_server(server, afs_server_trace_get_caps);
+ call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer);
+ call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps);
+ call->probe_index = addr_index;
+ call->service_id = server->service_id;
+ call->upgrade = true;
+ call->async = true;
call->max_lifespan = AFS_PROBE_MAX_LIFESPAN;
/* marshall the parameters */
@@ -1729,7 +1765,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
*bp++ = htonl(FSGETCAPABILITIES);
trace_afs_make_fs_call(call, NULL);
- afs_make_call(ac, call, GFP_NOFS);
+ afs_make_call(call, GFP_NOFS);
afs_put_call(call);
return true;
}
@@ -1853,7 +1889,10 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
return ret;
bp = call->buffer;
- xdr_decode_AFSVolSync(&bp, &op->volsync);
+ /* Unfortunately, prior to OpenAFS-1.6, volsync here is filled
+ * with rubbish.
+ */
+ xdr_decode_AFSVolSync(&bp, NULL);
call->unmarshall++;
fallthrough;
@@ -1899,7 +1938,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op)
int i;
if (test_bit(AFS_SERVER_FL_NO_IBULK, &op->server->flags)) {
- op->error = -ENOTSUPP;
+ afs_op_set_error(op, -ENOTSUPP);
return;
}
@@ -1928,6 +1967,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op)
*bp++ = htonl(op->more_files[i].fid.unique);
}
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -2033,6 +2073,7 @@ void afs_fs_fetch_acl(struct afs_operation *op)
bp[2] = htonl(vp->fid.vnode);
bp[3] = htonl(vp->fid.unique);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_KERNEL);
}
@@ -2078,6 +2119,7 @@ void afs_fs_store_acl(struct afs_operation *op)
if (acl->size != size)
memset((void *)&bp[5] + acl->size, 0, size - acl->size);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_KERNEL);
}
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 78efc9719349..94fc049aff58 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -58,7 +58,7 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren
*/
static void afs_set_netfs_context(struct afs_vnode *vnode)
{
- netfs_inode_init(&vnode->netfs, &afs_req_ops);
+ netfs_inode_init(&vnode->netfs, &afs_req_ops, true);
}
/*
@@ -85,8 +85,7 @@ static int afs_inode_init_from_status(struct afs_operation *op,
write_seqlock(&vnode->cb_lock);
- vnode->cb_v_break = op->cb_v_break;
- vnode->cb_s_break = op->cb_s_break;
+ vnode->cb_v_check = op->cb_v_break;
vnode->status = *status;
t = status->mtime_client;
@@ -146,11 +145,10 @@ static int afs_inode_init_from_status(struct afs_operation *op,
if (!vp->scb.have_cb) {
/* it's a symlink we just created (the fileserver
* didn't give us a callback) */
- vnode->cb_expires_at = ktime_get_real_seconds();
+ atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
} else {
- vnode->cb_expires_at = vp->scb.callback.expires_at;
vnode->cb_server = op->server;
- set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ atomic64_set(&vnode->cb_expires_at, vp->scb.callback.expires_at);
}
write_sequnlock(&vnode->cb_lock);
@@ -168,6 +166,7 @@ static void afs_apply_status(struct afs_operation *op,
struct inode *inode = &vnode->netfs.inode;
struct timespec64 t;
umode_t mode;
+ bool unexpected_jump = false;
bool data_changed = false;
bool change_size = vp->set_size;
@@ -214,7 +213,8 @@ static void afs_apply_status(struct afs_operation *op,
vnode->status = *status;
if (vp->dv_before + vp->dv_delta != status->data_version) {
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
+ if (vnode->cb_ro_snapshot == atomic_read(&vnode->volume->cb_ro_snapshot) &&
+ atomic64_read(&vnode->cb_expires_at) != AFS_NO_CB_PROMISE)
pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n",
vnode->fid.vid, vnode->fid.vnode,
(unsigned long long)vp->dv_before + vp->dv_delta,
@@ -231,6 +231,7 @@ static void afs_apply_status(struct afs_operation *op,
}
change_size = true;
data_changed = true;
+ unexpected_jump = true;
} else if (vnode->status.type == AFS_FTYPE_DIR) {
/* Expected directory change is handled elsewhere so
* that we can locally edit the directory and save on a
@@ -250,8 +251,10 @@ static void afs_apply_status(struct afs_operation *op,
* what's on the server.
*/
vnode->netfs.remote_i_size = status->size;
- if (change_size) {
+ if (change_size || status->size > i_size_read(inode)) {
afs_set_i_size(vnode, status->size);
+ if (unexpected_jump)
+ vnode->netfs.zero_point = status->size;
inode_set_ctime_to_ts(inode, t);
inode_set_atime_to_ts(inode, t);
}
@@ -268,9 +271,9 @@ static void afs_apply_callback(struct afs_operation *op,
struct afs_vnode *vnode = vp->vnode;
if (!afs_cb_is_broken(vp->cb_break_before, vnode)) {
- vnode->cb_expires_at = cb->expires_at;
- vnode->cb_server = op->server;
- set_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ if (op->volume->type == AFSVL_RWVOL)
+ vnode->cb_server = op->server;
+ atomic64_set(&vnode->cb_expires_at, cb->expires_at);
}
}
@@ -331,7 +334,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
if (vnode->netfs.inode.i_state & I_NEW) {
ret = afs_inode_init_from_status(op, vp, vnode);
- op->error = ret;
+ afs_op_set_error(op, ret);
if (ret == 0)
afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb);
} else {
@@ -542,7 +545,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key)
BUG_ON(!(inode->i_state & I_NEW));
vnode = AFS_FS_I(inode);
- vnode->cb_v_break = as->volume->cb_v_break,
+ vnode->cb_v_check = atomic_read(&as->volume->cb_v_break),
afs_set_netfs_context(vnode);
op = afs_alloc_operation(key, as->volume);
@@ -573,180 +576,6 @@ error:
}
/*
- * mark the data attached to an inode as obsolete due to a write on the server
- * - might also want to ditch all the outstanding writes and dirty pages
- */
-static void afs_zap_data(struct afs_vnode *vnode)
-{
- _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
-
- afs_invalidate_cache(vnode, 0);
-
- /* nuke all the non-dirty pages that aren't locked, mapped or being
- * written back in a regular file and completely discard the pages in a
- * directory or symlink */
- if (S_ISREG(vnode->netfs.inode.i_mode))
- invalidate_remote_inode(&vnode->netfs.inode);
- else
- invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
-}
-
-/*
- * Check to see if we have a server currently serving this volume and that it
- * hasn't been reinitialised or dropped from the list.
- */
-static bool afs_check_server_good(struct afs_vnode *vnode)
-{
- struct afs_server_list *slist;
- struct afs_server *server;
- bool good;
- int i;
-
- if (vnode->cb_fs_s_break == atomic_read(&vnode->volume->cell->fs_s_break))
- return true;
-
- rcu_read_lock();
-
- slist = rcu_dereference(vnode->volume->servers);
- for (i = 0; i < slist->nr_servers; i++) {
- server = slist->servers[i].server;
- if (server == vnode->cb_server) {
- good = (vnode->cb_s_break == server->cb_s_break);
- rcu_read_unlock();
- return good;
- }
- }
-
- rcu_read_unlock();
- return false;
-}
-
-/*
- * Check the validity of a vnode/inode.
- */
-bool afs_check_validity(struct afs_vnode *vnode)
-{
- enum afs_cb_break_reason need_clear = afs_cb_break_no_break;
- time64_t now = ktime_get_real_seconds();
- unsigned int cb_break;
- int seq = 0;
-
- do {
- read_seqbegin_or_lock(&vnode->cb_lock, &seq);
- cb_break = vnode->cb_break;
-
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
- if (vnode->cb_v_break != vnode->volume->cb_v_break)
- need_clear = afs_cb_break_for_v_break;
- else if (!afs_check_server_good(vnode))
- need_clear = afs_cb_break_for_s_reinit;
- else if (test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
- need_clear = afs_cb_break_for_zap;
- else if (vnode->cb_expires_at - 10 <= now)
- need_clear = afs_cb_break_for_lapsed;
- } else if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
- ;
- } else {
- need_clear = afs_cb_break_no_promise;
- }
-
- } while (need_seqretry(&vnode->cb_lock, seq));
-
- done_seqretry(&vnode->cb_lock, seq);
-
- if (need_clear == afs_cb_break_no_break)
- return true;
-
- write_seqlock(&vnode->cb_lock);
- if (need_clear == afs_cb_break_no_promise)
- vnode->cb_v_break = vnode->volume->cb_v_break;
- else if (cb_break == vnode->cb_break)
- __afs_break_callback(vnode, need_clear);
- else
- trace_afs_cb_miss(&vnode->fid, need_clear);
- write_sequnlock(&vnode->cb_lock);
- return false;
-}
-
-/*
- * Returns true if the pagecache is still valid. Does not sleep.
- */
-bool afs_pagecache_valid(struct afs_vnode *vnode)
-{
- if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) {
- if (vnode->netfs.inode.i_nlink)
- clear_nlink(&vnode->netfs.inode);
- return true;
- }
-
- if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags) &&
- afs_check_validity(vnode))
- return true;
-
- return false;
-}
-
-/*
- * validate a vnode/inode
- * - there are several things we need to check
- * - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
- * symlink)
- * - parent dir metadata changed (security changes)
- * - dentry data changed (write, truncate)
- * - dentry metadata changed (security changes)
- */
-int afs_validate(struct afs_vnode *vnode, struct key *key)
-{
- int ret;
-
- _enter("{v={%llx:%llu} fl=%lx},%x",
- vnode->fid.vid, vnode->fid.vnode, vnode->flags,
- key_serial(key));
-
- if (afs_pagecache_valid(vnode))
- goto valid;
-
- down_write(&vnode->validate_lock);
-
- /* if the promise has expired, we need to check the server again to get
- * a new promise - note that if the (parent) directory's metadata was
- * changed then the security may be different and we may no longer have
- * access */
- if (!test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
- _debug("not promised");
- ret = afs_fetch_status(vnode, key, false, NULL);
- if (ret < 0) {
- if (ret == -ENOENT) {
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- ret = -ESTALE;
- }
- goto error_unlock;
- }
- _debug("new promise [fl=%lx]", vnode->flags);
- }
-
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
- _debug("file already deleted");
- ret = -ESTALE;
- goto error_unlock;
- }
-
- /* if the vnode's data version number changed then its contents are
- * different */
- if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags))
- afs_zap_data(vnode);
- up_write(&vnode->validate_lock);
-valid:
- _leave(" = 0");
- return 0;
-
-error_unlock:
- up_write(&vnode->validate_lock);
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
* read the attributes of an inode
*/
int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
@@ -755,13 +584,13 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct inode *inode = d_inode(path->dentry);
struct afs_vnode *vnode = AFS_FS_I(inode);
struct key *key;
- int ret, seq = 0;
+ int ret, seq;
_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
if (vnode->volume &&
!(query_flags & AT_STATX_DONT_SYNC) &&
- !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+ atomic64_read(&vnode->cb_expires_at) == AFS_NO_CB_PROMISE) {
key = afs_request_key(vnode->volume->cell);
if (IS_ERR(key))
return PTR_ERR(key);
@@ -772,7 +601,7 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
}
do {
- read_seqbegin_or_lock(&vnode->cb_lock, &seq);
+ seq = read_seqbegin(&vnode->cb_lock);
generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
if (test_bit(AFS_VNODE_SILLY_DELETED, &vnode->flags) &&
stat->nlink > 0)
@@ -784,9 +613,8 @@ int afs_getattr(struct mnt_idmap *idmap, const struct path *path,
*/
if (S_ISDIR(inode->i_mode))
stat->size = vnode->netfs.remote_i_size;
- } while (need_seqretry(&vnode->cb_lock, seq));
+ } while (read_seqretry(&vnode->cb_lock, seq));
- done_seqretry(&vnode->cb_lock, seq);
return 0;
}
@@ -823,7 +651,7 @@ void afs_evict_inode(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
afs_set_cache_aux(vnode, &aux);
- fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux);
+ netfs_clear_inode_writeback(inode, &aux);
clear_inode(inode);
while (!list_empty(&vnode->wb_keys)) {
@@ -865,17 +693,17 @@ static void afs_setattr_success(struct afs_operation *op)
static void afs_setattr_edit_file(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
- struct inode *inode = &vp->vnode->netfs.inode;
+ struct afs_vnode *vnode = vp->vnode;
if (op->setattr.attr->ia_valid & ATTR_SIZE) {
loff_t size = op->setattr.attr->ia_size;
loff_t i_size = op->setattr.old_i_size;
- if (size < i_size)
- truncate_pagecache(inode, size);
- if (size != i_size)
- fscache_resize_cookie(afs_vnode_cache(vp->vnode),
- vp->scb.status.size);
+ if (size != i_size) {
+ truncate_setsize(&vnode->netfs.inode, size);
+ netfs_resize_file(&vnode->netfs, size, true);
+ fscache_resize_cookie(afs_vnode_cache(vnode), size);
+ }
}
}
@@ -943,11 +771,11 @@ int afs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
*/
if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) &&
attr->ia_size < i_size &&
- attr->ia_size > vnode->status.size) {
- truncate_pagecache(inode, attr->ia_size);
+ attr->ia_size > vnode->netfs.remote_i_size) {
+ truncate_setsize(inode, attr->ia_size);
+ netfs_resize_file(&vnode->netfs, size, false);
fscache_resize_cookie(afs_vnode_cache(vnode),
attr->ia_size);
- i_size_write(inode, attr->ia_size);
ret = 0;
goto out_unlock;
}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 7385d62c8cf5..6ce5a612937c 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -33,6 +33,7 @@
struct pagevec;
struct afs_call;
struct afs_vnode;
+struct afs_server_probe;
/*
* Partial file-locking emulation mode. (The problem being that AFS3 only
@@ -73,21 +74,51 @@ enum afs_call_state {
};
/*
+ * Address preferences.
+ */
+struct afs_addr_preference {
+ union {
+ struct in_addr ipv4_addr; /* AF_INET address to compare against */
+ struct in6_addr ipv6_addr; /* AF_INET6 address to compare against */
+ };
+ sa_family_t family; /* Which address to use */
+ u16 prio; /* Priority */
+ u8 subnet_mask; /* How many bits to compare */
+};
+
+struct afs_addr_preference_list {
+ struct rcu_head rcu;
+ u16 version; /* Incremented when prefs list changes */
+ u8 ipv6_off; /* Offset of IPv6 addresses */
+ u8 nr; /* Number of addresses in total */
+ u8 max_prefs; /* Number of prefs allocated */
+ struct afs_addr_preference prefs[] __counted_by(max_prefs);
+};
+
+struct afs_address {
+ struct rxrpc_peer *peer;
+ short last_error; /* Last error from this address */
+ u16 prio; /* Address priority */
+};
+
+/*
* List of server addresses.
*/
struct afs_addr_list {
struct rcu_head rcu;
refcount_t usage;
u32 version; /* Version */
+ unsigned int debug_id;
+ unsigned int addr_pref_version; /* Version of address preference list */
unsigned char max_addrs;
unsigned char nr_addrs;
unsigned char preferred; /* Preferred address */
unsigned char nr_ipv4; /* Number of IPv4 addresses */
enum dns_record_source source:8;
enum dns_lookup_status status:8;
- unsigned long failed; /* Mask of addrs that failed locally/ICMP */
+ unsigned long probe_failed; /* Mask of addrs that failed locally/ICMP */
unsigned long responded; /* Mask of addrs that responded */
- struct sockaddr_rxrpc addrs[] __counted_by(max_addrs);
+ struct afs_address addrs[] __counted_by(max_addrs);
#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
};
@@ -96,11 +127,11 @@ struct afs_addr_list {
*/
struct afs_call {
const struct afs_call_type *type; /* type of call */
- struct afs_addr_list *alist; /* Address is alist[addr_ix] */
wait_queue_head_t waitq; /* processes awaiting completion */
struct work_struct async_work; /* async I/O processor */
struct work_struct work; /* actual work processor */
struct rxrpc_call *rxcall; /* RxRPC call handle */
+ struct rxrpc_peer *peer; /* Remote endpoint */
struct key *key; /* security for this call */
struct afs_net *net; /* The network namespace */
struct afs_server *server; /* The fileserver record if fs op (pins ref) */
@@ -116,11 +147,14 @@ struct afs_call {
};
void *buffer; /* reply receive buffer */
union {
- long ret0; /* Value to reply with instead of 0 */
+ struct afs_endpoint_state *probe;
+ struct afs_addr_list *vl_probe;
struct afs_addr_list *ret_alist;
struct afs_vldb_entry *ret_vldb;
char *ret_str;
};
+ struct afs_fid fid; /* Primary vnode ID (or all zeroes) */
+ unsigned char probe_index; /* Address in ->probe_alist */
struct afs_operation *op;
unsigned int server_index;
refcount_t ref;
@@ -133,13 +167,13 @@ struct afs_call {
unsigned reply_max; /* maximum size of reply */
unsigned count2; /* count used in unmarshalling */
unsigned char unmarshall; /* unmarshalling phase */
- unsigned char addr_ix; /* Address in ->alist */
bool drop_ref; /* T if need to drop ref for incoming call */
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool upgrade; /* T to request service upgrade */
bool intr; /* T if interruptible */
bool unmarshalling_error; /* T if an unmarshalling error occurred */
+ bool responded; /* Got a response from the call (may be abort) */
u16 service_id; /* Actual service ID (after upgrade) */
unsigned int debug_id; /* Trace ID */
u32 operation_ID; /* operation ID for an incoming call */
@@ -287,8 +321,7 @@ struct afs_net {
struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */
struct hlist_head fs_proc; /* procfs servers list */
- struct hlist_head fs_addresses4; /* afs_server (by lowest IPv4 addr) */
- struct hlist_head fs_addresses6; /* afs_server (by lowest IPv6 addr) */
+ struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */
seqlock_t fs_addr_lock; /* For fs_addresses[46] */
struct work_struct fs_manager;
@@ -306,6 +339,8 @@ struct afs_net {
struct proc_dir_entry *proc_afs; /* /proc/net/afs directory */
struct afs_sysnames *sysnames;
rwlock_t sysnames_lock;
+ struct afs_addr_preference_list __rcu *address_prefs;
+ u16 address_pref_version;
/* Statistics counters */
atomic_t n_lookup; /* Number of lookups done */
@@ -379,6 +414,7 @@ struct afs_cell {
unsigned int debug_id;
/* The volumes belonging to this cell */
+ struct rw_semaphore vs_lock; /* Lock for server->volumes */
struct rb_root volumes; /* Tree of volumes on this server */
struct hlist_head proc_volumes; /* procfs volume list */
seqlock_t volume_lock; /* For volumes */
@@ -386,9 +422,6 @@ struct afs_cell {
/* Active fileserver interaction state. */
struct rb_root fs_servers; /* afs_server (by server UUID) */
seqlock_t fs_lock; /* For fs_servers */
- struct rw_semaphore fs_open_mmaps_lock;
- struct list_head fs_open_mmaps; /* List of vnodes that are mmapped */
- atomic_t fs_s_break; /* Counter of CB.InitCallBackState messages */
/* VL server list. */
rwlock_t vl_servers_lock; /* Lock on vl_servers */
@@ -412,13 +445,14 @@ struct afs_vlserver {
rwlock_t lock; /* Lock on addresses */
refcount_t ref;
unsigned int rtt; /* Server's current RTT in uS */
+ unsigned int debug_id;
/* Probe state */
wait_queue_head_t probe_wq;
atomic_t probe_outstanding;
spinlock_t probe_lock;
struct {
- unsigned int rtt; /* RTT in uS */
+ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */
u32 abort_code;
short error;
unsigned short flags;
@@ -428,6 +462,7 @@ struct afs_vlserver {
#define AFS_VLSERVER_PROBE_LOCAL_FAILURE 0x08 /* A local failure prevented a probe */
} probe;
+ u16 service_id; /* Service ID we're using */
u16 port;
u16 name_len; /* Length of name */
char name[]; /* Server name, case-flattened */
@@ -477,6 +512,7 @@ struct afs_vldb_entry {
#define AFS_VOL_VTM_RW 0x01 /* R/W version of the volume is available (on this server) */
#define AFS_VOL_VTM_RO 0x02 /* R/O version of the volume is available (on this server) */
#define AFS_VOL_VTM_BAK 0x04 /* backup version of the volume is available (on this server) */
+ u8 vlsf_flags[AFS_NMAXNSERVERS];
short error;
u8 nr_servers; /* Number of server records */
u8 name_len;
@@ -484,6 +520,32 @@ struct afs_vldb_entry {
};
/*
+ * Fileserver endpoint state. The records the addresses of a fileserver's
+ * endpoints and the state and result of a round of probing on them. This
+ * allows the rotation algorithm to access those results without them being
+ * erased by a subsequent round of probing.
+ */
+struct afs_endpoint_state {
+ struct rcu_head rcu;
+ struct afs_addr_list *addresses; /* The addresses being probed */
+ unsigned long responsive_set; /* Bitset of responsive endpoints */
+ unsigned long failed_set; /* Bitset of endpoints we failed to probe */
+ refcount_t ref;
+ unsigned int server_id; /* Debug ID of server */
+ unsigned int probe_seq; /* Probe sequence (from server::probe_counter) */
+ atomic_t nr_probing; /* Number of outstanding probes */
+ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */
+ s32 abort_code;
+ short error;
+ unsigned long flags;
+#define AFS_ESTATE_RESPONDED 0 /* Set if the server responded */
+#define AFS_ESTATE_SUPERSEDED 1 /* Set if this record has been superseded */
+#define AFS_ESTATE_IS_YFS 2 /* Set if probe upgraded to YFS */
+#define AFS_ESTATE_NOT_YFS 3 /* Set if probe didn't upgrade to YFS */
+#define AFS_ESTATE_LOCAL_FAILURE 4 /* Set if there was a local failure (eg. ENOMEM) */
+};
+
+/*
* Record of fileserver with which we're actively communicating.
*/
struct afs_server {
@@ -493,16 +555,14 @@ struct afs_server {
struct afs_uuid _uuid;
};
- struct afs_addr_list __rcu *addresses;
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
struct rb_node uuid_rb; /* Link in net->fs_servers */
struct afs_server __rcu *uuid_next; /* Next server with same UUID */
struct afs_server *uuid_prev; /* Previous server with same UUID */
struct list_head probe_link; /* Link in net->fs_probe_list */
- struct hlist_node addr4_link; /* Link in net->fs_addresses4 */
- struct hlist_node addr6_link; /* Link in net->fs_addresses6 */
+ struct hlist_node addr_link; /* Link in net->fs_addresses6 */
struct hlist_node proc_link; /* Link in net->fs_proc */
- struct work_struct initcb_work; /* Work for CB.InitCallBackState* */
+ struct list_head volumes; /* RCU list of afs_server_entry objects */
struct afs_server *gc_next; /* Next server in manager's list */
time64_t unuse_time; /* Time at which last unused */
unsigned long flags;
@@ -520,44 +580,47 @@ struct afs_server {
refcount_t ref; /* Object refcount */
atomic_t active; /* Active user count */
u32 addr_version; /* Address list version */
+ u16 service_id; /* Service ID we're using. */
unsigned int rtt; /* Server's current RTT in uS */
unsigned int debug_id; /* Debugging ID for traces */
/* file service access */
rwlock_t fs_lock; /* access lock */
- /* callback promise management */
- unsigned cb_s_break; /* Break-everything counter. */
-
/* Probe state */
+ struct afs_endpoint_state __rcu *endpoint_state; /* Latest endpoint/probe state */
unsigned long probed_at; /* Time last probe was dispatched (jiffies) */
wait_queue_head_t probe_wq;
- atomic_t probe_outstanding;
+ unsigned int probe_counter; /* Number of probes issued */
spinlock_t probe_lock;
- struct {
- unsigned int rtt; /* RTT in uS */
- u32 abort_code;
- short error;
- bool responded:1;
- bool is_yfs:1;
- bool not_yfs:1;
- bool local_failure:1;
- } probe;
};
+enum afs_ro_replicating {
+ AFS_RO_NOT_REPLICATING, /* Not doing replication */
+ AFS_RO_REPLICATING_USE_OLD, /* Replicating; use old version */
+ AFS_RO_REPLICATING_USE_NEW, /* Replicating; switch to new version */
+} __mode(byte);
+
/*
* Replaceable volume server list.
*/
struct afs_server_entry {
struct afs_server *server;
+ struct afs_volume *volume;
+ struct list_head slink; /* Link in server->volumes */
+ time64_t cb_expires_at; /* Time at which volume-level callback expires */
+ unsigned long flags;
+#define AFS_SE_EXCLUDED 0 /* Set if server is to be excluded in rotation */
+#define AFS_SE_VOLUME_OFFLINE 1 /* Set if volume offline notice given */
+#define AFS_SE_VOLUME_BUSY 2 /* Set if volume busy notice given */
};
struct afs_server_list {
struct rcu_head rcu;
- afs_volid_t vids[AFS_MAXTYPES]; /* Volume IDs */
refcount_t usage;
+ bool attached; /* T if attached to servers */
+ enum afs_ro_replicating ro_replicating; /* RW->RO update (probably) in progress */
unsigned char nr_servers;
- unsigned char preferred; /* Preferred server */
unsigned short vnovol_mask; /* Servers to be skipped due to VNOVOL */
unsigned int seq; /* Set to ->servers_seq when installed */
rwlock_t lock;
@@ -568,25 +631,23 @@ struct afs_server_list {
* Live AFS volume management.
*/
struct afs_volume {
- union {
- struct rcu_head rcu;
- afs_volid_t vid; /* volume ID */
- };
+ struct rcu_head rcu;
+ afs_volid_t vid; /* The volume ID of this volume */
+ afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */
refcount_t ref;
time64_t update_at; /* Time at which to next update */
struct afs_cell *cell; /* Cell to which belongs (pins ref) */
struct rb_node cell_node; /* Link in cell->volumes */
struct hlist_node proc_link; /* Link in cell->proc_volumes */
struct super_block __rcu *sb; /* Superblock on which inodes reside */
+ struct work_struct destructor; /* Deferred destructor */
unsigned long flags;
#define AFS_VOLUME_NEEDS_UPDATE 0 /* - T if an update needs performing */
#define AFS_VOLUME_UPDATING 1 /* - T if an update is in progress */
#define AFS_VOLUME_WAIT 2 /* - T if users must wait for update */
#define AFS_VOLUME_DELETED 3 /* - T if volume appears deleted */
-#define AFS_VOLUME_OFFLINE 4 /* - T if volume offline notice given */
-#define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */
-#define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */
-#define AFS_VOLUME_RM_TREE 7 /* - Set if volume removed from cell->volumes */
+#define AFS_VOLUME_MAYBE_NO_IBULK 4 /* - T if some servers don't have InlineBulkStatus */
+#define AFS_VOLUME_RM_TREE 5 /* - Set if volume removed from cell->volumes */
#ifdef CONFIG_AFS_FSCACHE
struct fscache_volume *cache; /* Caching cookie */
#endif
@@ -594,8 +655,21 @@ struct afs_volume {
rwlock_t servers_lock; /* Lock for ->servers */
unsigned int servers_seq; /* Incremented each time ->servers changes */
- unsigned cb_v_break; /* Break-everything counter. */
+ /* RO release tracking */
+ struct mutex volsync_lock; /* Time/state evaluation lock */
+ time64_t creation_time; /* Volume creation time (or TIME64_MIN) */
+ time64_t update_time; /* Volume update time (or TIME64_MIN) */
+
+ /* Callback management */
+ struct mutex cb_check_lock; /* Lock to control race to check after v_break */
+ time64_t cb_expires_at; /* Earliest volume callback expiry time */
+ atomic_t cb_ro_snapshot; /* RO volume update-from-snapshot counter */
+ atomic_t cb_v_break; /* Volume-break event counter. */
+ atomic_t cb_v_check; /* Volume-break has-been-checked counter. */
+ atomic_t cb_scrub; /* Scrub-all-data event counter. */
rwlock_t cb_v_break_lock;
+ struct rw_semaphore open_mmaps_lock;
+ struct list_head open_mmaps; /* List of vnodes that are mmapped */
afs_voltype_t type; /* type of volume */
char type_force; /* force volume type (suppress R/O -> R/W) */
@@ -634,7 +708,6 @@ struct afs_vnode {
spinlock_t wb_lock; /* lock for wb_keys */
spinlock_t lock; /* waitqueue/flags lock */
unsigned long flags;
-#define AFS_VNODE_CB_PROMISED 0 /* Set if vnode has a callback promise */
#define AFS_VNODE_UNSET 1 /* set if vnode attributes not yet set */
#define AFS_VNODE_DIR_VALID 2 /* Set if dir contents are valid */
#define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */
@@ -660,13 +733,14 @@ struct afs_vnode {
struct list_head cb_mmap_link; /* Link in cell->fs_open_mmaps */
void *cb_server; /* Server with callback/filelock */
atomic_t cb_nr_mmap; /* Number of mmaps */
- unsigned int cb_fs_s_break; /* Mass server break counter (cell->fs_s_break) */
- unsigned int cb_s_break; /* Mass break counter on ->server */
- unsigned int cb_v_break; /* Mass break counter on ->volume */
+ unsigned int cb_ro_snapshot; /* RO volume release counter on ->volume */
+ unsigned int cb_scrub; /* Scrub counter on ->volume */
unsigned int cb_break; /* Break counter on vnode */
+ unsigned int cb_v_check; /* Break check counter on ->volume */
seqlock_t cb_lock; /* Lock for ->cb_server, ->status, ->cb_*break */
- time64_t cb_expires_at; /* time at which callback expires */
+ atomic64_t cb_expires_at; /* time at which callback expires */
+#define AFS_NO_CB_PROMISE TIME64_MIN
};
static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode)
@@ -714,40 +788,49 @@ struct afs_permits {
* Error prioritisation and accumulation.
*/
struct afs_error {
- short error; /* Accumulated error */
+ s32 abort_code; /* Cumulative abort code */
+ short error; /* Cumulative error */
bool responded; /* T if server responded */
-};
-
-/*
- * Cursor for iterating over a server's address list.
- */
-struct afs_addr_cursor {
- struct afs_addr_list *alist; /* Current address list (pins ref) */
- unsigned long tried; /* Tried addresses */
- signed char index; /* Current address */
- bool responded; /* T if the current address responded */
- unsigned short nr_iterations; /* Number of address iterations */
- short error;
- u32 abort_code;
+ bool aborted; /* T if ->error is from an abort */
};
/*
* Cursor for iterating over a set of volume location servers.
*/
struct afs_vl_cursor {
- struct afs_addr_cursor ac;
struct afs_cell *cell; /* The cell we're querying */
struct afs_vlserver_list *server_list; /* Current server list (pins ref) */
struct afs_vlserver *server; /* Server on which this resides */
+ struct afs_addr_list *alist; /* Current address list (pins ref) */
struct key *key; /* Key for the server */
- unsigned long untried; /* Bitmask of untried servers */
- short index; /* Current server */
- short error;
+ unsigned long untried_servers; /* Bitmask of untried servers */
+ unsigned long addr_tried; /* Tried addresses */
+ struct afs_error cumul_error; /* Cumulative error */
+ unsigned int debug_id;
+ s32 call_abort_code;
+ short call_error; /* Error from single call */
+ short server_index; /* Current server */
+ signed char addr_index; /* Current address */
unsigned short flags;
#define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */
#define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */
#define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */
- unsigned short nr_iterations; /* Number of server iterations */
+ short nr_iterations; /* Number of server iterations */
+ bool call_responded; /* T if the current address responded */
+};
+
+/*
+ * Fileserver state tracking for an operation. An array of these is kept,
+ * indexed by server index.
+ */
+struct afs_server_state {
+ /* Tracking of fileserver probe state. Other operations may interfere
+ * by probing a fileserver when accessing other volumes.
+ */
+ unsigned int probe_seq;
+ unsigned long untried_addrs; /* Addresses we haven't tried yet */
+ struct wait_queue_entry probe_waiter;
+ struct afs_endpoint_state *endpoint_state; /* Endpoint state being monitored */
};
/*
@@ -768,7 +851,7 @@ struct afs_vnode_param {
struct afs_fid fid; /* Fid to access */
struct afs_status_cb scb; /* Returned status and callback promise */
afs_dataversion_t dv_before; /* Data version before the call */
- unsigned int cb_break_before; /* cb_break + cb_s_break before the call */
+ unsigned int cb_break_before; /* cb_break before the call */
u8 dv_delta; /* Expected change in data version */
bool put_vnode:1; /* T if we have a ref on the vnode */
bool need_io_lock:1; /* T if we need the I/O lock on this */
@@ -793,17 +876,17 @@ struct afs_operation {
struct afs_volume *volume; /* Volume being accessed */
struct afs_vnode_param file[2];
struct afs_vnode_param *more_files;
- struct afs_volsync volsync;
+ struct afs_volsync pre_volsync; /* Volsync before op */
+ struct afs_volsync volsync; /* Volsync returned by op */
struct dentry *dentry; /* Dentry to be altered */
struct dentry *dentry_2; /* Second dentry to be altered */
struct timespec64 mtime; /* Modification time to record */
struct timespec64 ctime; /* Change time to set */
+ struct afs_error cumul_error; /* Cumulative error */
short nr_files; /* Number of entries in file[], more_files */
- short error;
unsigned int debug_id;
unsigned int cb_v_break; /* Volume break counter before op */
- unsigned int cb_s_break; /* Server break counter before op */
union {
struct {
@@ -848,13 +931,19 @@ struct afs_operation {
};
/* Fileserver iteration state */
- struct afs_addr_cursor ac;
struct afs_server_list *server_list; /* Current server list (pins ref) */
struct afs_server *server; /* Server we're using (ref pinned by server_list) */
+ struct afs_endpoint_state *estate; /* Current endpoint state (doesn't pin ref) */
+ struct afs_server_state *server_states; /* States of the servers involved */
struct afs_call *call;
- unsigned long untried; /* Bitmask of untried servers */
- short index; /* Current server */
- unsigned short nr_iterations; /* Number of server iterations */
+ unsigned long untried_servers; /* Bitmask of untried servers */
+ unsigned long addr_tried; /* Tried addresses */
+ s32 call_abort_code; /* Abort code from single call */
+ short call_error; /* Error from single call */
+ short server_index; /* Current server */
+ short nr_iterations; /* Number of server iterations */
+ signed char addr_index; /* Current address */
+ bool call_responded; /* T if the current address responded */
unsigned int flags;
#define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */
@@ -894,93 +983,38 @@ static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int fl
i_size_read(&vnode->netfs.inode), flags);
}
-/*
- * We use folio->private to hold the amount of the folio that we've written to,
- * splitting the field into two parts. However, we need to represent a range
- * 0...FOLIO_SIZE, so we reduce the resolution if the size of the folio
- * exceeds what we can encode.
- */
-#ifdef CONFIG_64BIT
-#define __AFS_FOLIO_PRIV_MASK 0x7fffffffUL
-#define __AFS_FOLIO_PRIV_SHIFT 32
-#define __AFS_FOLIO_PRIV_MMAPPED 0x80000000UL
-#else
-#define __AFS_FOLIO_PRIV_MASK 0x7fffUL
-#define __AFS_FOLIO_PRIV_SHIFT 16
-#define __AFS_FOLIO_PRIV_MMAPPED 0x8000UL
-#endif
-
-static inline unsigned int afs_folio_dirty_resolution(struct folio *folio)
-{
- int shift = folio_shift(folio) - (__AFS_FOLIO_PRIV_SHIFT - 1);
- return (shift > 0) ? shift : 0;
-}
-
-static inline size_t afs_folio_dirty_from(struct folio *folio, unsigned long priv)
-{
- unsigned long x = priv & __AFS_FOLIO_PRIV_MASK;
-
- /* The lower bound is inclusive */
- return x << afs_folio_dirty_resolution(folio);
-}
-
-static inline size_t afs_folio_dirty_to(struct folio *folio, unsigned long priv)
-{
- unsigned long x = (priv >> __AFS_FOLIO_PRIV_SHIFT) & __AFS_FOLIO_PRIV_MASK;
-
- /* The upper bound is immediately beyond the region */
- return (x + 1) << afs_folio_dirty_resolution(folio);
-}
-
-static inline unsigned long afs_folio_dirty(struct folio *folio, size_t from, size_t to)
-{
- unsigned int res = afs_folio_dirty_resolution(folio);
- from >>= res;
- to = (to - 1) >> res;
- return (to << __AFS_FOLIO_PRIV_SHIFT) | from;
-}
-
-static inline unsigned long afs_folio_dirty_mmapped(unsigned long priv)
-{
- return priv | __AFS_FOLIO_PRIV_MMAPPED;
-}
-
-static inline bool afs_is_folio_dirty_mmapped(unsigned long priv)
-{
- return priv & __AFS_FOLIO_PRIV_MMAPPED;
-}
-
#include <trace/events/afs.h>
/*****************************************************************************/
/*
* addr_list.c
*/
-static inline struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist)
-{
- if (alist)
- refcount_inc(&alist->usage);
- return alist;
-}
-extern struct afs_addr_list *afs_alloc_addrlist(unsigned int,
- unsigned short,
- unsigned short);
-extern void afs_put_addrlist(struct afs_addr_list *);
+struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason);
+extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr);
+extern void afs_put_addrlist(struct afs_addr_list *alist, enum afs_alist_trace reason);
extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *,
const char *, size_t, char,
unsigned short, unsigned short);
+bool afs_addr_list_same(const struct afs_addr_list *a,
+ const struct afs_addr_list *b);
extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *);
-extern bool afs_iterate_addresses(struct afs_addr_cursor *);
-extern int afs_end_cursor(struct afs_addr_cursor *);
-extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16);
-extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16);
+extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr,
+ __be32 xdr, u16 port);
+extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr,
+ __be32 *xdr, u16 port);
+
+/*
+ * addr_prefs.c
+ */
+int afs_proc_addr_prefs_write(struct file *file, char *buf, size_t size);
+void afs_get_address_preferences_rcu(struct afs_net *net, struct afs_addr_list *alist);
+void afs_get_address_preferences(struct afs_net *net, struct afs_addr_list *alist);
/*
* callback.c
*/
extern void afs_invalidate_mmap_work(struct work_struct *);
-extern void afs_server_init_callback_work(struct work_struct *work);
extern void afs_init_callback_state(struct afs_server *);
extern void __afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
extern void afs_break_callback(struct afs_vnode *, enum afs_cb_break_reason);
@@ -988,13 +1022,15 @@ extern void afs_break_callbacks(struct afs_server *, size_t, struct afs_callback
static inline unsigned int afs_calc_vnode_cb_break(struct afs_vnode *vnode)
{
- return vnode->cb_break + vnode->cb_v_break;
+ return vnode->cb_break + vnode->cb_ro_snapshot + vnode->cb_scrub;
}
static inline bool afs_cb_is_broken(unsigned int cb_break,
const struct afs_vnode *vnode)
{
- return cb_break != (vnode->cb_break + vnode->volume->cb_v_break);
+ return cb_break != (vnode->cb_break +
+ atomic_read(&vnode->volume->cb_ro_snapshot) +
+ atomic_read(&vnode->volume->cb_scrub));
}
/*
@@ -1073,7 +1109,6 @@ extern int afs_release(struct inode *, struct file *);
extern int afs_fetch_data(struct afs_vnode *, struct afs_read *);
extern struct afs_read *afs_alloc_read(gfp_t);
extern void afs_put_read(struct afs_read *);
-extern int afs_write_inode(struct inode *, struct writeback_control *);
static inline struct afs_read *afs_get_read(struct afs_read *req)
{
@@ -1110,15 +1145,16 @@ extern void afs_fs_get_volume_status(struct afs_operation *);
extern void afs_fs_set_lock(struct afs_operation *);
extern void afs_fs_extend_lock(struct afs_operation *);
extern void afs_fs_release_lock(struct afs_operation *);
-extern int afs_fs_give_up_all_callbacks(struct afs_net *, struct afs_server *,
- struct afs_addr_cursor *, struct key *);
-extern bool afs_fs_get_capabilities(struct afs_net *, struct afs_server *,
- struct afs_addr_cursor *, struct key *);
+int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server,
+ struct afs_address *addr, struct key *key);
+bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server,
+ struct afs_endpoint_state *estate, unsigned int addr_index,
+ struct key *key);
extern void afs_fs_inline_bulk_status(struct afs_operation *);
struct afs_acl {
u32 size;
- u8 data[];
+ u8 data[] __counted_by(size);
};
extern void afs_fs_fetch_acl(struct afs_operation *);
@@ -1133,11 +1169,6 @@ extern bool afs_begin_vnode_operation(struct afs_operation *);
extern void afs_wait_for_operation(struct afs_operation *);
extern int afs_do_sync_operation(struct afs_operation *);
-static inline void afs_op_nomem(struct afs_operation *op)
-{
- op->error = -ENOMEM;
-}
-
static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n,
struct afs_vnode *vnode)
{
@@ -1154,12 +1185,17 @@ static inline void afs_op_set_fid(struct afs_operation *op, unsigned int n,
/*
* fs_probe.c
*/
+struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *estate,
+ enum afs_estate_trace where);
+void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where);
extern void afs_fileserver_probe_result(struct afs_call *);
-extern void afs_fs_probe_fileserver(struct afs_net *, struct afs_server *, struct key *, bool);
-extern int afs_wait_for_fs_probes(struct afs_server_list *, unsigned long);
+void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server,
+ struct afs_addr_list *new_addrs, struct key *key);
+int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr);
extern void afs_probe_fileserver(struct afs_net *, struct afs_server *);
extern void afs_fs_probe_dispatcher(struct work_struct *);
-extern int afs_wait_for_one_fs_probe(struct afs_server *, bool);
+int afs_wait_for_one_fs_probe(struct afs_server *server, struct afs_endpoint_state *estate,
+ unsigned long exclude, bool is_intr);
extern void afs_fs_probe_cleanup(struct afs_net *);
/*
@@ -1173,9 +1209,6 @@ extern int afs_ilookup5_test_by_fid(struct inode *, void *);
extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool);
extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *);
extern struct inode *afs_root_iget(struct super_block *, struct key *);
-extern bool afs_check_validity(struct afs_vnode *);
-extern int afs_validate(struct afs_vnode *, struct key *);
-bool afs_pagecache_valid(struct afs_vnode *);
extern int afs_getattr(struct mnt_idmap *idmap, const struct path *,
struct kstat *, u32, unsigned int);
extern int afs_setattr(struct mnt_idmap *idmap, struct dentry *, struct iattr *);
@@ -1231,6 +1264,31 @@ static inline void __afs_stat(atomic_t *s)
extern int afs_abort_to_error(u32);
extern void afs_prioritise_error(struct afs_error *, int, u32);
+static inline void afs_op_nomem(struct afs_operation *op)
+{
+ op->cumul_error.error = -ENOMEM;
+}
+
+static inline int afs_op_error(const struct afs_operation *op)
+{
+ return op->cumul_error.error;
+}
+
+static inline s32 afs_op_abort_code(const struct afs_operation *op)
+{
+ return op->cumul_error.abort_code;
+}
+
+static inline int afs_op_set_error(struct afs_operation *op, int error)
+{
+ return op->cumul_error.error = error;
+}
+
+static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code)
+{
+ afs_prioritise_error(&op->cumul_error, error, abort_code);
+}
+
/*
* mntpt.c
*/
@@ -1261,6 +1319,7 @@ static inline void afs_put_sysnames(struct afs_sysnames *sysnames) {}
/*
* rotate.c
*/
+void afs_clear_server_states(struct afs_operation *op);
extern bool afs_select_fileserver(struct afs_operation *);
extern void afs_dump_edestaddrreq(const struct afs_operation *);
@@ -1273,8 +1332,8 @@ extern int __net_init afs_open_socket(struct afs_net *);
extern void __net_exit afs_close_socket(struct afs_net *);
extern void afs_charge_preallocation(struct work_struct *);
extern void afs_put_call(struct afs_call *);
-extern void afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t);
-extern long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *);
+void afs_make_call(struct afs_call *call, gfp_t gfp);
+void afs_wait_for_call_to_complete(struct afs_call *call);
extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
const struct afs_call_type *,
size_t, size_t);
@@ -1287,12 +1346,16 @@ extern int afs_protocol_error(struct afs_call *, enum afs_eproto_cause);
static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *call,
gfp_t gfp)
{
- op->call = call;
- op->type = call->type;
- call->op = op;
- call->key = op->key;
- call->intr = !(op->flags & AFS_OPERATION_UNINTR);
- afs_make_call(&op->ac, call, gfp);
+ struct afs_addr_list *alist = op->estate->addresses;
+
+ op->call = call;
+ op->type = call->type;
+ call->op = op;
+ call->key = op->key;
+ call->intr = !(op->flags & AFS_OPERATION_UNINTR);
+ call->peer = rxrpc_kernel_get_peer(alist->addrs[op->addr_index].peer);
+ call->service_id = op->server->service_id;
+ afs_make_call(call, gfp);
}
static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size)
@@ -1401,8 +1464,7 @@ extern void __exit afs_clean_up_permit_cache(void);
*/
extern spinlock_t afs_server_peer_lock;
-extern struct afs_server *afs_find_server(struct afs_net *,
- const struct sockaddr_rxrpc *);
+extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *);
extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *);
extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32);
extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace);
@@ -1414,7 +1476,7 @@ extern void afs_manage_servers(struct work_struct *);
extern void afs_servers_timer(struct timer_list *);
extern void afs_fs_probe_timer(struct timer_list *);
extern void __net_exit afs_purge_servers(struct afs_net *);
-extern bool afs_check_server_record(struct afs_operation *, struct afs_server *);
+bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key);
static inline void afs_inc_servers_outstanding(struct afs_net *net)
{
@@ -1442,10 +1504,14 @@ static inline struct afs_server_list *afs_get_serverlist(struct afs_server_list
}
extern void afs_put_serverlist(struct afs_net *, struct afs_server_list *);
-extern struct afs_server_list *afs_alloc_server_list(struct afs_cell *, struct key *,
- struct afs_vldb_entry *,
- u8);
+struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
+ struct key *key,
+ struct afs_vldb_entry *vldb);
extern bool afs_annotate_server_list(struct afs_server_list *, struct afs_server_list *);
+void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist);
+void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist,
+ struct afs_server_list *old);
+void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist);
/*
* super.c
@@ -1454,13 +1520,24 @@ extern int __init afs_fs_init(void);
extern void afs_fs_exit(void);
/*
+ * validation.c
+ */
+bool afs_check_validity(const struct afs_vnode *vnode);
+int afs_update_volume_state(struct afs_operation *op);
+int afs_validate(struct afs_vnode *vnode, struct key *key);
+
+/*
* vlclient.c
*/
extern struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *,
const char *, int);
extern struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *, const uuid_t *);
-extern struct afs_call *afs_vl_get_capabilities(struct afs_net *, struct afs_addr_cursor *,
- struct key *, struct afs_vlserver *, unsigned int);
+struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
+ struct afs_addr_list *alist,
+ unsigned int addr_index,
+ struct key *key,
+ struct afs_vlserver *server,
+ unsigned int server_index);
extern struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *, const uuid_t *);
extern char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *);
@@ -1516,30 +1593,17 @@ extern int afs_activate_volume(struct afs_volume *);
extern void afs_deactivate_volume(struct afs_volume *);
bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason);
extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace);
-extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace);
+void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason);
extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *);
/*
* write.c
*/
-#ifdef CONFIG_AFS_FSCACHE
-bool afs_dirty_folio(struct address_space *, struct folio *);
-#else
-#define afs_dirty_folio filemap_dirty_folio
-#endif
-extern int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct page **pagep, void **fsdata);
-extern int afs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata);
-extern int afs_writepage(struct page *, struct writeback_control *);
extern int afs_writepages(struct address_space *, struct writeback_control *);
-extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *);
extern int afs_fsync(struct file *, loff_t, loff_t, int);
extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf);
extern void afs_prune_wb_keys(struct afs_vnode *);
-int afs_launder_folio(struct folio *);
+void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len);
/*
* xattr.c
@@ -1603,7 +1667,7 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
struct afs_vnode_param *dir_vp,
struct dentry *dentry)
{
- if (!op->error)
+ if (!op->cumul_error.error)
dentry->d_fsdata =
(void *)(unsigned long)dir_vp->scb.status.data_version;
}
diff --git a/fs/afs/main.c b/fs/afs/main.c
index 6425c81d07de..a14f6013e316 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -90,8 +90,7 @@ static int __net_init afs_net_init(struct net *net_ns)
INIT_LIST_HEAD(&net->fs_probe_slow);
INIT_HLIST_HEAD(&net->fs_proc);
- INIT_HLIST_HEAD(&net->fs_addresses4);
- INIT_HLIST_HEAD(&net->fs_addresses6);
+ INIT_HLIST_HEAD(&net->fs_addresses);
seqlock_init(&net->fs_addr_lock);
INIT_WORK(&net->fs_manager, afs_manage_servers);
@@ -156,6 +155,7 @@ static void __net_exit afs_net_exit(struct net *net_ns)
afs_close_socket(net);
afs_proc_cleanup(net);
afs_put_sysnames(net->sysnames);
+ kfree_rcu(rcu_access_pointer(net->address_prefs), rcu);
}
static struct pernet_operations afs_net_ops = {
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 805328ca5428..b8180bf2281f 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -116,6 +116,8 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
{
switch (error) {
case 0:
+ e->aborted = false;
+ e->error = 0;
return;
default:
if (e->error == -ETIMEDOUT ||
@@ -161,12 +163,16 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
if (e->responded)
return;
e->error = error;
+ e->aborted = false;
return;
case -ECONNABORTED:
- error = afs_abort_to_error(abort_code);
- fallthrough;
+ e->error = afs_abort_to_error(abort_code);
+ e->aborted = true;
+ e->responded = true;
+ return;
case -ENETRESET: /* Responded, but we seem to have changed address */
+ e->aborted = false;
e->responded = true;
e->error = error;
return;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 2a0c83d71565..15eab053af6d 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -147,6 +147,56 @@ inval:
}
/*
+ * Display the list of addr_prefs known to the namespace.
+ */
+static int afs_proc_addr_prefs_show(struct seq_file *m, void *v)
+{
+ struct afs_addr_preference_list *preflist;
+ struct afs_addr_preference *pref;
+ struct afs_net *net = afs_seq2net_single(m);
+ union {
+ struct sockaddr_in sin;
+ struct sockaddr_in6 sin6;
+ } addr;
+ unsigned int i;
+ char buf[44]; /* Maximum ipv6 + max subnet is 43 */
+
+ rcu_read_lock();
+ preflist = rcu_dereference(net->address_prefs);
+
+ if (!preflist) {
+ seq_puts(m, "NO PREFS\n");
+ goto out;
+ }
+
+ seq_printf(m, "PROT SUBNET PRIOR (v=%u n=%u/%u/%u)\n",
+ preflist->version, preflist->ipv6_off, preflist->nr, preflist->max_prefs);
+
+ memset(&addr, 0, sizeof(addr));
+
+ for (i = 0; i < preflist->nr; i++) {
+ pref = &preflist->prefs[i];
+
+ addr.sin.sin_family = pref->family;
+ if (pref->family == AF_INET) {
+ memcpy(&addr.sin.sin_addr, &pref->ipv4_addr,
+ sizeof(addr.sin.sin_addr));
+ snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin, pref->subnet_mask);
+ seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio);
+ } else {
+ memcpy(&addr.sin6.sin6_addr, &pref->ipv6_addr,
+ sizeof(addr.sin6.sin6_addr));
+ snprintf(buf, sizeof(buf), "%pISc/%u", &addr.sin6, pref->subnet_mask);
+ seq_printf(m, "UDP %-43.43s %5u\n", buf, pref->prio);
+ }
+ }
+
+out:
+ rcu_read_unlock();
+ return 0;
+}
+
+/*
* Display the name of the current workstation cell.
*/
static int afs_proc_rootcell_show(struct seq_file *m, void *v)
@@ -307,7 +357,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
for (i = 0; i < alist->nr_addrs; i++)
seq_printf(m, " %c %pISpc\n",
alist->preferred == i ? '>' : '-',
- &alist->addrs[i].transport);
+ rxrpc_kernel_remote_addr(alist->addrs[i].peer));
}
seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt);
seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n",
@@ -375,32 +425,45 @@ static const struct seq_operations afs_proc_cell_vlservers_ops = {
*/
static int afs_proc_servers_show(struct seq_file *m, void *v)
{
- struct afs_server *server;
+ struct afs_endpoint_state *estate;
struct afs_addr_list *alist;
+ struct afs_server *server;
+ unsigned long failed;
int i;
if (v == SEQ_START_TOKEN) {
- seq_puts(m, "UUID REF ACT\n");
+ seq_puts(m, "UUID REF ACT CELL\n");
return 0;
}
server = list_entry(v, struct afs_server, proc_link);
- alist = rcu_dereference(server->addresses);
- seq_printf(m, "%pU %3d %3d\n",
+ estate = rcu_dereference(server->endpoint_state);
+ alist = estate->addresses;
+ seq_printf(m, "%pU %3d %3d %s\n",
&server->uuid,
refcount_read(&server->ref),
- atomic_read(&server->active));
- seq_printf(m, " - info: fl=%lx rtt=%u brk=%x\n",
- server->flags, server->rtt, server->cb_s_break);
- seq_printf(m, " - probe: last=%d out=%d\n",
- (int)(jiffies - server->probed_at) / HZ,
- atomic_read(&server->probe_outstanding));
- seq_printf(m, " - ALIST v=%u rsp=%lx f=%lx\n",
- alist->version, alist->responded, alist->failed);
- for (i = 0; i < alist->nr_addrs; i++)
- seq_printf(m, " [%x] %pISpc%s\n",
- i, &alist->addrs[i].transport,
- alist->preferred == i ? "*" : "");
+ atomic_read(&server->active),
+ server->cell->name);
+ seq_printf(m, " - info: fl=%lx rtt=%u\n",
+ server->flags, server->rtt);
+ seq_printf(m, " - probe: last=%d\n",
+ (int)(jiffies - server->probed_at) / HZ);
+ failed = estate->failed_set;
+ seq_printf(m, " - ESTATE pq=%x np=%u rsp=%lx f=%lx\n",
+ estate->probe_seq, atomic_read(&estate->nr_probing),
+ estate->responsive_set, estate->failed_set);
+ seq_printf(m, " - ALIST v=%u ap=%u\n",
+ alist->version, alist->addr_pref_version);
+ for (i = 0; i < alist->nr_addrs; i++) {
+ const struct afs_address *addr = &alist->addrs[i];
+
+ seq_printf(m, " [%x] %pISpc%s rtt=%d err=%d p=%u\n",
+ i, rxrpc_kernel_remote_addr(addr->peer),
+ alist->preferred == i ? "*" :
+ test_bit(i, &failed) ? "!" : "",
+ rxrpc_kernel_get_srtt(addr->peer),
+ addr->last_error, addr->prio);
+ }
return 0;
}
@@ -681,7 +744,11 @@ int afs_proc_init(struct afs_net *net)
&afs_proc_sysname_ops,
afs_proc_sysname_write,
sizeof(struct seq_net_private),
- NULL))
+ NULL) ||
+ !proc_create_net_single_write("addr_prefs", 0644, p,
+ afs_proc_addr_prefs_show,
+ afs_proc_addr_prefs_write,
+ NULL))
goto error_tree;
net->proc_afs = p;
diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
index a840c3588ebb..700a27bc8c25 100644
--- a/fs/afs/rotate.c
+++ b/fs/afs/rotate.c
@@ -13,6 +13,19 @@
#include <linux/sched/signal.h>
#include "internal.h"
#include "afs_fs.h"
+#include "protocol_uae.h"
+
+void afs_clear_server_states(struct afs_operation *op)
+{
+ unsigned int i;
+
+ if (op->server_states) {
+ for (i = 0; i < op->server_list->nr_servers; i++)
+ afs_put_endpoint_state(op->server_states[i].endpoint_state,
+ afs_estate_trace_put_server_state);
+ kfree(op->server_states);
+ }
+}
/*
* Begin iteration through a server list, starting with the vnode's last used
@@ -25,14 +38,41 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
void *cb_server;
int i;
+ trace_afs_rotate(op, afs_rotate_trace_start, 0);
+
read_lock(&op->volume->servers_lock);
op->server_list = afs_get_serverlist(
rcu_dereference_protected(op->volume->servers,
lockdep_is_held(&op->volume->servers_lock)));
read_unlock(&op->volume->servers_lock);
- op->untried = (1UL << op->server_list->nr_servers) - 1;
- op->index = READ_ONCE(op->server_list->preferred);
+ op->server_states = kcalloc(op->server_list->nr_servers, sizeof(op->server_states[0]),
+ GFP_KERNEL);
+ if (!op->server_states) {
+ afs_op_nomem(op);
+ trace_afs_rotate(op, afs_rotate_trace_nomem, 0);
+ return false;
+ }
+
+ rcu_read_lock();
+ for (i = 0; i < op->server_list->nr_servers; i++) {
+ struct afs_endpoint_state *estate;
+ struct afs_server_state *s = &op->server_states[i];
+
+ server = op->server_list->servers[i].server;
+ estate = rcu_dereference(server->endpoint_state);
+ s->endpoint_state = afs_get_endpoint_state(estate,
+ afs_estate_trace_get_server_state);
+ s->probe_seq = estate->probe_seq;
+ s->untried_addrs = (1UL << estate->addresses->nr_addrs) - 1;
+ init_waitqueue_entry(&s->probe_waiter, current);
+ afs_get_address_preferences(op->net, estate->addresses);
+ }
+ rcu_read_unlock();
+
+
+ op->untried_servers = (1UL << op->server_list->nr_servers) - 1;
+ op->server_index = -1;
cb_server = vnode->cb_server;
if (cb_server) {
@@ -40,7 +80,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
for (i = 0; i < op->server_list->nr_servers; i++) {
server = op->server_list->servers[i].server;
if (server == cb_server) {
- op->index = i;
+ op->server_index = i;
goto found_interest;
}
}
@@ -50,7 +90,8 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
* and have to return an error.
*/
if (op->flags & AFS_OPERATION_CUR_ONLY) {
- op->error = -ESTALE;
+ afs_op_set_error(op, -ESTALE);
+ trace_afs_rotate(op, afs_rotate_trace_stale_lock, 0);
return false;
}
@@ -58,7 +99,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
write_seqlock(&vnode->cb_lock);
ASSERTCMP(cb_server, ==, vnode->cb_server);
vnode->cb_server = NULL;
- if (test_and_clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
+ if (atomic64_xchg(&vnode->cb_expires_at, AFS_NO_CB_PROMISE) != AFS_NO_CB_PROMISE)
vnode->cb_break++;
write_sequnlock(&vnode->cb_lock);
}
@@ -70,7 +111,7 @@ found_interest:
/*
* Post volume busy note.
*/
-static void afs_busy(struct afs_volume *volume, u32 abort_code)
+static void afs_busy(struct afs_operation *op, u32 abort_code)
{
const char *m;
@@ -81,7 +122,8 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code)
default: m = "busy"; break;
}
- pr_notice("kAFS: Volume %llu '%s' is %s\n", volume->vid, volume->name, m);
+ pr_notice("kAFS: Volume %llu '%s' on server %pU is %s\n",
+ op->volume->vid, op->volume->name, &op->server->uuid, m);
}
/*
@@ -89,10 +131,11 @@ static void afs_busy(struct afs_volume *volume, u32 abort_code)
*/
static bool afs_sleep_and_retry(struct afs_operation *op)
{
+ trace_afs_rotate(op, afs_rotate_trace_busy_sleep, 0);
if (!(op->flags & AFS_OPERATION_UNINTR)) {
msleep_interruptible(1000);
if (signal_pending(current)) {
- op->error = -ERESTARTSYS;
+ afs_op_set_error(op, -ERESTARTSYS);
return false;
}
} else {
@@ -111,62 +154,105 @@ bool afs_select_fileserver(struct afs_operation *op)
struct afs_addr_list *alist;
struct afs_server *server;
struct afs_vnode *vnode = op->file[0].vnode;
- struct afs_error e;
- u32 rtt;
- int error = op->ac.error, i;
+ unsigned long set, failed;
+ s32 abort_code = op->call_abort_code;
+ int best_prio = 0;
+ int error = op->call_error, addr_index, i, j;
+
+ op->nr_iterations++;
- _enter("%lx[%d],%lx[%d],%d,%d",
- op->untried, op->index,
- op->ac.tried, op->ac.index,
- error, op->ac.abort_code);
+ _enter("OP=%x+%x,%llx,%u{%lx},%u{%lx},%d,%d",
+ op->debug_id, op->nr_iterations, op->volume->vid,
+ op->server_index, op->untried_servers,
+ op->addr_index, op->addr_tried,
+ error, abort_code);
if (op->flags & AFS_OPERATION_STOP) {
+ trace_afs_rotate(op, afs_rotate_trace_stopped, 0);
_leave(" = f [stopped]");
return false;
}
- op->nr_iterations++;
-
- /* Evaluate the result of the previous operation, if there was one. */
- switch (error) {
- case SHRT_MAX:
+ if (op->nr_iterations == 0)
goto start;
+ WRITE_ONCE(op->estate->addresses->addrs[op->addr_index].last_error, error);
+ trace_afs_rotate(op, afs_rotate_trace_iter, op->call_error);
+
+ /* Evaluate the result of the previous operation, if there was one. */
+ switch (op->call_error) {
case 0:
+ clear_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags);
+ clear_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags);
+ op->cumul_error.responded = true;
+
+ /* We succeeded, but we may need to redo the op from another
+ * server if we're looking at a set of RO volumes where some of
+ * the servers have not yet been brought up to date lest we
+ * regress the data. We only switch to the new version once
+ * >=50% of the servers are updated.
+ */
+ error = afs_update_volume_state(op);
+ if (error != 0) {
+ if (error == 1) {
+ afs_sleep_and_retry(op);
+ goto restart_from_beginning;
+ }
+ afs_op_set_error(op, error);
+ goto failed;
+ }
+ fallthrough;
default:
/* Success or local failure. Stop. */
- op->error = error;
+ afs_op_set_error(op, error);
op->flags |= AFS_OPERATION_STOP;
+ trace_afs_rotate(op, afs_rotate_trace_stop, error);
_leave(" = f [okay/local %d]", error);
return false;
case -ECONNABORTED:
/* The far side rejected the operation on some grounds. This
* might involve the server being busy or the volume having been moved.
+ *
+ * Note that various V* errors should not be sent to a cache manager
+ * by a fileserver as they should be translated to more modern UAE*
+ * errors instead. IBM AFS and OpenAFS fileservers, however, do leak
+ * these abort codes.
*/
- switch (op->ac.abort_code) {
+ trace_afs_rotate(op, afs_rotate_trace_aborted, abort_code);
+ op->cumul_error.responded = true;
+ switch (abort_code) {
case VNOVOL:
/* This fileserver doesn't know about the volume.
* - May indicate that the VL is wrong - retry once and compare
* the results.
* - May indicate that the fileserver couldn't attach to the vol.
+ * - The volume might have been temporarily removed so that it can
+ * be replaced by a volume restore. "vos" might have ended one
+ * transaction and has yet to create the next.
+ * - The volume might not be blessed or might not be in-service
+ * (administrative action).
*/
if (op->flags & AFS_OPERATION_VNOVOL) {
- op->error = -EREMOTEIO;
+ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
goto next_server;
}
write_lock(&op->volume->servers_lock);
- op->server_list->vnovol_mask |= 1 << op->index;
+ op->server_list->vnovol_mask |= 1 << op->server_index;
write_unlock(&op->volume->servers_lock);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
error = afs_check_volume_status(op->volume, op);
- if (error < 0)
- goto failed_set_error;
+ if (error < 0) {
+ afs_op_set_error(op, error);
+ goto failed;
+ }
if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) {
- op->error = -ENOMEDIUM;
+ afs_op_set_error(op, -ENOMEDIUM);
goto failed;
}
@@ -174,7 +260,7 @@ bool afs_select_fileserver(struct afs_operation *op)
* it's the fileserver having trouble.
*/
if (rcu_access_pointer(op->volume->servers) == op->server_list) {
- op->error = -EREMOTEIO;
+ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
goto next_server;
}
@@ -183,50 +269,99 @@ bool afs_select_fileserver(struct afs_operation *op)
_leave(" = t [vnovol]");
return true;
- case VSALVAGE: /* TODO: Should this return an error or iterate? */
case VVOLEXISTS:
- case VNOSERVICE:
case VONLINE:
- case VDISKFULL:
- case VOVERQUOTA:
- op->error = afs_abort_to_error(op->ac.abort_code);
+ /* These should not be returned from the fileserver. */
+ pr_warn("Fileserver returned unexpected abort %d\n",
+ abort_code);
+ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
goto next_server;
+ case VNOSERVICE:
+ /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver
+ * if the volume was neither in-service nor administratively
+ * blessed. All usage was replaced by VNOVOL because AFS 3.1 and
+ * earlier cache managers did not handle VNOSERVICE and assumed
+ * it was the client OSes errno 105.
+ *
+ * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the
+ * fileserver idle dead time error which was sent in place of
+ * RX_CALL_TIMEOUT (-3). The error was intended to be sent if the
+ * fileserver took too long to send a reply to the client.
+ * RX_CALL_TIMEOUT would have caused the cache manager to mark the
+ * server down whereas VNOSERVICE since AFS 3.2 would cause cache
+ * manager to temporarily (up to 15 minutes) mark the volume
+ * instance as unusable.
+ *
+ * The idle dead logic resulted in cache inconsistency since a
+ * state changing call that the cache manager assumed was dead
+ * could still be processed to completion by the fileserver. This
+ * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer
+ * returned. However, many 1.4.8 through 1.6.24 fileservers are
+ * still in existence.
+ *
+ * AuriStorFS fileservers have never returned VNOSERVICE.
+ *
+ * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT.
+ */
+ case RX_CALL_TIMEOUT:
+ afs_op_accumulate_error(op, -ETIMEDOUT, abort_code);
+ goto next_server;
+
+ case VSALVAGING: /* This error should not be leaked to cache managers
+ * but is from OpenAFS demand attach fileservers.
+ * It should be treated as an alias for VOFFLINE.
+ */
+ case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */
case VOFFLINE:
- if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
- afs_busy(op->volume, op->ac.abort_code);
- clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+ /* The volume is in use by the volserver or another volume utility
+ * for an operation that might alter the contents. The volume is
+ * expected to come back but it might take a long time (could be
+ * days).
+ */
+ if (!test_and_set_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags)) {
+ afs_busy(op, abort_code);
+ clear_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags);
}
if (op->flags & AFS_OPERATION_NO_VSLEEP) {
- op->error = -EADV;
- goto failed;
- }
- if (op->flags & AFS_OPERATION_CUR_ONLY) {
- op->error = -ESTALE;
+ afs_op_set_error(op, -EADV);
goto failed;
}
goto busy;
- case VSALVAGING:
- case VRESTARTING:
+ case VRESTARTING: /* The fileserver is either shutting down or starting up. */
case VBUSY:
- /* Retry after going round all the servers unless we
- * have a file lock we need to maintain.
+ /* The volume is in use by the volserver or another volume
+ * utility for an operation that is not expected to alter the
+ * contents of the volume. VBUSY does not need to be returned
+ * for a ROVOL or BACKVOL bound to an ITBusy volserver
+ * transaction. The fileserver is permitted to continue serving
+ * content from ROVOLs and BACKVOLs during an ITBusy transaction
+ * because the content will not change. However, many fileserver
+ * releases do return VBUSY for ROVOL and BACKVOL instances under
+ * many circumstances.
+ *
+ * Retry after going round all the servers unless we have a file
+ * lock we need to maintain.
*/
if (op->flags & AFS_OPERATION_NO_VSLEEP) {
- op->error = -EBUSY;
+ afs_op_set_error(op, -EBUSY);
goto failed;
}
- if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) {
- afs_busy(op->volume, op->ac.abort_code);
- clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
+ if (!test_and_set_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags)) {
+ afs_busy(op, abort_code);
+ clear_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags);
}
busy:
if (op->flags & AFS_OPERATION_CUR_ONLY) {
if (!afs_sleep_and_retry(op))
goto failed;
- /* Retry with same server & address */
+ /* Retry with same server & address */
_leave(" = t [vbusy]");
return true;
}
@@ -243,7 +378,7 @@ bool afs_select_fileserver(struct afs_operation *op)
* honour, just in case someone sets up a loop.
*/
if (op->flags & AFS_OPERATION_VMOVED) {
- op->error = -EREMOTEIO;
+ afs_op_set_error(op, -EREMOTEIO);
goto failed;
}
op->flags |= AFS_OPERATION_VMOVED;
@@ -251,8 +386,10 @@ bool afs_select_fileserver(struct afs_operation *op)
set_bit(AFS_VOLUME_WAIT, &op->volume->flags);
set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
error = afs_check_volume_status(op->volume, op);
- if (error < 0)
- goto failed_set_error;
+ if (error < 0) {
+ afs_op_set_error(op, error);
+ goto failed;
+ }
/* If the server list didn't change, then the VLDB is
* out of sync with the fileservers. This is hopefully
@@ -264,22 +401,50 @@ bool afs_select_fileserver(struct afs_operation *op)
* TODO: Retry a few times with sleeps.
*/
if (rcu_access_pointer(op->volume->servers) == op->server_list) {
- op->error = -ENOMEDIUM;
+ afs_op_accumulate_error(op, -ENOMEDIUM, abort_code);
goto failed;
}
goto restart_from_beginning;
+ case UAEIO:
+ case VIO:
+ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
+ if (op->volume->type != AFSVL_RWVOL)
+ goto next_server;
+ goto failed;
+
+ case VDISKFULL:
+ case UAENOSPC:
+ /* The partition is full. Only applies to RWVOLs.
+ * Translate locally and return ENOSPC.
+ * No replicas to failover to.
+ */
+ afs_op_set_error(op, -ENOSPC);
+ goto failed_but_online;
+
+ case VOVERQUOTA:
+ case UAEDQUOT:
+ /* Volume is full. Only applies to RWVOLs.
+ * Translate locally and return EDQUOT.
+ * No replicas to failover to.
+ */
+ afs_op_set_error(op, -EDQUOT);
+ goto failed_but_online;
+
default:
- clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
- clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
- op->error = afs_abort_to_error(op->ac.abort_code);
+ afs_op_accumulate_error(op, error, abort_code);
+ failed_but_online:
+ clear_bit(AFS_SE_VOLUME_OFFLINE,
+ &op->server_list->servers[op->server_index].flags);
+ clear_bit(AFS_SE_VOLUME_BUSY,
+ &op->server_list->servers[op->server_index].flags);
goto failed;
}
case -ETIMEDOUT:
case -ETIME:
- if (op->error != -EDESTADDRREQ)
+ if (afs_op_error(op) != -EDESTADDRREQ)
goto iterate_address;
fallthrough;
case -ERFKILL:
@@ -289,7 +454,7 @@ bool afs_select_fileserver(struct afs_operation *op)
case -EHOSTDOWN:
case -ECONNREFUSED:
_debug("no conn");
- op->error = error;
+ afs_op_accumulate_error(op, error, 0);
goto iterate_address;
case -ENETRESET:
@@ -298,24 +463,31 @@ bool afs_select_fileserver(struct afs_operation *op)
fallthrough;
case -ECONNRESET:
_debug("call reset");
- op->error = error;
+ afs_op_set_error(op, error);
goto failed;
}
restart_from_beginning:
+ trace_afs_rotate(op, afs_rotate_trace_restart, 0);
_debug("restart");
- afs_end_cursor(&op->ac);
+ op->estate = NULL;
op->server = NULL;
+ afs_clear_server_states(op);
+ op->server_states = NULL;
afs_put_serverlist(op->net, op->server_list);
op->server_list = NULL;
start:
_debug("start");
+ ASSERTCMP(op->estate, ==, NULL);
/* See if we need to do an update of the volume record. Note that the
* volume may have moved or even have been deleted.
*/
error = afs_check_volume_status(op->volume, op);
- if (error < 0)
- goto failed_set_error;
+ trace_afs_rotate(op, afs_rotate_trace_check_vol_status, error);
+ if (error < 0) {
+ afs_op_set_error(op, error);
+ goto failed;
+ }
if (!afs_start_fs_iteration(op, vnode))
goto failed;
@@ -323,52 +495,83 @@ start:
_debug("__ VOL %llx __", op->volume->vid);
pick_server:
- _debug("pick [%lx]", op->untried);
+ _debug("pick [%lx]", op->untried_servers);
+ ASSERTCMP(op->estate, ==, NULL);
- error = afs_wait_for_fs_probes(op->server_list, op->untried);
- if (error < 0)
- goto failed_set_error;
+ error = afs_wait_for_fs_probes(op, op->server_states,
+ !(op->flags & AFS_OPERATION_UNINTR));
+ switch (error) {
+ case 0: /* No untried responsive servers and no outstanding probes */
+ trace_afs_rotate(op, afs_rotate_trace_probe_none, 0);
+ goto no_more_servers;
+ case 1: /* Got a response */
+ trace_afs_rotate(op, afs_rotate_trace_probe_response, 0);
+ break;
+ case 2: /* Probe data superseded */
+ trace_afs_rotate(op, afs_rotate_trace_probe_superseded, 0);
+ goto restart_from_beginning;
+ default:
+ trace_afs_rotate(op, afs_rotate_trace_probe_error, error);
+ afs_op_set_error(op, error);
+ goto failed;
+ }
- /* Pick the untried server with the lowest RTT. If we have outstanding
- * callbacks, we stick with the server we're already using if we can.
+ /* Pick the untried server with the highest priority untried endpoint.
+ * If we have outstanding callbacks, we stick with the server we're
+ * already using if we can.
*/
if (op->server) {
- _debug("server %u", op->index);
- if (test_bit(op->index, &op->untried))
+ _debug("server %u", op->server_index);
+ if (test_bit(op->server_index, &op->untried_servers))
goto selected_server;
op->server = NULL;
_debug("no server");
}
- op->index = -1;
- rtt = U32_MAX;
+ rcu_read_lock();
+ op->server_index = -1;
+ best_prio = -1;
for (i = 0; i < op->server_list->nr_servers; i++) {
- struct afs_server *s = op->server_list->servers[i].server;
+ struct afs_endpoint_state *es;
+ struct afs_server_entry *se = &op->server_list->servers[i];
+ struct afs_addr_list *sal;
+ struct afs_server *s = se->server;
- if (!test_bit(i, &op->untried) ||
+ if (!test_bit(i, &op->untried_servers) ||
+ test_bit(AFS_SE_EXCLUDED, &se->flags) ||
!test_bit(AFS_SERVER_FL_RESPONDING, &s->flags))
continue;
- if (s->probe.rtt < rtt) {
- op->index = i;
- rtt = s->probe.rtt;
+ es = op->server_states->endpoint_state;
+ sal = es->addresses;
+
+ afs_get_address_preferences_rcu(op->net, sal);
+ for (j = 0; j < sal->nr_addrs; j++) {
+ if (!sal->addrs[j].peer)
+ continue;
+ if (sal->addrs[j].prio > best_prio) {
+ op->server_index = i;
+ best_prio = sal->addrs[j].prio;
+ }
}
}
+ rcu_read_unlock();
- if (op->index == -1)
+ if (op->server_index == -1)
goto no_more_servers;
selected_server:
- _debug("use %d", op->index);
- __clear_bit(op->index, &op->untried);
+ trace_afs_rotate(op, afs_rotate_trace_selected_server, best_prio);
+ _debug("use %d prio %u", op->server_index, best_prio);
+ __clear_bit(op->server_index, &op->untried_servers);
/* We're starting on a different fileserver from the list. We need to
* check it, create a callback intercept, find its address list and
* probe its capabilities before we use it.
*/
- ASSERTCMP(op->ac.alist, ==, NULL);
- server = op->server_list->servers[op->index].server;
+ ASSERTCMP(op->estate, ==, NULL);
+ server = op->server_list->servers[op->server_index].server;
- if (!afs_check_server_record(op, server))
+ if (!afs_check_server_record(op, server, op->key))
goto failed;
_debug("USING SERVER: %pU", &server->uuid);
@@ -377,58 +580,73 @@ selected_server:
op->server = server;
if (vnode->cb_server != server) {
vnode->cb_server = server;
- vnode->cb_s_break = server->cb_s_break;
- vnode->cb_fs_s_break = atomic_read(&server->cell->fs_s_break);
- vnode->cb_v_break = vnode->volume->cb_v_break;
- clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags);
+ vnode->cb_v_check = atomic_read(&vnode->volume->cb_v_break);
+ atomic64_set(&vnode->cb_expires_at, AFS_NO_CB_PROMISE);
}
- read_lock(&server->fs_lock);
- alist = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&server->fs_lock));
- afs_get_addrlist(alist);
- read_unlock(&server->fs_lock);
-
retry_server:
- memset(&op->ac, 0, sizeof(op->ac));
-
- if (!op->ac.alist)
- op->ac.alist = alist;
- else
- afs_put_addrlist(alist);
-
- op->ac.index = -1;
+ op->addr_tried = 0;
+ op->addr_index = -1;
iterate_address:
- ASSERT(op->ac.alist);
/* Iterate over the current server's address list to try and find an
* address on which it will respond to us.
*/
- if (!afs_iterate_addresses(&op->ac))
- goto out_of_addresses;
+ op->estate = op->server_states[op->server_index].endpoint_state;
+ set = READ_ONCE(op->estate->responsive_set);
+ failed = READ_ONCE(op->estate->failed_set);
+ _debug("iterate ES=%x rs=%lx fs=%lx", op->estate->probe_seq, set, failed);
+ set &= ~(failed | op->addr_tried);
+ trace_afs_rotate(op, afs_rotate_trace_iterate_addr, set);
+ if (!set)
+ goto wait_for_more_probe_results;
+
+ alist = op->estate->addresses;
+ for (i = 0; i < alist->nr_addrs; i++) {
+ if (alist->addrs[i].prio > best_prio) {
+ addr_index = i;
+ best_prio = alist->addrs[i].prio;
+ }
+ }
- _debug("address [%u] %u/%u %pISp",
- op->index, op->ac.index, op->ac.alist->nr_addrs,
- &op->ac.alist->addrs[op->ac.index].transport);
+ addr_index = READ_ONCE(alist->preferred);
+ if (!test_bit(addr_index, &set))
+ addr_index = __ffs(set);
+ op->addr_index = addr_index;
+ set_bit(addr_index, &op->addr_tried);
+
+ op->volsync.creation = TIME64_MIN;
+ op->volsync.update = TIME64_MIN;
+ op->call_responded = false;
+ _debug("address [%u] %u/%u %pISp",
+ op->server_index, addr_index, alist->nr_addrs,
+ rxrpc_kernel_remote_addr(alist->addrs[op->addr_index].peer));
_leave(" = t");
return true;
-out_of_addresses:
+wait_for_more_probe_results:
+ error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
+ !(op->flags & AFS_OPERATION_UNINTR));
+ if (!error)
+ goto iterate_address;
+
/* We've now had a failure to respond on all of a server's addresses -
* immediately probe them again and consider retrying the server.
*/
+ trace_afs_rotate(op, afs_rotate_trace_probe_fileserver, 0);
afs_probe_fileserver(op->net, op->server);
if (op->flags & AFS_OPERATION_RETRY_SERVER) {
- alist = op->ac.alist;
- error = afs_wait_for_one_fs_probe(
- op->server, !(op->flags & AFS_OPERATION_UNINTR));
+ error = afs_wait_for_one_fs_probe(op->server, op->estate, op->addr_tried,
+ !(op->flags & AFS_OPERATION_UNINTR));
switch (error) {
case 0:
op->flags &= ~AFS_OPERATION_RETRY_SERVER;
+ trace_afs_rotate(op, afs_rotate_trace_retry_server, 0);
goto retry_server;
case -ERESTARTSYS:
- goto failed_set_error;
+ afs_op_set_error(op, error);
+ goto failed;
case -ETIME:
case -EDESTADDRREQ:
goto next_server;
@@ -436,34 +654,51 @@ out_of_addresses:
}
next_server:
+ trace_afs_rotate(op, afs_rotate_trace_next_server, 0);
_debug("next");
- afs_end_cursor(&op->ac);
+ ASSERT(op->estate);
+ alist = op->estate->addresses;
+ if (op->call_responded &&
+ op->addr_index != READ_ONCE(alist->preferred) &&
+ test_bit(alist->preferred, &op->addr_tried))
+ WRITE_ONCE(alist->preferred, op->addr_index);
+ op->estate = NULL;
goto pick_server;
no_more_servers:
/* That's all the servers poked to no good effect. Try again if some
* of them were busy.
*/
- if (op->flags & AFS_OPERATION_VBUSY)
+ trace_afs_rotate(op, afs_rotate_trace_no_more_servers, 0);
+ if (op->flags & AFS_OPERATION_VBUSY) {
+ afs_sleep_and_retry(op);
+ op->flags &= ~AFS_OPERATION_VBUSY;
goto restart_from_beginning;
+ }
- e.error = -EDESTADDRREQ;
- e.responded = false;
+ rcu_read_lock();
for (i = 0; i < op->server_list->nr_servers; i++) {
- struct afs_server *s = op->server_list->servers[i].server;
+ struct afs_endpoint_state *estate;
- afs_prioritise_error(&e, READ_ONCE(s->probe.error),
- s->probe.abort_code);
+ estate = op->server_states->endpoint_state;
+ error = READ_ONCE(estate->error);
+ if (error < 0)
+ afs_op_accumulate_error(op, error, estate->abort_code);
}
+ rcu_read_unlock();
- error = e.error;
-
-failed_set_error:
- op->error = error;
failed:
+ trace_afs_rotate(op, afs_rotate_trace_failed, 0);
op->flags |= AFS_OPERATION_STOP;
- afs_end_cursor(&op->ac);
- _leave(" = f [failed %d]", op->error);
+ if (op->estate) {
+ alist = op->estate->addresses;
+ if (op->call_responded &&
+ op->addr_index != READ_ONCE(alist->preferred) &&
+ test_bit(alist->preferred, &op->addr_tried))
+ WRITE_ONCE(alist->preferred, op->addr_index);
+ op->estate = NULL;
+ }
+ _leave(" = f [failed %d]", afs_op_error(op));
return false;
}
@@ -482,37 +717,40 @@ void afs_dump_edestaddrreq(const struct afs_operation *op)
rcu_read_lock();
pr_notice("EDESTADDR occurred\n");
- pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n",
+ pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n",
op->file[0].cb_break_before,
- op->file[1].cb_break_before, op->flags, op->error);
- pr_notice("FC: ut=%lx ix=%d ni=%u\n",
- op->untried, op->index, op->nr_iterations);
+ op->file[1].cb_break_before, op->flags, op->cumul_error.error);
+ pr_notice("OP: ut=%lx ix=%d ni=%u\n",
+ op->untried_servers, op->server_index, op->nr_iterations);
+ pr_notice("OP: call er=%d ac=%d r=%u\n",
+ op->call_error, op->call_abort_code, op->call_responded);
if (op->server_list) {
const struct afs_server_list *sl = op->server_list;
- pr_notice("FC: SL nr=%u pr=%u vnov=%hx\n",
- sl->nr_servers, sl->preferred, sl->vnovol_mask);
+
+ pr_notice("FC: SL nr=%u vnov=%hx\n",
+ sl->nr_servers, sl->vnovol_mask);
for (i = 0; i < sl->nr_servers; i++) {
const struct afs_server *s = sl->servers[i].server;
+ const struct afs_endpoint_state *e =
+ rcu_dereference(s->endpoint_state);
+ const struct afs_addr_list *a = e->addresses;
+
pr_notice("FC: server fl=%lx av=%u %pU\n",
s->flags, s->addr_version, &s->uuid);
- if (s->addresses) {
- const struct afs_addr_list *a =
- rcu_dereference(s->addresses);
+ pr_notice("FC: - pq=%x R=%lx F=%lx\n",
+ e->probe_seq, e->responsive_set, e->failed_set);
+ if (a) {
pr_notice("FC: - av=%u nr=%u/%u/%u pr=%u\n",
a->version,
a->nr_ipv4, a->nr_addrs, a->max_addrs,
a->preferred);
- pr_notice("FC: - R=%lx F=%lx\n",
- a->responded, a->failed);
- if (a == op->ac.alist)
+ if (a == e->addresses)
pr_notice("FC: - current\n");
}
}
}
- pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
- op->ac.tried, op->ac.index, op->ac.abort_code, op->ac.error,
- op->ac.responded, op->ac.nr_iterations);
+ pr_notice("AC: t=%lx ax=%d\n", op->addr_tried, op->addr_index);
rcu_read_unlock();
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index d642d06a453b..c453428f3c8b 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -178,6 +178,8 @@ void afs_put_call(struct afs_call *call)
ASSERT(!work_pending(&call->async_work));
ASSERT(call->type->name != NULL);
+ rxrpc_kernel_put_peer(call->peer);
+
if (call->rxcall) {
rxrpc_kernel_shutdown_call(net->socket, call->rxcall);
rxrpc_kernel_put_call(net->socket, call->rxcall);
@@ -187,7 +189,6 @@ void afs_put_call(struct afs_call *call)
call->type->destructor(call);
afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call);
- afs_put_addrlist(call->alist);
kfree(call->request);
trace_afs_call(call->debug_id, afs_call_trace_free, 0, o,
@@ -294,9 +295,8 @@ static void afs_notify_end_request_tx(struct sock *sock,
* Initiate a call and synchronously queue up the parameters for dispatch. Any
* error is stored into the call struct, which the caller must check for.
*/
-void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+void afs_make_call(struct afs_call *call, gfp_t gfp)
{
- struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index];
struct rxrpc_call *rxcall;
struct msghdr msg;
struct kvec iov[1];
@@ -304,7 +304,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
s64 tx_total_len;
int ret;
- _enter(",{%pISp},", &srx->transport);
+ _enter(",{%pISp+%u},", rxrpc_kernel_remote_addr(call->peer), call->service_id);
ASSERT(call->type != NULL);
ASSERT(call->type->name != NULL);
@@ -313,8 +313,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
call, call->type->name, key_serial(call->key),
atomic_read(&call->net->nr_outstanding_calls));
- call->addr_ix = ac->index;
- call->alist = afs_get_addrlist(ac->alist);
+ trace_afs_make_call(call);
/* Work out the length we're going to transmit. This is awkward for
* calls such as FS.StoreData where there's an extra injection of data
@@ -333,7 +332,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
}
/* create a call */
- rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
+ rxcall = rxrpc_kernel_begin_call(call->net->socket, call->peer, call->key,
(unsigned long)call,
tx_total_len,
call->max_lifespan,
@@ -341,6 +340,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
(call->async ?
afs_wake_up_async_call :
afs_wake_up_call_waiter),
+ call->service_id,
call->upgrade,
(call->intr ? RXRPC_PREINTERRUPTIBLE :
RXRPC_UNINTERRUPTIBLE),
@@ -390,7 +390,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
/* Note that at this point, we may have received the reply or an abort
* - and an asynchronous call may already have completed.
*
- * afs_wait_for_call_to_complete(call, ac)
+ * afs_wait_for_call_to_complete(call)
* must be called to synchronously clean up.
*/
return;
@@ -406,8 +406,7 @@ error_do_abort:
rxrpc_kernel_recv_data(call->net->socket, rxcall,
&msg.msg_iter, &len, false,
&call->abort_code, &call->service_id);
- ac->abort_code = call->abort_code;
- ac->responded = true;
+ call->responded = true;
}
call->error = ret;
trace_afs_call_done(call);
@@ -427,7 +426,7 @@ error_kill_call:
afs_set_call_complete(call, ret, 0);
}
- ac->error = ret;
+ call->error = ret;
call->state = AFS_CALL_COMPLETE;
_leave(" = %d", ret);
}
@@ -461,7 +460,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort)
max = m + 1;
pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n",
msg, call->type->name,
- &call->alist->addrs[call->addr_ix].transport);
+ rxrpc_kernel_remote_addr(call->peer));
}
}
@@ -508,6 +507,7 @@ static void afs_deliver_to_call(struct afs_call *call)
ret = -EBADMSG;
switch (ret) {
case 0:
+ call->responded = true;
afs_queue_call_work(call);
if (state == AFS_CALL_CL_PROC_REPLY) {
if (call->op)
@@ -522,9 +522,11 @@ static void afs_deliver_to_call(struct afs_call *call)
goto out;
case -ECONNABORTED:
ASSERTCMP(state, ==, AFS_CALL_COMPLETE);
+ call->responded = true;
afs_log_error(call, call->abort_code);
goto done;
case -ENOTSUPP:
+ call->responded = true;
abort_code = RXGEN_OPCODE;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, ret,
@@ -571,50 +573,46 @@ call_complete:
}
/*
- * Wait synchronously for a call to complete and clean up the call struct.
+ * Wait synchronously for a call to complete.
*/
-long afs_wait_for_call_to_complete(struct afs_call *call,
- struct afs_addr_cursor *ac)
+void afs_wait_for_call_to_complete(struct afs_call *call)
{
- long ret;
bool rxrpc_complete = false;
- DECLARE_WAITQUEUE(myself, current);
-
_enter("");
- ret = call->error;
- if (ret < 0)
- goto out;
+ if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
+ DECLARE_WAITQUEUE(myself, current);
+
+ add_wait_queue(&call->waitq, &myself);
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+
+ /* deliver any messages that are in the queue */
+ if (!afs_check_call_state(call, AFS_CALL_COMPLETE) &&
+ call->need_attention) {
+ call->need_attention = false;
+ __set_current_state(TASK_RUNNING);
+ afs_deliver_to_call(call);
+ continue;
+ }
- add_wait_queue(&call->waitq, &myself);
- for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- /* deliver any messages that are in the queue */
- if (!afs_check_call_state(call, AFS_CALL_COMPLETE) &&
- call->need_attention) {
- call->need_attention = false;
- __set_current_state(TASK_RUNNING);
- afs_deliver_to_call(call);
- continue;
- }
+ if (afs_check_call_state(call, AFS_CALL_COMPLETE))
+ break;
- if (afs_check_call_state(call, AFS_CALL_COMPLETE))
- break;
+ if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) {
+ /* rxrpc terminated the call. */
+ rxrpc_complete = true;
+ break;
+ }
- if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) {
- /* rxrpc terminated the call. */
- rxrpc_complete = true;
- break;
+ schedule();
}
- schedule();
+ remove_wait_queue(&call->waitq, &myself);
+ __set_current_state(TASK_RUNNING);
}
- remove_wait_queue(&call->waitq, &myself);
- __set_current_state(TASK_RUNNING);
-
if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
if (rxrpc_complete) {
afs_set_call_complete(call, call->error, call->abort_code);
@@ -627,29 +625,6 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
afs_set_call_complete(call, -EINTR, 0);
}
}
-
- spin_lock_bh(&call->state_lock);
- ac->abort_code = call->abort_code;
- ac->error = call->error;
- spin_unlock_bh(&call->state_lock);
-
- ret = ac->error;
- switch (ret) {
- case 0:
- ret = call->ret0;
- call->ret0 = 0;
-
- fallthrough;
- case -ECONNABORTED:
- ac->responded = true;
- break;
- }
-
-out:
- _debug("call complete");
- afs_put_call(call);
- _leave(" = %p", (void *)ret);
- return ret;
}
/*
diff --git a/fs/afs/server.c b/fs/afs/server.c
index b5237206eac3..038f9d0ae3af 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -21,13 +21,13 @@ static void __afs_put_server(struct afs_net *, struct afs_server *);
/*
* Find a server by one of its addresses.
*/
-struct afs_server *afs_find_server(struct afs_net *net,
- const struct sockaddr_rxrpc *srx)
+struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer)
{
+ const struct afs_endpoint_state *estate;
const struct afs_addr_list *alist;
struct afs_server *server = NULL;
unsigned int i;
- int seq = 0, diff;
+ int seq = 1;
rcu_read_lock();
@@ -35,39 +35,15 @@ struct afs_server *afs_find_server(struct afs_net *net,
if (server)
afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq);
server = NULL;
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
- if (srx->transport.family == AF_INET6) {
- const struct sockaddr_in6 *a = &srx->transport.sin6, *b;
- hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
- alist = rcu_dereference(server->addresses);
- for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
- b = &alist->addrs[i].transport.sin6;
- diff = ((u16 __force)a->sin6_port -
- (u16 __force)b->sin6_port);
- if (diff == 0)
- diff = memcmp(&a->sin6_addr,
- &b->sin6_addr,
- sizeof(struct in6_addr));
- if (diff == 0)
- goto found;
- }
- }
- } else {
- const struct sockaddr_in *a = &srx->transport.sin, *b;
- hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) {
- alist = rcu_dereference(server->addresses);
- for (i = 0; i < alist->nr_ipv4; i++) {
- b = &alist->addrs[i].transport.sin;
- diff = ((u16 __force)a->sin_port -
- (u16 __force)b->sin_port);
- if (diff == 0)
- diff = ((u32 __force)a->sin_addr.s_addr -
- (u32 __force)b->sin_addr.s_addr);
- if (diff == 0)
- goto found;
- }
- }
+ hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) {
+ estate = rcu_dereference(server->endpoint_state);
+ alist = estate->addresses;
+ for (i = 0; i < alist->nr_addrs; i++)
+ if (alist->addrs[i].peer == peer)
+ goto found;
}
server = NULL;
@@ -90,7 +66,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
{
struct afs_server *server = NULL;
struct rb_node *p;
- int diff, seq = 0;
+ int diff, seq = 1;
_enter("%pU", uuid);
@@ -102,7 +78,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
if (server)
afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq);
server = NULL;
-
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
read_seqbegin_or_lock(&net->fs_lock, &seq);
p = net->fs_servers.rb_node;
@@ -137,6 +113,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
static struct afs_server *afs_install_server(struct afs_cell *cell,
struct afs_server *candidate)
{
+ const struct afs_endpoint_state *estate;
const struct afs_addr_list *alist;
struct afs_server *server, *next;
struct afs_net *net = cell->net;
@@ -188,8 +165,9 @@ static struct afs_server *afs_install_server(struct afs_cell *cell,
added_dup:
write_seqlock(&net->fs_addr_lock);
- alist = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&net->fs_addr_lock.lock));
+ estate = rcu_dereference_protected(server->endpoint_state,
+ lockdep_is_held(&net->fs_addr_lock.lock));
+ alist = estate->addresses;
/* Secondly, if the server has any IPv4 and/or IPv6 addresses, install
* it in the IPv4 and/or IPv6 reverse-map lists.
@@ -199,10 +177,8 @@ added_dup:
* bit, but anything we might want to do gets messy and memory
* intensive.
*/
- if (alist->nr_ipv4 > 0)
- hlist_add_head_rcu(&server->addr4_link, &net->fs_addresses4);
- if (alist->nr_addrs > alist->nr_ipv4)
- hlist_add_head_rcu(&server->addr6_link, &net->fs_addresses6);
+ if (alist->nr_addrs > 0)
+ hlist_add_head_rcu(&server->addr_link, &net->fs_addresses);
write_sequnlock(&net->fs_addr_lock);
@@ -219,6 +195,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
const uuid_t *uuid,
struct afs_addr_list *alist)
{
+ struct afs_endpoint_state *estate;
struct afs_server *server;
struct afs_net *net = cell->net;
@@ -228,25 +205,41 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
if (!server)
goto enomem;
+ estate = kzalloc(sizeof(struct afs_endpoint_state), GFP_KERNEL);
+ if (!estate)
+ goto enomem_server;
+
refcount_set(&server->ref, 1);
atomic_set(&server->active, 1);
server->debug_id = atomic_inc_return(&afs_server_debug_id);
- RCU_INIT_POINTER(server->addresses, alist);
server->addr_version = alist->version;
server->uuid = *uuid;
rwlock_init(&server->fs_lock);
- INIT_WORK(&server->initcb_work, afs_server_init_callback_work);
+ INIT_LIST_HEAD(&server->volumes);
init_waitqueue_head(&server->probe_wq);
INIT_LIST_HEAD(&server->probe_link);
spin_lock_init(&server->probe_lock);
server->cell = cell;
server->rtt = UINT_MAX;
+ server->service_id = FS_SERVICE;
+
+ server->probe_counter = 1;
+ server->probed_at = jiffies - LONG_MAX / 2;
+ refcount_set(&estate->ref, 1);
+ estate->addresses = alist;
+ estate->server_id = server->debug_id;
+ estate->probe_seq = 1;
+ rcu_assign_pointer(server->endpoint_state, estate);
afs_inc_servers_outstanding(net);
trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc);
+ trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref),
+ afs_estate_trace_alloc_server);
_leave(" = %p", server);
return server;
+enomem_server:
+ kfree(server);
enomem:
_leave(" = NULL [nomem]");
return NULL;
@@ -301,20 +294,20 @@ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key,
candidate = afs_alloc_server(cell, uuid, alist);
if (!candidate) {
- afs_put_addrlist(alist);
+ afs_put_addrlist(alist, afs_alist_trace_put_server_oom);
return ERR_PTR(-ENOMEM);
}
server = afs_install_server(cell, candidate);
if (server != candidate) {
- afs_put_addrlist(alist);
+ afs_put_addrlist(alist, afs_alist_trace_put_server_dup);
kfree(candidate);
} else {
/* Immediately dispatch an asynchronous probe to each interface
* on the fileserver. This will make sure the repeat-probing
* service is started.
*/
- afs_fs_probe_fileserver(cell->net, server, key, true);
+ afs_fs_probe_fileserver(cell->net, server, alist, key);
}
return server;
@@ -447,7 +440,8 @@ static void afs_server_rcu(struct rcu_head *rcu)
trace_afs_server(server->debug_id, refcount_read(&server->ref),
atomic_read(&server->active), afs_server_trace_free);
- afs_put_addrlist(rcu_access_pointer(server->addresses));
+ afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state),
+ afs_estate_trace_put_server);
kfree(server);
}
@@ -459,14 +453,10 @@ static void __afs_put_server(struct afs_net *net, struct afs_server *server)
static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server)
{
- struct afs_addr_list *alist = rcu_access_pointer(server->addresses);
- struct afs_addr_cursor ac = {
- .alist = alist,
- .index = alist->preferred,
- .error = 0,
- };
-
- afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+ struct afs_endpoint_state *estate = rcu_access_pointer(server->endpoint_state);
+ struct afs_addr_list *alist = estate->addresses;
+
+ afs_fs_give_up_all_callbacks(net, server, &alist->addrs[alist->preferred], NULL);
}
/*
@@ -477,7 +467,6 @@ static void afs_destroy_server(struct afs_net *net, struct afs_server *server)
if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags))
afs_give_up_callbacks(net, server);
- flush_work(&server->initcb_work);
afs_put_server(net, server, afs_server_trace_destroy);
}
@@ -520,10 +509,8 @@ static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list)
list_del(&server->probe_link);
hlist_del_rcu(&server->proc_link);
- if (!hlist_unhashed(&server->addr4_link))
- hlist_del_rcu(&server->addr4_link);
- if (!hlist_unhashed(&server->addr6_link))
- hlist_del_rcu(&server->addr6_link);
+ if (!hlist_unhashed(&server->addr_link))
+ hlist_del_rcu(&server->addr_link);
}
write_sequnlock(&net->fs_lock);
@@ -636,9 +623,12 @@ void afs_purge_servers(struct afs_net *net)
* Get an update for a server's address list.
*/
static noinline bool afs_update_server_record(struct afs_operation *op,
- struct afs_server *server)
+ struct afs_server *server,
+ struct key *key)
{
- struct afs_addr_list *alist, *discard;
+ struct afs_endpoint_state *estate;
+ struct afs_addr_list *alist;
+ bool has_addrs;
_enter("");
@@ -648,29 +638,27 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid);
if (IS_ERR(alist)) {
+ rcu_read_lock();
+ estate = rcu_dereference(server->endpoint_state);
+ has_addrs = estate->addresses;
+ rcu_read_unlock();
+
if ((PTR_ERR(alist) == -ERESTARTSYS ||
PTR_ERR(alist) == -EINTR) &&
(op->flags & AFS_OPERATION_UNINTR) &&
- server->addresses) {
+ has_addrs) {
_leave(" = t [intr]");
return true;
}
- op->error = PTR_ERR(alist);
- _leave(" = f [%d]", op->error);
+ afs_op_set_error(op, PTR_ERR(alist));
+ _leave(" = f [%d]", afs_op_error(op));
return false;
}
- discard = alist;
- if (server->addr_version != alist->version) {
- write_lock(&server->fs_lock);
- discard = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&server->fs_lock));
- rcu_assign_pointer(server->addresses, alist);
- server->addr_version = alist->version;
- write_unlock(&server->fs_lock);
- }
+ if (server->addr_version != alist->version)
+ afs_fs_probe_fileserver(op->net, server, alist, key);
- afs_put_addrlist(discard);
+ afs_put_addrlist(alist, afs_alist_trace_put_server_update);
_leave(" = t");
return true;
}
@@ -678,7 +666,8 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
/*
* See if a server's address list needs updating.
*/
-bool afs_check_server_record(struct afs_operation *op, struct afs_server *server)
+bool afs_check_server_record(struct afs_operation *op, struct afs_server *server,
+ struct key *key)
{
bool success;
int ret, retries = 0;
@@ -698,7 +687,7 @@ retry:
update:
if (!test_and_set_bit_lock(AFS_SERVER_FL_UPDATING, &server->flags)) {
clear_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags);
- success = afs_update_server_record(op, server);
+ success = afs_update_server_record(op, server, key);
clear_bit_unlock(AFS_SERVER_FL_UPDATING, &server->flags);
wake_up_bit(&server->flags, AFS_SERVER_FL_UPDATING);
_leave(" = %d", success);
@@ -710,7 +699,7 @@ wait:
(op->flags & AFS_OPERATION_UNINTR) ?
TASK_UNINTERRUPTIBLE : TASK_INTERRUPTIBLE);
if (ret == -ERESTARTSYS) {
- op->error = ret;
+ afs_op_set_error(op, ret);
_leave(" = f [intr]");
return false;
}
diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c
index b59896b1de0a..7e7e567a7f8a 100644
--- a/fs/afs/server_list.c
+++ b/fs/afs/server_list.c
@@ -24,35 +24,62 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist)
/*
* Build a server list from a VLDB record.
*/
-struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
+struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume,
struct key *key,
- struct afs_vldb_entry *vldb,
- u8 type_mask)
+ struct afs_vldb_entry *vldb)
{
struct afs_server_list *slist;
struct afs_server *server;
- int ret = -ENOMEM, nr_servers = 0, i, j;
-
- for (i = 0; i < vldb->nr_servers; i++)
- if (vldb->fs_mask[i] & type_mask)
- nr_servers++;
+ unsigned int type_mask = 1 << volume->type;
+ bool use_newrepsites = false;
+ int ret = -ENOMEM, nr_servers = 0, newrep = 0, i, j, usable = 0;
+
+ /* Work out if we're going to restrict to NEWREPSITE-marked servers or
+ * not. If at least one site is marked as NEWREPSITE, then it's likely
+ * that "vos release" is busy updating RO sites. We cut over from one
+ * to the other when >=50% of the sites have been updated. Sites that
+ * are in the process of being updated are marked DONTUSE.
+ */
+ for (i = 0; i < vldb->nr_servers; i++) {
+ if (!(vldb->fs_mask[i] & type_mask))
+ continue;
+ nr_servers++;
+ if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE)
+ continue;
+ usable++;
+ if (vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE)
+ newrep++;
+ }
slist = kzalloc(struct_size(slist, servers, nr_servers), GFP_KERNEL);
if (!slist)
goto error;
+ if (newrep) {
+ if (newrep < usable / 2) {
+ slist->ro_replicating = AFS_RO_REPLICATING_USE_OLD;
+ } else {
+ slist->ro_replicating = AFS_RO_REPLICATING_USE_NEW;
+ use_newrepsites = true;
+ }
+ }
+
refcount_set(&slist->usage, 1);
rwlock_init(&slist->lock);
- for (i = 0; i < AFS_MAXTYPES; i++)
- slist->vids[i] = vldb->vid[i];
-
/* Make sure a records exists for each server in the list. */
for (i = 0; i < vldb->nr_servers; i++) {
+ unsigned long se_flags = 0;
+ bool newrepsite = vldb->vlsf_flags[i] & AFS_VLSF_NEWREPSITE;
+
if (!(vldb->fs_mask[i] & type_mask))
continue;
+ if (vldb->vlsf_flags[i] & AFS_VLSF_DONTUSE)
+ __set_bit(AFS_SE_EXCLUDED, &se_flags);
+ if (newrep && (newrepsite ^ use_newrepsites))
+ __set_bit(AFS_SE_EXCLUDED, &se_flags);
- server = afs_lookup_server(cell, key, &vldb->fs_server[i],
+ server = afs_lookup_server(volume->cell, key, &vldb->fs_server[i],
vldb->addr_version[i]);
if (IS_ERR(server)) {
ret = PTR_ERR(server);
@@ -70,7 +97,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
break;
if (j < slist->nr_servers) {
if (slist->servers[j].server == server) {
- afs_put_server(cell->net, server,
+ afs_put_server(volume->cell->net, server,
afs_server_trace_put_slist_isort);
continue;
}
@@ -81,6 +108,9 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
}
slist->servers[j].server = server;
+ slist->servers[j].volume = volume;
+ slist->servers[j].flags = se_flags;
+ slist->servers[j].cb_expires_at = AFS_NO_CB_PROMISE;
slist->nr_servers++;
}
@@ -92,7 +122,7 @@ struct afs_server_list *afs_alloc_server_list(struct afs_cell *cell,
return slist;
error_2:
- afs_put_serverlist(cell->net, slist);
+ afs_put_serverlist(volume->cell->net, slist);
error:
return ERR_PTR(ret);
}
@@ -103,27 +133,117 @@ error:
bool afs_annotate_server_list(struct afs_server_list *new,
struct afs_server_list *old)
{
- struct afs_server *cur;
- int i, j;
+ unsigned long mask = 1UL << AFS_SE_EXCLUDED;
+ int i;
- if (old->nr_servers != new->nr_servers)
+ if (old->nr_servers != new->nr_servers ||
+ old->ro_replicating != new->ro_replicating)
goto changed;
- for (i = 0; i < old->nr_servers; i++)
+ for (i = 0; i < old->nr_servers; i++) {
if (old->servers[i].server != new->servers[i].server)
goto changed;
-
+ if ((old->servers[i].flags & mask) != (new->servers[i].flags & mask))
+ goto changed;
+ }
return false;
-
changed:
- /* Maintain the same preferred server as before if possible. */
- cur = old->servers[old->preferred].server;
- for (j = 0; j < new->nr_servers; j++) {
- if (new->servers[j].server == cur) {
- new->preferred = j;
- break;
+ return true;
+}
+
+/*
+ * Attach a volume to the servers it is going to use.
+ */
+void afs_attach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *slist)
+{
+ struct afs_server_entry *se, *pe;
+ struct afs_server *server;
+ struct list_head *p;
+ unsigned int i;
+
+ down_write(&volume->cell->vs_lock);
+
+ for (i = 0; i < slist->nr_servers; i++) {
+ se = &slist->servers[i];
+ server = se->server;
+
+ list_for_each(p, &server->volumes) {
+ pe = list_entry(p, struct afs_server_entry, slink);
+ if (volume->vid <= pe->volume->vid)
+ break;
}
+ list_add_tail(&se->slink, p);
}
- return true;
+ slist->attached = true;
+ up_write(&volume->cell->vs_lock);
+}
+
+/*
+ * Reattach a volume to the servers it is going to use when server list is
+ * replaced. We try to switch the attachment points to avoid rewalking the
+ * lists.
+ */
+void afs_reattach_volume_to_servers(struct afs_volume *volume, struct afs_server_list *new,
+ struct afs_server_list *old)
+{
+ unsigned int n = 0, o = 0;
+
+ down_write(&volume->cell->vs_lock);
+
+ while (n < new->nr_servers || o < old->nr_servers) {
+ struct afs_server_entry *pn = n < new->nr_servers ? &new->servers[n] : NULL;
+ struct afs_server_entry *po = o < old->nr_servers ? &old->servers[o] : NULL;
+ struct afs_server_entry *s;
+ struct list_head *p;
+ int diff;
+
+ if (pn && po && pn->server == po->server) {
+ pn->cb_expires_at = po->cb_expires_at;
+ list_replace(&po->slink, &pn->slink);
+ n++;
+ o++;
+ continue;
+ }
+
+ if (pn && po)
+ diff = memcmp(&pn->server->uuid, &po->server->uuid,
+ sizeof(pn->server->uuid));
+ else
+ diff = pn ? -1 : 1;
+
+ if (diff < 0) {
+ list_for_each(p, &pn->server->volumes) {
+ s = list_entry(p, struct afs_server_entry, slink);
+ if (volume->vid <= s->volume->vid)
+ break;
+ }
+ list_add_tail(&pn->slink, p);
+ n++;
+ } else {
+ list_del(&po->slink);
+ o++;
+ }
+ }
+
+ up_write(&volume->cell->vs_lock);
+}
+
+/*
+ * Detach a volume from the servers it has been using.
+ */
+void afs_detach_volume_from_servers(struct afs_volume *volume, struct afs_server_list *slist)
+{
+ unsigned int i;
+
+ if (!slist->attached)
+ return;
+
+ down_write(&volume->cell->vs_lock);
+
+ for (i = 0; i < slist->nr_servers; i++)
+ list_del(&slist->servers[i].slink);
+
+ slist->attached = false;
+ up_write(&volume->cell->vs_lock);
}
diff --git a/fs/afs/super.c b/fs/afs/super.c
index a01a0fb2cdbb..f3ba1c3e72f5 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -55,7 +55,7 @@ int afs_net_id;
static const struct super_operations afs_super_ops = {
.statfs = afs_statfs,
.alloc_inode = afs_alloc_inode,
- .write_inode = afs_write_inode,
+ .write_inode = netfs_unpin_writeback,
.drop_inode = afs_drop_inode,
.destroy_inode = afs_destroy_inode,
.free_inode = afs_free_inode,
@@ -381,8 +381,7 @@ static int afs_validate_fc(struct fs_context *fc)
ctx->key = key;
if (ctx->volume) {
- afs_put_volume(ctx->net, ctx->volume,
- afs_volume_trace_put_validate_fc);
+ afs_put_volume(ctx->volume, afs_volume_trace_put_validate_fc);
ctx->volume = NULL;
}
@@ -529,7 +528,7 @@ static void afs_destroy_sbi(struct afs_super_info *as)
{
if (as) {
struct afs_net *net = afs_net(as->net_ns);
- afs_put_volume(net, as->volume, afs_volume_trace_put_destroy_sbi);
+ afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi);
afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi);
put_net(as->net_ns);
kfree(as);
@@ -615,7 +614,7 @@ static void afs_free_fc(struct fs_context *fc)
struct afs_fs_context *ctx = fc->fs_private;
afs_destroy_sbi(fc->s_fs_info);
- afs_put_volume(ctx->net, ctx->volume, afs_volume_trace_put_free_fc);
+ afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc);
afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc);
key_put(ctx->key);
kfree(ctx);
diff --git a/fs/afs/validation.c b/fs/afs/validation.c
new file mode 100644
index 000000000000..46b37f2cce7d
--- /dev/null
+++ b/fs/afs/validation.c
@@ -0,0 +1,473 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* vnode and volume validity verification.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include "internal.h"
+
+/*
+ * Data validation is managed through a number of mechanisms from the server:
+ *
+ * (1) On first contact with a server (such as if it has just been rebooted),
+ * the server sends us a CB.InitCallBackState* request.
+ *
+ * (2) On a RW volume, in response to certain vnode (inode)-accessing RPC
+ * calls, the server maintains a time-limited per-vnode promise that it
+ * will send us a CB.CallBack request if a third party alters the vnodes
+ * accessed.
+ *
+ * Note that a vnode-level callbacks may also be sent for other reasons,
+ * such as filelock release.
+ *
+ * (3) On a RO (or Backup) volume, in response to certain vnode-accessing RPC
+ * calls, each server maintains a time-limited per-volume promise that it
+ * will send us a CB.CallBack request if the RO volume is updated to a
+ * snapshot of the RW volume ("vos release"). This is an atomic event
+ * that cuts over all instances of the RO volume across multiple servers
+ * simultaneously.
+ *
+ * Note that a volume-level callbacks may also be sent for other reasons,
+ * such as the volumeserver taking over control of the volume from the
+ * fileserver.
+ *
+ * Note also that each server maintains an independent time limit on an
+ * independent callback.
+ *
+ * (4) Certain RPC calls include a volume information record "VolSync" in
+ * their reply. This contains a creation date for the volume that should
+ * remain unchanged for a RW volume (but will be changed if the volume is
+ * restored from backup) or will be bumped to the time of snapshotting
+ * when a RO volume is released.
+ *
+ * In order to track this events, the following are provided:
+ *
+ * ->cb_v_break. A counter of events that might mean that the contents of
+ * a volume have been altered since we last checked a vnode.
+ *
+ * ->cb_v_check. A counter of the number of events that we've sent a
+ * query to the server for. Everything's up to date if this equals
+ * cb_v_break.
+ *
+ * ->cb_scrub. A counter of the number of regression events for which we
+ * have to completely wipe the cache.
+ *
+ * ->cb_ro_snapshot. A counter of the number of times that we've
+ * recognised that a RO volume has been updated.
+ *
+ * ->cb_break. A counter of events that might mean that the contents of a
+ * vnode have been altered.
+ *
+ * ->cb_expires_at. The time at which the callback promise expires or
+ * AFS_NO_CB_PROMISE if we have no promise.
+ *
+ * The way we manage things is:
+ *
+ * (1) When a volume-level CB.CallBack occurs, we increment ->cb_v_break on
+ * the volume and reset ->cb_expires_at (ie. set AFS_NO_CB_PROMISE) on the
+ * volume and volume's server record.
+ *
+ * (2) When a CB.InitCallBackState occurs, we treat this as a volume-level
+ * callback break on all the volumes that have been using that volume
+ * (ie. increment ->cb_v_break and reset ->cb_expires_at).
+ *
+ * (3) When a vnode-level CB.CallBack occurs, we increment ->cb_break on the
+ * vnode and reset its ->cb_expires_at. If the vnode is mmapped, we also
+ * dispatch a work item to unmap all PTEs to the vnode's pagecache to
+ * force reentry to the filesystem for revalidation.
+ *
+ * (4) When entering the filesystem, we call afs_validate() to check the
+ * validity of a vnode. This first checks to see if ->cb_v_check and
+ * ->cb_v_break match, and if they don't, we lock volume->cb_check_lock
+ * exclusively and perform an FS.FetchStatus on the vnode.
+ *
+ * After checking the volume, we check the vnode. If there's a mismatch
+ * between the volume counters and the vnode's mirrors of those counters,
+ * we lock vnode->validate_lock and issue an FS.FetchStatus on the vnode.
+ *
+ * (5) When the reply from FS.FetchStatus arrives, the VolSync record is
+ * parsed:
+ *
+ * (A) If the Creation timestamp has changed on a RW volume or regressed
+ * on a RO volume, we try to increment ->cb_scrub; if it advances on a
+ * RO volume, we assume "vos release" happened and try to increment
+ * ->cb_ro_snapshot.
+ *
+ * (B) If the Update timestamp has regressed, we try to increment
+ * ->cb_scrub.
+ *
+ * Note that in both of these cases, we only do the increment if we can
+ * cmpxchg the value of the timestamp from the value we noted before the
+ * op. This tries to prevent parallel ops from fighting one another.
+ *
+ * volume->cb_v_check is then set to ->cb_v_break.
+ *
+ * (6) The AFSCallBack record included in the FS.FetchStatus reply is also
+ * parsed and used to set the promise in ->cb_expires_at for the vnode,
+ * the volume and the volume's server record.
+ *
+ * (7) If ->cb_scrub is seen to have advanced, we invalidate the pagecache for
+ * the vnode.
+ */
+
+/*
+ * Check the validity of a vnode/inode and its parent volume.
+ */
+bool afs_check_validity(const struct afs_vnode *vnode)
+{
+ const struct afs_volume *volume = vnode->volume;
+ time64_t deadline = ktime_get_real_seconds() + 10;
+
+ if (atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
+ atomic64_read(&vnode->cb_expires_at) <= deadline ||
+ volume->cb_expires_at <= deadline ||
+ vnode->cb_ro_snapshot != atomic_read(&volume->cb_ro_snapshot) ||
+ vnode->cb_scrub != atomic_read(&volume->cb_scrub) ||
+ test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+ _debug("inval");
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * See if the server we've just talked to is currently excluded.
+ */
+static bool __afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
+{
+ const struct afs_server_entry *se;
+ const struct afs_server_list *slist;
+ bool is_excluded = true;
+ int i;
+
+ rcu_read_lock();
+
+ slist = rcu_dereference(volume->servers);
+ for (i = 0; i < slist->nr_servers; i++) {
+ se = &slist->servers[i];
+ if (op->server == se->server) {
+ is_excluded = test_bit(AFS_SE_EXCLUDED, &se->flags);
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ return is_excluded;
+}
+
+/*
+ * Update the volume's server list when the creation time changes and see if
+ * the server we've just talked to is currently excluded.
+ */
+static int afs_is_server_excluded(struct afs_operation *op, struct afs_volume *volume)
+{
+ int ret;
+
+ if (__afs_is_server_excluded(op, volume))
+ return 1;
+
+ set_bit(AFS_VOLUME_NEEDS_UPDATE, &volume->flags);
+ ret = afs_check_volume_status(op->volume, op);
+ if (ret < 0)
+ return ret;
+
+ return __afs_is_server_excluded(op, volume);
+}
+
+/*
+ * Handle a change to the volume creation time in the VolSync record.
+ */
+static int afs_update_volume_creation_time(struct afs_operation *op, struct afs_volume *volume)
+{
+ unsigned int snap;
+ time64_t cur = volume->creation_time;
+ time64_t old = op->pre_volsync.creation;
+ time64_t new = op->volsync.creation;
+ int ret;
+
+ _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
+
+ if (cur == TIME64_MIN) {
+ volume->creation_time = new;
+ return 0;
+ }
+
+ if (new == cur)
+ return 0;
+
+ /* Try to advance the creation timestamp from what we had before the
+ * operation to what we got back from the server. This should
+ * hopefully ensure that in a race between multiple operations only one
+ * of them will do this.
+ */
+ if (cur != old)
+ return 0;
+
+ /* If the creation time changes in an unexpected way, we need to scrub
+ * our caches. For a RW vol, this will only change if the volume is
+ * restored from a backup; for a RO/Backup vol, this will advance when
+ * the volume is updated to a new snapshot (eg. "vos release").
+ */
+ if (volume->type == AFSVL_RWVOL)
+ goto regressed;
+ if (volume->type == AFSVL_BACKVOL) {
+ if (new < old)
+ goto regressed;
+ goto advance;
+ }
+
+ /* We have an RO volume, we need to query the VL server and look at the
+ * server flags to see if RW->RO replication is in progress.
+ */
+ ret = afs_is_server_excluded(op, volume);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ snap = atomic_read(&volume->cb_ro_snapshot);
+ trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_volume_excluded);
+ return ret;
+ }
+
+advance:
+ snap = atomic_inc_return(&volume->cb_ro_snapshot);
+ trace_afs_cb_v_break(volume->vid, snap, afs_cb_break_for_vos_release);
+ volume->creation_time = new;
+ return 0;
+
+regressed:
+ atomic_inc(&volume->cb_scrub);
+ trace_afs_cb_v_break(volume->vid, 0, afs_cb_break_for_creation_regress);
+ volume->creation_time = new;
+ return 0;
+}
+
+/*
+ * Handle a change to the volume update time in the VolSync record.
+ */
+static void afs_update_volume_update_time(struct afs_operation *op, struct afs_volume *volume)
+{
+ enum afs_cb_break_reason reason = afs_cb_break_no_break;
+ time64_t cur = volume->update_time;
+ time64_t old = op->pre_volsync.update;
+ time64_t new = op->volsync.update;
+
+ _enter("%llx,%llx,%llx->%llx", volume->vid, cur, old, new);
+
+ if (cur == TIME64_MIN) {
+ volume->update_time = new;
+ return;
+ }
+
+ if (new == cur)
+ return;
+
+ /* If the volume update time changes in an unexpected way, we need to
+ * scrub our caches. For a RW vol, this will advance on every
+ * modification op; for a RO/Backup vol, this will advance when the
+ * volume is updated to a new snapshot (eg. "vos release").
+ */
+ if (new < old)
+ reason = afs_cb_break_for_update_regress;
+
+ /* Try to advance the update timestamp from what we had before the
+ * operation to what we got back from the server. This should
+ * hopefully ensure that in a race between multiple operations only one
+ * of them will do this.
+ */
+ if (cur == old) {
+ if (reason == afs_cb_break_for_update_regress) {
+ atomic_inc(&volume->cb_scrub);
+ trace_afs_cb_v_break(volume->vid, 0, reason);
+ }
+ volume->update_time = new;
+ }
+}
+
+static int afs_update_volume_times(struct afs_operation *op, struct afs_volume *volume)
+{
+ int ret = 0;
+
+ if (likely(op->volsync.creation == volume->creation_time &&
+ op->volsync.update == volume->update_time))
+ return 0;
+
+ mutex_lock(&volume->volsync_lock);
+ if (op->volsync.creation != volume->creation_time) {
+ ret = afs_update_volume_creation_time(op, volume);
+ if (ret < 0)
+ goto out;
+ }
+ if (op->volsync.update != volume->update_time)
+ afs_update_volume_update_time(op, volume);
+out:
+ mutex_unlock(&volume->volsync_lock);
+ return ret;
+}
+
+/*
+ * Update the state of a volume, including recording the expiration time of the
+ * callback promise. Returns 1 to redo the operation from the start.
+ */
+int afs_update_volume_state(struct afs_operation *op)
+{
+ struct afs_server_list *slist = op->server_list;
+ struct afs_server_entry *se = &slist->servers[op->server_index];
+ struct afs_callback *cb = &op->file[0].scb.callback;
+ struct afs_volume *volume = op->volume;
+ unsigned int cb_v_break = atomic_read(&volume->cb_v_break);
+ unsigned int cb_v_check = atomic_read(&volume->cb_v_check);
+ int ret;
+
+ _enter("%llx", op->volume->vid);
+
+ if (op->volsync.creation != TIME64_MIN || op->volsync.update != TIME64_MIN) {
+ ret = afs_update_volume_times(op, volume);
+ if (ret != 0) {
+ _leave(" = %d", ret);
+ return ret;
+ }
+ }
+
+ if (op->cb_v_break == cb_v_break &&
+ (op->file[0].scb.have_cb || op->file[1].scb.have_cb)) {
+ time64_t expires_at = cb->expires_at;
+
+ if (!op->file[0].scb.have_cb)
+ expires_at = op->file[1].scb.callback.expires_at;
+
+ se->cb_expires_at = expires_at;
+ volume->cb_expires_at = expires_at;
+ }
+ if (cb_v_check < op->cb_v_break)
+ atomic_cmpxchg(&volume->cb_v_check, cb_v_check, op->cb_v_break);
+ return 0;
+}
+
+/*
+ * mark the data attached to an inode as obsolete due to a write on the server
+ * - might also want to ditch all the outstanding writes and dirty pages
+ */
+static void afs_zap_data(struct afs_vnode *vnode)
+{
+ _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode);
+
+ afs_invalidate_cache(vnode, 0);
+
+ /* nuke all the non-dirty pages that aren't locked, mapped or being
+ * written back in a regular file and completely discard the pages in a
+ * directory or symlink */
+ if (S_ISREG(vnode->netfs.inode.i_mode))
+ invalidate_remote_inode(&vnode->netfs.inode);
+ else
+ invalidate_inode_pages2(vnode->netfs.inode.i_mapping);
+}
+
+/*
+ * validate a vnode/inode
+ * - there are several things we need to check
+ * - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
+ * symlink)
+ * - parent dir metadata changed (security changes)
+ * - dentry data changed (write, truncate)
+ * - dentry metadata changed (security changes)
+ */
+int afs_validate(struct afs_vnode *vnode, struct key *key)
+{
+ struct afs_volume *volume = vnode->volume;
+ unsigned int cb_ro_snapshot, cb_scrub;
+ time64_t deadline = ktime_get_real_seconds() + 10;
+ bool zap = false, locked_vol = false;
+ int ret;
+
+ _enter("{v={%llx:%llu} fl=%lx},%x",
+ vnode->fid.vid, vnode->fid.vnode, vnode->flags,
+ key_serial(key));
+
+ if (afs_check_validity(vnode))
+ return 0;
+
+ ret = down_write_killable(&vnode->validate_lock);
+ if (ret < 0)
+ goto error;
+
+ /* Validate a volume after the v_break has changed or the volume
+ * callback expired. We only want to do this once per volume per
+ * v_break change. The actual work will be done when parsing the
+ * status fetch reply.
+ */
+ if (volume->cb_expires_at <= deadline ||
+ atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break)) {
+ ret = mutex_lock_interruptible(&volume->cb_check_lock);
+ if (ret < 0)
+ goto error_unlock;
+ locked_vol = true;
+ }
+
+ cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
+ cb_scrub = atomic_read(&volume->cb_scrub);
+ if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
+ vnode->cb_scrub != cb_scrub)
+ unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false);
+
+ if (vnode->cb_ro_snapshot != cb_ro_snapshot ||
+ vnode->cb_scrub != cb_scrub ||
+ volume->cb_expires_at <= deadline ||
+ atomic_read(&volume->cb_v_check) != atomic_read(&volume->cb_v_break) ||
+ atomic64_read(&vnode->cb_expires_at) <= deadline
+ ) {
+ ret = afs_fetch_status(vnode, key, false, NULL);
+ if (ret < 0) {
+ if (ret == -ENOENT) {
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ ret = -ESTALE;
+ }
+ goto error_unlock;
+ }
+
+ _debug("new promise [fl=%lx]", vnode->flags);
+ }
+
+ /* We can drop the volume lock now as. */
+ if (locked_vol) {
+ mutex_unlock(&volume->cb_check_lock);
+ locked_vol = false;
+ }
+
+ cb_ro_snapshot = atomic_read(&volume->cb_ro_snapshot);
+ cb_scrub = atomic_read(&volume->cb_scrub);
+ _debug("vnode inval %x==%x %x==%x",
+ vnode->cb_ro_snapshot, cb_ro_snapshot,
+ vnode->cb_scrub, cb_scrub);
+ if (vnode->cb_scrub != cb_scrub)
+ zap = true;
+ vnode->cb_ro_snapshot = cb_ro_snapshot;
+ vnode->cb_scrub = cb_scrub;
+
+ if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+ _debug("file already deleted");
+ ret = -ESTALE;
+ goto error_unlock;
+ }
+
+ /* if the vnode's data version number changed then its contents are
+ * different */
+ zap |= test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+ if (zap)
+ afs_zap_data(vnode);
+ up_write(&vnode->validate_lock);
+ _leave(" = 0");
+ return 0;
+
+error_unlock:
+ if (locked_vol)
+ mutex_unlock(&volume->cb_check_lock);
+ up_write(&vnode->validate_lock);
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
index f04a80e4f5c3..9f36e14f1c2d 100644
--- a/fs/afs/vl_alias.c
+++ b/fs/afs/vl_alias.c
@@ -33,55 +33,6 @@ static struct afs_volume *afs_sample_volume(struct afs_cell *cell, struct key *k
}
/*
- * Compare two addresses.
- */
-static int afs_compare_addrs(const struct sockaddr_rxrpc *srx_a,
- const struct sockaddr_rxrpc *srx_b)
-{
- short port_a, port_b;
- int addr_a, addr_b, diff;
-
- diff = (short)srx_a->transport_type - (short)srx_b->transport_type;
- if (diff)
- goto out;
-
- switch (srx_a->transport_type) {
- case AF_INET: {
- const struct sockaddr_in *a = &srx_a->transport.sin;
- const struct sockaddr_in *b = &srx_b->transport.sin;
- addr_a = ntohl(a->sin_addr.s_addr);
- addr_b = ntohl(b->sin_addr.s_addr);
- diff = addr_a - addr_b;
- if (diff == 0) {
- port_a = ntohs(a->sin_port);
- port_b = ntohs(b->sin_port);
- diff = port_a - port_b;
- }
- break;
- }
-
- case AF_INET6: {
- const struct sockaddr_in6 *a = &srx_a->transport.sin6;
- const struct sockaddr_in6 *b = &srx_b->transport.sin6;
- diff = memcmp(&a->sin6_addr, &b->sin6_addr, 16);
- if (diff == 0) {
- port_a = ntohs(a->sin6_port);
- port_b = ntohs(b->sin6_port);
- diff = port_a - port_b;
- }
- break;
- }
-
- default:
- WARN_ON(1);
- diff = 1;
- }
-
-out:
- return diff;
-}
-
-/*
* Compare the address lists of a pair of fileservers.
*/
static int afs_compare_fs_alists(const struct afs_server *server_a,
@@ -90,13 +41,13 @@ static int afs_compare_fs_alists(const struct afs_server *server_a,
const struct afs_addr_list *la, *lb;
int a = 0, b = 0, addr_matches = 0;
- la = rcu_dereference(server_a->addresses);
- lb = rcu_dereference(server_b->addresses);
+ la = rcu_dereference(server_a->endpoint_state)->addresses;
+ lb = rcu_dereference(server_b->endpoint_state)->addresses;
while (a < la->nr_addrs && b < lb->nr_addrs) {
- const struct sockaddr_rxrpc *srx_a = &la->addrs[a];
- const struct sockaddr_rxrpc *srx_b = &lb->addrs[b];
- int diff = afs_compare_addrs(srx_a, srx_b);
+ unsigned long pa = (unsigned long)la->addrs[a].peer;
+ unsigned long pb = (unsigned long)lb->addrs[b].peer;
+ long diff = pa - pb;
if (diff < 0) {
a++;
@@ -126,7 +77,7 @@ static int afs_compare_volume_slists(const struct afs_volume *vol_a,
lb = rcu_dereference(vol_b->servers);
for (i = 0; i < AFS_MAXTYPES; i++)
- if (la->vids[i] != lb->vids[i])
+ if (vol_a->vids[i] != vol_b->vids[i])
return 0;
while (a < la->nr_servers && b < lb->nr_servers) {
@@ -205,7 +156,7 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key,
/* And see if it's in the new cell. */
volume = afs_sample_volume(cell, key, pvol->name, pvol->name_len);
if (IS_ERR(volume)) {
- afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias);
+ afs_put_volume(pvol, afs_volume_trace_put_query_alias);
if (PTR_ERR(volume) != -ENOMEDIUM)
return PTR_ERR(volume);
/* That volume is not in the new cell, so not an alias */
@@ -223,8 +174,8 @@ static int afs_query_for_alias_one(struct afs_cell *cell, struct key *key,
rcu_read_unlock();
}
- afs_put_volume(cell->net, volume, afs_volume_trace_put_query_alias);
- afs_put_volume(cell->net, pvol, afs_volume_trace_put_query_alias);
+ afs_put_volume(volume, afs_volume_trace_put_query_alias);
+ afs_put_volume(pvol, afs_volume_trace_put_query_alias);
return ret;
}
@@ -285,7 +236,7 @@ static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key)
while (afs_select_vlserver(&vc)) {
if (!test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) {
- vc.ac.error = -EOPNOTSUPP;
+ vc.call_error = -EOPNOTSUPP;
skipped = true;
continue;
}
diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
index acc48216136a..9b1c20daac53 100644
--- a/fs/afs/vl_list.c
+++ b/fs/afs/vl_list.c
@@ -13,6 +13,7 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
unsigned short port)
{
struct afs_vlserver *vlserver;
+ static atomic_t debug_ids;
vlserver = kzalloc(struct_size(vlserver, name, name_len + 1),
GFP_KERNEL);
@@ -21,8 +22,10 @@ struct afs_vlserver *afs_alloc_vlserver(const char *name, size_t name_len,
rwlock_init(&vlserver->lock);
init_waitqueue_head(&vlserver->probe_wq);
spin_lock_init(&vlserver->probe_lock);
+ vlserver->debug_id = atomic_inc_return(&debug_ids);
vlserver->rtt = UINT_MAX;
vlserver->name_len = name_len;
+ vlserver->service_id = VL_SERVICE;
vlserver->port = port;
memcpy(vlserver->name, name, name_len);
}
@@ -33,7 +36,8 @@ static void afs_vlserver_rcu(struct rcu_head *rcu)
{
struct afs_vlserver *vlserver = container_of(rcu, struct afs_vlserver, rcu);
- afs_put_addrlist(rcu_access_pointer(vlserver->addresses));
+ afs_put_addrlist(rcu_access_pointer(vlserver->addresses),
+ afs_alist_trace_put_vlserver);
kfree_rcu(vlserver, rcu);
}
@@ -83,14 +87,15 @@ static u16 afs_extract_le16(const u8 **_b)
/*
* Build a VL server address list from a DNS queried server list.
*/
-static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
+static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net,
+ const u8 **_b, const u8 *end,
u8 nr_addrs, u16 port)
{
struct afs_addr_list *alist;
const u8 *b = *_b;
int ret = -EINVAL;
- alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port);
+ alist = afs_alloc_addrlist(nr_addrs);
if (!alist)
return ERR_PTR(-ENOMEM);
if (nr_addrs == 0)
@@ -109,7 +114,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
goto error;
}
memcpy(x, b, 4);
- afs_merge_fs_addr4(alist, x[0], port);
+ ret = afs_merge_fs_addr4(net, alist, x[0], port);
+ if (ret < 0)
+ goto error;
b += 4;
break;
@@ -119,7 +126,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
goto error;
}
memcpy(x, b, 16);
- afs_merge_fs_addr6(alist, x, port);
+ ret = afs_merge_fs_addr6(net, alist, x, port);
+ if (ret < 0)
+ goto error;
b += 16;
break;
@@ -140,7 +149,7 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
error:
*_b = b;
- afs_put_addrlist(alist);
+ afs_put_addrlist(alist, afs_alist_trace_put_parse_error);
return ERR_PTR(ret);
}
@@ -247,7 +256,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
/* Extract the addresses - note that we can't skip this as we
* have to advance the payload pointer.
*/
- addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port);
+ addrs = afs_extract_vl_addrs(cell->net, &b, end, bs.nr_addrs, bs.port);
if (IS_ERR(addrs)) {
ret = PTR_ERR(addrs);
goto error_2;
@@ -255,7 +264,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
if (vllist->nr_servers >= nr_servers) {
_debug("skip %u >= %u", vllist->nr_servers, nr_servers);
- afs_put_addrlist(addrs);
+ afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty);
afs_put_vlserver(cell->net, server);
continue;
}
@@ -264,7 +273,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
addrs->status = bs.status;
if (addrs->nr_addrs == 0) {
- afs_put_addrlist(addrs);
+ afs_put_addrlist(addrs, afs_alist_trace_put_parse_empty);
if (!rcu_access_pointer(server->addresses)) {
afs_put_vlserver(cell->net, server);
continue;
@@ -276,7 +285,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
old = rcu_replace_pointer(server->addresses, old,
lockdep_is_held(&server->lock));
write_unlock(&server->lock);
- afs_put_addrlist(old);
+ afs_put_addrlist(old, afs_alist_trace_put_vlserver_old);
}
diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
index 58452b86e672..3d2e0c925460 100644
--- a/fs/afs/vl_probe.c
+++ b/fs/afs/vl_probe.c
@@ -46,11 +46,12 @@ static void afs_done_one_vl_probe(struct afs_vlserver *server, bool wake_up)
*/
void afs_vlserver_probe_result(struct afs_call *call)
{
- struct afs_addr_list *alist = call->alist;
+ struct afs_addr_list *alist = call->vl_probe;
struct afs_vlserver *server = call->vlserver;
+ struct afs_address *addr = &alist->addrs[call->probe_index];
unsigned int server_index = call->server_index;
unsigned int rtt_us = 0;
- unsigned int index = call->addr_ix;
+ unsigned int index = call->probe_index;
bool have_result = false;
int ret = call->error;
@@ -89,7 +90,7 @@ void afs_vlserver_probe_result(struct afs_call *call)
case -ETIME:
default:
clear_bit(index, &alist->responded);
- set_bit(index, &alist->failed);
+ set_bit(index, &alist->probe_failed);
if (!(server->probe.flags & AFS_VLSERVER_PROBE_RESPONDED) &&
(server->probe.error == 0 ||
server->probe.error == -ETIMEDOUT ||
@@ -101,21 +102,21 @@ void afs_vlserver_probe_result(struct afs_call *call)
responded:
set_bit(index, &alist->responded);
- clear_bit(index, &alist->failed);
+ clear_bit(index, &alist->probe_failed);
if (call->service_id == YFS_VL_SERVICE) {
server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS;
set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
- alist->addrs[index].srx_service = call->service_id;
+ server->service_id = call->service_id;
} else {
server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS;
if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) {
clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
- alist->addrs[index].srx_service = call->service_id;
+ server->service_id = call->service_id;
}
}
- rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
+ rtt_us = rxrpc_kernel_get_srtt(addr->peer);
if (rtt_us < server->probe.rtt) {
server->probe.rtt = rtt_us;
server->rtt = rtt_us;
@@ -130,8 +131,10 @@ responded:
out:
spin_unlock(&server->probe_lock);
- _debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
- server_index, index, &alist->addrs[index].transport, rtt_us, ret);
+ trace_afs_vl_probe(server, false, alist, index, call->error, call->abort_code, rtt_us);
+ _debug("probe [%u][%u] %pISpc rtt=%d ret=%d",
+ server_index, index, rxrpc_kernel_remote_addr(addr->peer),
+ rtt_us, ret);
afs_done_one_vl_probe(server, have_result);
}
@@ -146,35 +149,52 @@ static bool afs_do_probe_vlserver(struct afs_net *net,
unsigned int server_index,
struct afs_error *_e)
{
- struct afs_addr_cursor ac = {
- .index = 0,
- };
+ struct afs_addr_list *alist;
struct afs_call *call;
+ unsigned long unprobed;
+ unsigned int index, i;
bool in_progress = false;
+ int best_prio;
_enter("%s", server->name);
read_lock(&server->lock);
- ac.alist = rcu_dereference_protected(server->addresses,
- lockdep_is_held(&server->lock));
+ alist = rcu_dereference_protected(server->addresses,
+ lockdep_is_held(&server->lock));
+ afs_get_addrlist(alist, afs_alist_trace_get_vlprobe);
read_unlock(&server->lock);
- atomic_set(&server->probe_outstanding, ac.alist->nr_addrs);
+ atomic_set(&server->probe_outstanding, alist->nr_addrs);
memset(&server->probe, 0, sizeof(server->probe));
server->probe.rtt = UINT_MAX;
- for (ac.index = 0; ac.index < ac.alist->nr_addrs; ac.index++) {
- call = afs_vl_get_capabilities(net, &ac, key, server,
+ unprobed = (1UL << alist->nr_addrs) - 1;
+ while (unprobed) {
+ best_prio = -1;
+ index = 0;
+ for (i = 0; i < alist->nr_addrs; i++) {
+ if (test_bit(i, &unprobed) &&
+ alist->addrs[i].prio > best_prio) {
+ index = i;
+ best_prio = alist->addrs[i].prio;
+ }
+ }
+ __clear_bit(index, &unprobed);
+
+ trace_afs_vl_probe(server, true, alist, index, 0, 0, 0);
+ call = afs_vl_get_capabilities(net, alist, index, key, server,
server_index);
if (!IS_ERR(call)) {
+ afs_prioritise_error(_e, call->error, call->abort_code);
afs_put_call(call);
in_progress = true;
} else {
- afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code);
+ afs_prioritise_error(_e, PTR_ERR(call), 0);
afs_done_one_vl_probe(server, false);
}
}
+ afs_put_addrlist(alist, afs_alist_trace_put_vlprobe);
return in_progress;
}
@@ -185,12 +205,10 @@ int afs_send_vl_probes(struct afs_net *net, struct key *key,
struct afs_vlserver_list *vllist)
{
struct afs_vlserver *server;
- struct afs_error e;
+ struct afs_error e = {};
bool in_progress = false;
int i;
- e.error = 0;
- e.responded = false;
for (i = 0; i < vllist->nr_servers; i++) {
server = vllist->servers[i].server;
if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags))
diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
index eb415ce56360..d8f79f6ada3d 100644
--- a/fs/afs/vl_rotate.c
+++ b/fs/afs/vl_rotate.c
@@ -17,18 +17,21 @@
bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cell,
struct key *key)
{
+ static atomic_t debug_ids;
+
memset(vc, 0, sizeof(*vc));
vc->cell = cell;
vc->key = key;
- vc->error = -EDESTADDRREQ;
- vc->ac.error = SHRT_MAX;
+ vc->cumul_error.error = -EDESTADDRREQ;
+ vc->nr_iterations = -1;
if (signal_pending(current)) {
- vc->error = -EINTR;
+ vc->cumul_error.error = -EINTR;
vc->flags |= AFS_VL_CURSOR_STOP;
return false;
}
+ vc->debug_id = atomic_inc_return(&debug_ids);
return true;
}
@@ -52,7 +55,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
&cell->dns_lookup_count,
smp_load_acquire(&cell->dns_lookup_count)
!= dns_lookup_count) < 0) {
- vc->error = -ERESTARTSYS;
+ vc->cumul_error.error = -ERESTARTSYS;
return false;
}
}
@@ -60,12 +63,12 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
/* Status load is ordered after lookup counter load */
if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) {
pr_warn("No record of cell %s\n", cell->name);
- vc->error = -ENOENT;
+ vc->cumul_error.error = -ENOENT;
return false;
}
if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
- vc->error = -EDESTADDRREQ;
+ vc->cumul_error.error = -EDESTADDRREQ;
return false;
}
}
@@ -78,8 +81,8 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
if (!vc->server_list->nr_servers)
return false;
- vc->untried = (1UL << vc->server_list->nr_servers) - 1;
- vc->index = -1;
+ vc->untried_servers = (1UL << vc->server_list->nr_servers) - 1;
+ vc->server_index = -1;
return true;
}
@@ -89,54 +92,57 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
*/
bool afs_select_vlserver(struct afs_vl_cursor *vc)
{
- struct afs_addr_list *alist;
+ struct afs_addr_list *alist = vc->alist;
struct afs_vlserver *vlserver;
- struct afs_error e;
- u32 rtt;
- int error = vc->ac.error, i;
+ unsigned long set, failed;
+ unsigned int rtt;
+ s32 abort_code = vc->call_abort_code;
+ int error = vc->call_error, i;
+
+ vc->nr_iterations++;
- _enter("%lx[%d],%lx[%d],%d,%d",
- vc->untried, vc->index,
- vc->ac.tried, vc->ac.index,
- error, vc->ac.abort_code);
+ _enter("VC=%x+%x,%d{%lx},%d{%lx},%d,%d",
+ vc->debug_id, vc->nr_iterations, vc->server_index, vc->untried_servers,
+ vc->addr_index, vc->addr_tried,
+ error, abort_code);
if (vc->flags & AFS_VL_CURSOR_STOP) {
_leave(" = f [stopped]");
return false;
}
- vc->nr_iterations++;
+ if (vc->nr_iterations == 0)
+ goto start;
+
+ WRITE_ONCE(alist->addrs[vc->addr_index].last_error, error);
/* Evaluate the result of the previous operation, if there was one. */
switch (error) {
- case SHRT_MAX:
- goto start;
-
default:
case 0:
/* Success or local failure. Stop. */
- vc->error = error;
+ vc->cumul_error.error = error;
vc->flags |= AFS_VL_CURSOR_STOP;
- _leave(" = f [okay/local %d]", vc->ac.error);
+ _leave(" = f [okay/local %d]", vc->cumul_error.error);
return false;
case -ECONNABORTED:
/* The far side rejected the operation on some grounds. This
* might involve the server being busy or the volume having been moved.
*/
- switch (vc->ac.abort_code) {
+ switch (abort_code) {
case AFSVL_IO:
case AFSVL_BADVOLOPER:
case AFSVL_NOMEM:
/* The server went weird. */
- vc->error = -EREMOTEIO;
+ afs_prioritise_error(&vc->cumul_error, -EREMOTEIO, abort_code);
//write_lock(&vc->cell->vl_servers_lock);
- //vc->server_list->weird_mask |= 1 << vc->index;
+ //vc->server_list->weird_mask |= 1 << vc->server_index;
//write_unlock(&vc->cell->vl_servers_lock);
goto next_server;
default:
- vc->error = afs_abort_to_error(vc->ac.abort_code);
+ afs_prioritise_error(&vc->cumul_error, error, abort_code);
goto failed;
}
@@ -149,12 +155,12 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
case -ETIMEDOUT:
case -ETIME:
_debug("no conn %d", error);
- vc->error = error;
+ afs_prioritise_error(&vc->cumul_error, error, 0);
goto iterate_address;
case -ECONNRESET:
_debug("call reset");
- vc->error = error;
+ afs_prioritise_error(&vc->cumul_error, error, 0);
vc->flags |= AFS_VL_CURSOR_RETRY;
goto next_server;
@@ -165,7 +171,13 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
restart_from_beginning:
_debug("restart");
- afs_end_cursor(&vc->ac);
+ if (vc->call_responded &&
+ vc->addr_index != vc->alist->preferred &&
+ test_bit(alist->preferred, &vc->addr_tried))
+ WRITE_ONCE(alist->preferred, vc->addr_index);
+ afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_restart);
+ alist = vc->alist = NULL;
+
afs_put_vlserverlist(vc->cell->net, vc->server_list);
vc->server_list = NULL;
if (vc->flags & AFS_VL_CURSOR_RETRIED)
@@ -173,53 +185,58 @@ restart_from_beginning:
vc->flags |= AFS_VL_CURSOR_RETRIED;
start:
_debug("start");
+ ASSERTCMP(alist, ==, NULL);
if (!afs_start_vl_iteration(vc))
goto failed;
error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list);
- if (error < 0)
- goto failed_set_error;
+ if (error < 0) {
+ afs_prioritise_error(&vc->cumul_error, error, 0);
+ goto failed;
+ }
pick_server:
- _debug("pick [%lx]", vc->untried);
+ _debug("pick [%lx]", vc->untried_servers);
+ ASSERTCMP(alist, ==, NULL);
- error = afs_wait_for_vl_probes(vc->server_list, vc->untried);
- if (error < 0)
- goto failed_set_error;
+ error = afs_wait_for_vl_probes(vc->server_list, vc->untried_servers);
+ if (error < 0) {
+ afs_prioritise_error(&vc->cumul_error, error, 0);
+ goto failed;
+ }
/* Pick the untried server with the lowest RTT. */
- vc->index = vc->server_list->preferred;
- if (test_bit(vc->index, &vc->untried))
+ vc->server_index = vc->server_list->preferred;
+ if (test_bit(vc->server_index, &vc->untried_servers))
goto selected_server;
- vc->index = -1;
- rtt = U32_MAX;
+ vc->server_index = -1;
+ rtt = UINT_MAX;
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
- if (!test_bit(i, &vc->untried) ||
+ if (!test_bit(i, &vc->untried_servers) ||
!test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
continue;
- if (s->probe.rtt < rtt) {
- vc->index = i;
+ if (s->probe.rtt <= rtt) {
+ vc->server_index = i;
rtt = s->probe.rtt;
}
}
- if (vc->index == -1)
+ if (vc->server_index == -1)
goto no_more_servers;
selected_server:
- _debug("use %d", vc->index);
- __clear_bit(vc->index, &vc->untried);
+ _debug("use %d", vc->server_index);
+ __clear_bit(vc->server_index, &vc->untried_servers);
/* We're starting on a different vlserver from the list. We need to
* check it, find its address list and probe its capabilities before we
* use it.
*/
- ASSERTCMP(vc->ac.alist, ==, NULL);
- vlserver = vc->server_list->servers[vc->index].server;
+ vlserver = vc->server_list->servers[vc->server_index].server;
vc->server = vlserver;
_debug("USING VLSERVER: %s", vlserver->name);
@@ -227,34 +244,48 @@ selected_server:
read_lock(&vlserver->lock);
alist = rcu_dereference_protected(vlserver->addresses,
lockdep_is_held(&vlserver->lock));
- afs_get_addrlist(alist);
+ vc->alist = afs_get_addrlist(alist, afs_alist_trace_get_vlrotate_set);
read_unlock(&vlserver->lock);
- memset(&vc->ac, 0, sizeof(vc->ac));
-
- if (!vc->ac.alist)
- vc->ac.alist = alist;
- else
- afs_put_addrlist(alist);
-
- vc->ac.index = -1;
+ vc->addr_tried = 0;
+ vc->addr_index = -1;
iterate_address:
- ASSERT(vc->ac.alist);
/* Iterate over the current server's address list to try and find an
* address on which it will respond to us.
*/
- if (!afs_iterate_addresses(&vc->ac))
+ set = READ_ONCE(alist->responded);
+ failed = READ_ONCE(alist->probe_failed);
+ vc->addr_index = READ_ONCE(alist->preferred);
+
+ _debug("%lx-%lx-%lx,%d", set, failed, vc->addr_tried, vc->addr_index);
+
+ set &= ~(failed | vc->addr_tried);
+
+ if (!set)
goto next_server;
- _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs);
+ if (!test_bit(vc->addr_index, &set))
+ vc->addr_index = __ffs(set);
+
+ set_bit(vc->addr_index, &vc->addr_tried);
+ vc->alist = alist;
- _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport);
+ _debug("VL address %d/%d", vc->addr_index, alist->nr_addrs);
+
+ vc->call_responded = false;
+ _leave(" = t %pISpc", rxrpc_kernel_remote_addr(alist->addrs[vc->addr_index].peer));
return true;
next_server:
_debug("next");
- afs_end_cursor(&vc->ac);
+ ASSERT(alist);
+ if (vc->call_responded &&
+ vc->addr_index != alist->preferred &&
+ test_bit(alist->preferred, &vc->addr_tried))
+ WRITE_ONCE(alist->preferred, vc->addr_index);
+ afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_next);
+ alist = vc->alist = NULL;
goto pick_server;
no_more_servers:
@@ -264,25 +295,26 @@ no_more_servers:
if (vc->flags & AFS_VL_CURSOR_RETRY)
goto restart_from_beginning;
- e.error = -EDESTADDRREQ;
- e.responded = false;
for (i = 0; i < vc->server_list->nr_servers; i++) {
struct afs_vlserver *s = vc->server_list->servers[i].server;
if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
- e.responded = true;
- afs_prioritise_error(&e, READ_ONCE(s->probe.error),
+ vc->cumul_error.responded = true;
+ afs_prioritise_error(&vc->cumul_error, READ_ONCE(s->probe.error),
s->probe.abort_code);
}
- error = e.error;
-
-failed_set_error:
- vc->error = error;
failed:
+ if (alist) {
+ if (vc->call_responded &&
+ vc->addr_index != alist->preferred &&
+ test_bit(alist->preferred, &vc->addr_tried))
+ WRITE_ONCE(alist->preferred, vc->addr_index);
+ afs_put_addrlist(alist, afs_alist_trace_put_vlrotate_fail);
+ alist = vc->alist = NULL;
+ }
vc->flags |= AFS_VL_CURSOR_STOP;
- afs_end_cursor(&vc->ac);
- _leave(" = f [failed %d]", vc->error);
+ _leave(" = f [failed %d]", vc->cumul_error.error);
return false;
}
@@ -305,7 +337,10 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
pr_notice("DNS: src=%u st=%u lc=%x\n",
cell->dns_source, cell->dns_status, cell->dns_lookup_count);
pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n",
- vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error);
+ vc->untried_servers, vc->server_index, vc->nr_iterations,
+ vc->flags, vc->cumul_error.error);
+ pr_notice("VC: call er=%d ac=%d r=%u\n",
+ vc->call_error, vc->call_abort_code, vc->call_responded);
if (vc->server_list) {
const struct afs_vlserver_list *sl = vc->server_list;
@@ -322,16 +357,14 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
a->nr_ipv4, a->nr_addrs, a->max_addrs,
a->preferred);
pr_notice("VC: - R=%lx F=%lx\n",
- a->responded, a->failed);
- if (a == vc->ac.alist)
+ a->responded, a->probe_failed);
+ if (a == vc->alist)
pr_notice("VC: - current\n");
}
}
}
- pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
- vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error,
- vc->ac.responded, vc->ac.nr_iterations);
+ pr_notice("AC: t=%lx ax=%u\n", vc->addr_tried, vc->addr_index);
rcu_read_unlock();
}
@@ -342,17 +375,25 @@ int afs_end_vlserver_operation(struct afs_vl_cursor *vc)
{
struct afs_net *net = vc->cell->net;
- if (vc->error == -EDESTADDRREQ ||
- vc->error == -EADDRNOTAVAIL ||
- vc->error == -ENETUNREACH ||
- vc->error == -EHOSTUNREACH)
+ _enter("VC=%x+%x", vc->debug_id, vc->nr_iterations);
+
+ switch (vc->cumul_error.error) {
+ case -EDESTADDRREQ:
+ case -EADDRNOTAVAIL:
+ case -ENETUNREACH:
+ case -EHOSTUNREACH:
afs_vl_dump_edestaddrreq(vc);
+ break;
+ }
- afs_end_cursor(&vc->ac);
+ if (vc->alist) {
+ if (vc->call_responded &&
+ vc->addr_index != vc->alist->preferred &&
+ test_bit(vc->alist->preferred, &vc->addr_tried))
+ WRITE_ONCE(vc->alist->preferred, vc->addr_index);
+ afs_put_addrlist(vc->alist, afs_alist_trace_put_vlrotate_end);
+ vc->alist = NULL;
+ }
afs_put_vlserverlist(net, vc->server_list);
-
- if (vc->error == -ECONNABORTED)
- vc->error = afs_abort_to_error(vc->ac.abort_code);
-
- return vc->error;
+ return vc->cumul_error.error;
}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 00fca3c66ba6..cac75f89b64a 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -18,8 +18,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
{
struct afs_uvldbentry__xdr *uvldb;
struct afs_vldb_entry *entry;
- bool new_only = false;
- u32 tmp, nr_servers, vlflags;
+ u32 nr_servers, vlflags;
int i, ret;
_enter("");
@@ -41,27 +40,14 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
entry->name[i] = 0;
entry->name_len = strlen(entry->name);
- /* If there is a new replication site that we can use, ignore all the
- * sites that aren't marked as new.
- */
- for (i = 0; i < nr_servers; i++) {
- tmp = ntohl(uvldb->serverFlags[i]);
- if (!(tmp & AFS_VLSF_DONTUSE) &&
- (tmp & AFS_VLSF_NEWREPSITE))
- new_only = true;
- }
-
vlflags = ntohl(uvldb->flags);
for (i = 0; i < nr_servers; i++) {
struct afs_uuid__xdr *xdr;
struct afs_uuid *uuid;
+ u32 tmp = ntohl(uvldb->serverFlags[i]);
int j;
int n = entry->nr_servers;
- tmp = ntohl(uvldb->serverFlags[i]);
- if (tmp & AFS_VLSF_DONTUSE ||
- (new_only && !(tmp & AFS_VLSF_NEWREPSITE)))
- continue;
if (tmp & AFS_VLSF_RWVOL) {
entry->fs_mask[n] |= AFS_VOL_VTM_RW;
if (vlflags & AFS_VLF_BACKEXISTS)
@@ -82,6 +68,7 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
for (j = 0; j < 6; j++)
uuid->node[j] = (u8)ntohl(xdr->node[j]);
+ entry->vlsf_flags[n] = tmp;
entry->addr_version[n] = ntohl(uvldb->serverUnique[i]);
entry->nr_servers++;
}
@@ -106,12 +93,6 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
return 0;
}
-static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call)
-{
- kfree(call->ret_vldb);
- afs_flat_call_destructor(call);
-}
-
/*
* VL.GetEntryByNameU operation type.
*/
@@ -119,7 +100,7 @@ static const struct afs_call_type afs_RXVLGetEntryByNameU = {
.name = "VL.GetEntryByNameU",
.op = afs_VL_GetEntryByNameU,
.deliver = afs_deliver_vl_get_entry_by_name_u,
- .destructor = afs_destroy_vl_get_entry_by_name_u,
+ .destructor = afs_flat_call_destructor,
};
/*
@@ -155,6 +136,8 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
call->key = vc->key;
call->ret_vldb = entry;
call->max_lifespan = AFS_VL_MAX_LIFESPAN;
+ call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer);
+ call->service_id = vc->server->service_id;
/* Marshall the parameters */
bp = call->request;
@@ -165,8 +148,17 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
memset((void *)bp + volnamesz, 0, padsz);
trace_afs_make_vl_call(call);
- afs_make_call(&vc->ac, call, GFP_KERNEL);
- return (struct afs_vldb_entry *)afs_wait_for_call_to_complete(call, &vc->ac);
+ afs_make_call(call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call);
+ vc->call_abort_code = call->abort_code;
+ vc->call_error = call->error;
+ vc->call_responded = call->responded;
+ afs_put_call(call);
+ if (vc->call_error) {
+ kfree(entry);
+ return ERR_PTR(vc->call_error);
+ }
+ return entry;
}
/*
@@ -208,7 +200,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
count = ntohl(*bp);
nentries = min(nentries, count);
- alist = afs_alloc_addrlist(nentries, FS_SERVICE, AFS_FS_PORT);
+ alist = afs_alloc_addrlist(nentries);
if (!alist)
return -ENOMEM;
alist->version = uniquifier;
@@ -230,9 +222,13 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
alist = call->ret_alist;
bp = call->buffer;
count = min(call->count, 4U);
- for (i = 0; i < count; i++)
- if (alist->nr_addrs < call->count2)
- afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT);
+ for (i = 0; i < count; i++) {
+ if (alist->nr_addrs < call->count2) {
+ ret = afs_merge_fs_addr4(call->net, alist, *bp++, AFS_FS_PORT);
+ if (ret < 0)
+ return ret;
+ }
+ }
call->count -= count;
if (call->count > 0)
@@ -245,12 +241,6 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
return 0;
}
-static void afs_vl_get_addrs_u_destructor(struct afs_call *call)
-{
- afs_put_addrlist(call->ret_alist);
- return afs_flat_call_destructor(call);
-}
-
/*
* VL.GetAddrsU operation type.
*/
@@ -258,7 +248,7 @@ static const struct afs_call_type afs_RXVLGetAddrsU = {
.name = "VL.GetAddrsU",
.op = afs_VL_GetAddrsU,
.deliver = afs_deliver_vl_get_addrs_u,
- .destructor = afs_vl_get_addrs_u_destructor,
+ .destructor = afs_flat_call_destructor,
};
/*
@@ -269,6 +259,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
const uuid_t *uuid)
{
struct afs_ListAddrByAttributes__xdr *r;
+ struct afs_addr_list *alist;
const struct afs_uuid *u = (const struct afs_uuid *)uuid;
struct afs_call *call;
struct afs_net *net = vc->cell->net;
@@ -286,6 +277,8 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
call->key = vc->key;
call->ret_alist = NULL;
call->max_lifespan = AFS_VL_MAX_LIFESPAN;
+ call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer);
+ call->service_id = vc->server->service_id;
/* Marshall the parameters */
bp = call->request;
@@ -304,8 +297,18 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
r->uuid.node[i] = htonl(u->node[i]);
trace_afs_make_vl_call(call);
- afs_make_call(&vc->ac, call, GFP_KERNEL);
- return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac);
+ afs_make_call(call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call);
+ vc->call_abort_code = call->abort_code;
+ vc->call_error = call->error;
+ vc->call_responded = call->responded;
+ alist = call->ret_alist;
+ afs_put_call(call);
+ if (vc->call_error) {
+ afs_put_addrlist(alist, afs_alist_trace_put_getaddru);
+ return ERR_PTR(vc->call_error);
+ }
+ return alist;
}
/*
@@ -355,6 +358,7 @@ static int afs_deliver_vl_get_capabilities(struct afs_call *call)
static void afs_destroy_vl_get_capabilities(struct afs_call *call)
{
+ afs_put_addrlist(call->vl_probe, afs_alist_trace_put_vlgetcaps);
afs_put_vlserver(call->net, call->vlserver);
afs_flat_call_destructor(call);
}
@@ -378,7 +382,8 @@ static const struct afs_call_type afs_RXVLGetCapabilities = {
* other end supports.
*/
struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
- struct afs_addr_cursor *ac,
+ struct afs_addr_list *alist,
+ unsigned int addr_index,
struct key *key,
struct afs_vlserver *server,
unsigned int server_index)
@@ -395,6 +400,10 @@ struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
call->key = key;
call->vlserver = afs_get_vlserver(server);
call->server_index = server_index;
+ call->peer = rxrpc_kernel_get_peer(alist->addrs[addr_index].peer);
+ call->vl_probe = afs_get_addrlist(alist, afs_alist_trace_get_vlgetcaps);
+ call->probe_index = addr_index;
+ call->service_id = server->service_id;
call->upgrade = true;
call->async = true;
call->max_lifespan = AFS_PROBE_MAX_LIFESPAN;
@@ -405,7 +414,7 @@ struct afs_call *afs_vl_get_capabilities(struct afs_net *net,
/* Can't take a ref on server */
trace_afs_make_vl_call(call);
- afs_make_call(ac, call, GFP_KERNEL);
+ afs_make_call(call, GFP_KERNEL);
return call;
}
@@ -450,7 +459,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
if (call->count > YFS_MAXENDPOINTS)
return afs_protocol_error(call, afs_eproto_yvl_fsendpt_num);
- alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT);
+ alist = afs_alloc_addrlist(call->count);
if (!alist)
return -ENOMEM;
alist->version = uniquifier;
@@ -488,14 +497,18 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
if (ntohl(bp[0]) != sizeof(__be32) * 2)
return afs_protocol_error(
call, afs_eproto_yvl_fsendpt4_len);
- afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2]));
+ ret = afs_merge_fs_addr4(call->net, alist, bp[1], ntohl(bp[2]));
+ if (ret < 0)
+ return ret;
bp += 3;
break;
case YFS_ENDPOINT_IPV6:
if (ntohl(bp[0]) != sizeof(__be32) * 5)
return afs_protocol_error(
call, afs_eproto_yvl_fsendpt6_len);
- afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5]));
+ ret = afs_merge_fs_addr6(call->net, alist, bp + 1, ntohl(bp[5]));
+ if (ret < 0)
+ return ret;
bp += 6;
break;
default:
@@ -610,7 +623,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = {
.name = "YFSVL.GetEndpoints",
.op = afs_YFSVL_GetEndpoints,
.deliver = afs_deliver_yfsvl_get_endpoints,
- .destructor = afs_vl_get_addrs_u_destructor,
+ .destructor = afs_flat_call_destructor,
};
/*
@@ -620,6 +633,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = {
struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
const uuid_t *uuid)
{
+ struct afs_addr_list *alist;
struct afs_call *call;
struct afs_net *net = vc->cell->net;
__be32 *bp;
@@ -635,6 +649,8 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
call->key = vc->key;
call->ret_alist = NULL;
call->max_lifespan = AFS_VL_MAX_LIFESPAN;
+ call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer);
+ call->service_id = vc->server->service_id;
/* Marshall the parameters */
bp = call->request;
@@ -643,8 +659,18 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
memcpy(bp, uuid, sizeof(*uuid)); /* Type opr_uuid */
trace_afs_make_vl_call(call);
- afs_make_call(&vc->ac, call, GFP_KERNEL);
- return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac);
+ afs_make_call(call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call);
+ vc->call_abort_code = call->abort_code;
+ vc->call_error = call->error;
+ vc->call_responded = call->responded;
+ alist = call->ret_alist;
+ afs_put_call(call);
+ if (vc->call_error) {
+ afs_put_addrlist(alist, afs_alist_trace_put_getaddru);
+ return ERR_PTR(vc->call_error);
+ }
+ return alist;
}
/*
@@ -709,12 +735,6 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
return 0;
}
-static void afs_destroy_yfsvl_get_cell_name(struct afs_call *call)
-{
- kfree(call->ret_str);
- afs_flat_call_destructor(call);
-}
-
/*
* VL.GetCapabilities operation type
*/
@@ -722,7 +742,7 @@ static const struct afs_call_type afs_YFSVLGetCellName = {
.name = "YFSVL.GetCellName",
.op = afs_YFSVL_GetCellName,
.deliver = afs_deliver_yfsvl_get_cell_name,
- .destructor = afs_destroy_yfsvl_get_cell_name,
+ .destructor = afs_flat_call_destructor,
};
/*
@@ -737,6 +757,7 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
struct afs_call *call;
struct afs_net *net = vc->cell->net;
__be32 *bp;
+ char *cellname;
_enter("");
@@ -747,6 +768,8 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
call->key = vc->key;
call->ret_str = NULL;
call->max_lifespan = AFS_VL_MAX_LIFESPAN;
+ call->peer = rxrpc_kernel_get_peer(vc->alist->addrs[vc->addr_index].peer);
+ call->service_id = vc->server->service_id;
/* marshall the parameters */
bp = call->request;
@@ -754,6 +777,16 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
/* Can't take a ref on server */
trace_afs_make_vl_call(call);
- afs_make_call(&vc->ac, call, GFP_KERNEL);
- return (char *)afs_wait_for_call_to_complete(call, &vc->ac);
+ afs_make_call(call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call);
+ vc->call_abort_code = call->abort_code;
+ vc->call_error = call->error;
+ vc->call_responded = call->responded;
+ cellname = call->ret_str;
+ afs_put_call(call);
+ if (vc->call_error) {
+ kfree(cellname);
+ return ERR_PTR(vc->call_error);
+ }
+ return cellname;
}
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 115c081a8e2c..af3a3f57c1b3 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -11,6 +11,8 @@
static unsigned __read_mostly afs_volume_record_life = 60 * 60;
+static void afs_destroy_volume(struct work_struct *work);
+
/*
* Insert a volume into a cell. If there's an existing volume record, that is
* returned instead with a ref held.
@@ -72,11 +74,11 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume)
*/
static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
struct afs_vldb_entry *vldb,
- unsigned long type_mask)
+ struct afs_server_list **_slist)
{
struct afs_server_list *slist;
struct afs_volume *volume;
- int ret = -ENOMEM;
+ int ret = -ENOMEM, i;
volume = kzalloc(sizeof(struct afs_volume), GFP_KERNEL);
if (!volume)
@@ -88,20 +90,30 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params,
volume->type = params->type;
volume->type_force = params->force;
volume->name_len = vldb->name_len;
+ volume->creation_time = TIME64_MIN;
+ volume->update_time = TIME64_MIN;
refcount_set(&volume->ref, 1);
INIT_HLIST_NODE(&volume->proc_link);
+ INIT_WORK(&volume->destructor, afs_destroy_volume);
rwlock_init(&volume->servers_lock);
+ mutex_init(&volume->volsync_lock);
+ mutex_init(&volume->cb_check_lock);
rwlock_init(&volume->cb_v_break_lock);
+ INIT_LIST_HEAD(&volume->open_mmaps);
+ init_rwsem(&volume->open_mmaps_lock);
memcpy(volume->name, vldb->name, vldb->name_len + 1);
- slist = afs_alloc_server_list(params->cell, params->key, vldb, type_mask);
+ for (i = 0; i < AFS_MAXTYPES; i++)
+ volume->vids[i] = vldb->vid[i];
+
+ slist = afs_alloc_server_list(volume, params->key, vldb);
if (IS_ERR(slist)) {
ret = PTR_ERR(slist);
goto error_1;
}
- refcount_set(&slist->usage, 1);
+ *_slist = slist;
rcu_assign_pointer(volume->servers, slist);
trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc);
return volume;
@@ -117,18 +129,20 @@ error_0:
* Look up or allocate a volume record.
*/
static struct afs_volume *afs_lookup_volume(struct afs_fs_context *params,
- struct afs_vldb_entry *vldb,
- unsigned long type_mask)
+ struct afs_vldb_entry *vldb)
{
+ struct afs_server_list *slist;
struct afs_volume *candidate, *volume;
- candidate = afs_alloc_volume(params, vldb, type_mask);
+ candidate = afs_alloc_volume(params, vldb, &slist);
if (IS_ERR(candidate))
return candidate;
volume = afs_insert_volume_into_cell(params->cell, candidate);
- if (volume != candidate)
- afs_put_volume(params->net, candidate, afs_volume_trace_put_cell_dup);
+ if (volume == candidate)
+ afs_attach_volume_to_servers(volume, slist);
+ else
+ afs_put_volume(candidate, afs_volume_trace_put_cell_dup);
return volume;
}
@@ -208,8 +222,7 @@ struct afs_volume *afs_create_volume(struct afs_fs_context *params)
goto error;
}
- type_mask = 1UL << params->type;
- volume = afs_lookup_volume(params, vldb, type_mask);
+ volume = afs_lookup_volume(params, vldb);
error:
kfree(vldb);
@@ -219,16 +232,20 @@ error:
/*
* Destroy a volume record
*/
-static void afs_destroy_volume(struct afs_net *net, struct afs_volume *volume)
+static void afs_destroy_volume(struct work_struct *work)
{
+ struct afs_volume *volume = container_of(work, struct afs_volume, destructor);
+ struct afs_server_list *slist = rcu_access_pointer(volume->servers);
+
_enter("%p", volume);
#ifdef CONFIG_AFS_FSCACHE
ASSERTCMP(volume->cache, ==, NULL);
#endif
+ afs_detach_volume_from_servers(volume, slist);
afs_remove_volume_from_cell(volume);
- afs_put_serverlist(net, rcu_access_pointer(volume->servers));
+ afs_put_serverlist(volume->cell->net, slist);
afs_put_cell(volume->cell, afs_cell_trace_put_vol);
trace_afs_volume(volume->vid, refcount_read(&volume->ref),
afs_volume_trace_free);
@@ -270,8 +287,7 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume,
/*
* Drop a reference on a volume record.
*/
-void afs_put_volume(struct afs_net *net, struct afs_volume *volume,
- enum afs_volume_trace reason)
+void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason)
{
if (volume) {
afs_volid_t vid = volume->vid;
@@ -281,7 +297,7 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume,
zero = __refcount_dec_and_test(&volume->ref, &r);
trace_afs_volume(vid, r - 1, reason);
if (zero)
- afs_destroy_volume(net, volume);
+ schedule_work(&volume->destructor);
}
}
@@ -337,7 +353,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
{
struct afs_server_list *new, *old, *discard;
struct afs_vldb_entry *vldb;
- char idbuf[16];
+ char idbuf[24];
int ret, idsz;
_enter("");
@@ -345,7 +361,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
/* We look up an ID by passing it as a decimal string in the
* operation's name parameter.
*/
- idsz = sprintf(idbuf, "%llu", volume->vid);
+ idsz = snprintf(idbuf, sizeof(idbuf), "%llu", volume->vid);
vldb = afs_vl_lookup_vldb(volume->cell, key, idbuf, idsz);
if (IS_ERR(vldb)) {
@@ -362,8 +378,7 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
}
/* See if the volume's server list got updated. */
- new = afs_alloc_server_list(volume->cell, key,
- vldb, (1 << volume->type));
+ new = afs_alloc_server_list(volume, key, vldb);
if (IS_ERR(new)) {
ret = PTR_ERR(new);
goto error_vldb;
@@ -382,11 +397,17 @@ static int afs_update_volume_status(struct afs_volume *volume, struct key *key)
discard = old;
}
- volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
+ /* Check more often if replication is ongoing. */
+ if (new->ro_replicating)
+ volume->update_at = ktime_get_real_seconds() + 10 * 60;
+ else
+ volume->update_at = ktime_get_real_seconds() + afs_volume_record_life;
write_unlock(&volume->servers_lock);
- ret = 0;
+ if (discard == old)
+ afs_reattach_volume_to_servers(volume, new, old);
afs_put_serverlist(volume->cell->net, discard);
+ ret = 0;
error_vldb:
kfree(vldb);
error:
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 4a168781936b..74402d95a884 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -12,309 +12,17 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/netfs.h>
+#include <trace/events/netfs.h>
#include "internal.h"
-static int afs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- loff_t start, loff_t end, loff_t *_next,
- bool max_one_loop);
-
-static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len,
- loff_t i_size, bool caching);
-
-#ifdef CONFIG_AFS_FSCACHE
-/*
- * Mark a page as having been made dirty and thus needing writeback. We also
- * need to pin the cache object to write back to.
- */
-bool afs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
- return fscache_dirty_folio(mapping, folio,
- afs_vnode_cache(AFS_FS_I(mapping->host)));
-}
-static void afs_folio_start_fscache(bool caching, struct folio *folio)
-{
- if (caching)
- folio_start_fscache(folio);
-}
-#else
-static void afs_folio_start_fscache(bool caching, struct folio *folio)
-{
-}
-#endif
-
-/*
- * Flush out a conflicting write. This may extend the write to the surrounding
- * pages if also dirty and contiguous to the conflicting region..
- */
-static int afs_flush_conflicting_write(struct address_space *mapping,
- struct folio *folio)
-{
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .range_start = folio_pos(folio),
- .range_end = LLONG_MAX,
- };
- loff_t next;
-
- return afs_writepages_region(mapping, &wbc, folio_pos(folio), LLONG_MAX,
- &next, true);
-}
-
-/*
- * prepare to perform part of a write to a page
- */
-int afs_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len,
- struct page **_page, void **fsdata)
-{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- struct folio *folio;
- unsigned long priv;
- unsigned f, from;
- unsigned t, to;
- pgoff_t index;
- int ret;
-
- _enter("{%llx:%llu},%llx,%x",
- vnode->fid.vid, vnode->fid.vnode, pos, len);
-
- /* Prefetch area to be written into the cache if we're caching this
- * file. We need to do this before we get a lock on the page in case
- * there's more than one writer competing for the same cache block.
- */
- ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata);
- if (ret < 0)
- return ret;
-
- index = folio_index(folio);
- from = pos - index * PAGE_SIZE;
- to = from + len;
-
-try_again:
- /* See if this page is already partially written in a way that we can
- * merge the new write with.
- */
- if (folio_test_private(folio)) {
- priv = (unsigned long)folio_get_private(folio);
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- ASSERTCMP(f, <=, t);
-
- if (folio_test_writeback(folio)) {
- trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio);
- folio_unlock(folio);
- goto wait_for_writeback;
- }
- /* If the file is being filled locally, allow inter-write
- * spaces to be merged into writes. If it's not, only write
- * back what the user gives us.
- */
- if (!test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags) &&
- (to < f || from > t))
- goto flush_conflicting_write;
- }
-
- *_page = folio_file_page(folio, pos / PAGE_SIZE);
- _leave(" = 0");
- return 0;
-
- /* The previous write and this write aren't adjacent or overlapping, so
- * flush the page out.
- */
-flush_conflicting_write:
- trace_afs_folio_dirty(vnode, tracepoint_string("confl"), folio);
- folio_unlock(folio);
-
- ret = afs_flush_conflicting_write(mapping, folio);
- if (ret < 0)
- goto error;
-
-wait_for_writeback:
- ret = folio_wait_writeback_killable(folio);
- if (ret < 0)
- goto error;
-
- ret = folio_lock_killable(folio);
- if (ret < 0)
- goto error;
- goto try_again;
-
-error:
- folio_put(folio);
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
- * finalise part of a write to a page
- */
-int afs_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *subpage, void *fsdata)
-{
- struct folio *folio = page_folio(subpage);
- struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- unsigned long priv;
- unsigned int f, from = offset_in_folio(folio, pos);
- unsigned int t, to = from + copied;
- loff_t i_size, write_end_pos;
-
- _enter("{%llx:%llu},{%lx}",
- vnode->fid.vid, vnode->fid.vnode, folio_index(folio));
-
- if (!folio_test_uptodate(folio)) {
- if (copied < len) {
- copied = 0;
- goto out;
- }
-
- folio_mark_uptodate(folio);
- }
-
- if (copied == 0)
- goto out;
-
- write_end_pos = pos + copied;
-
- i_size = i_size_read(&vnode->netfs.inode);
- if (write_end_pos > i_size) {
- write_seqlock(&vnode->cb_lock);
- i_size = i_size_read(&vnode->netfs.inode);
- if (write_end_pos > i_size)
- afs_set_i_size(vnode, write_end_pos);
- write_sequnlock(&vnode->cb_lock);
- fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos);
- }
-
- if (folio_test_private(folio)) {
- priv = (unsigned long)folio_get_private(folio);
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- if (from < f)
- f = from;
- if (to > t)
- t = to;
- priv = afs_folio_dirty(folio, f, t);
- folio_change_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("dirty+"), folio);
- } else {
- priv = afs_folio_dirty(folio, from, to);
- folio_attach_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("dirty"), folio);
- }
-
- if (folio_mark_dirty(folio))
- _debug("dirtied %lx", folio_index(folio));
-
-out:
- folio_unlock(folio);
- folio_put(folio);
- return copied;
-}
-
-/*
- * kill all the pages in the given range
- */
-static void afs_kill_pages(struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("{%llx:%llu},%llx @%llx",
- vnode->fid.vid, vnode->fid.vnode, len, start);
-
- do {
- _debug("kill %lx (to %lx)", index, last);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = folio_next_index(folio);
-
- folio_clear_uptodate(folio);
- folio_end_writeback(folio);
- folio_lock(folio);
- generic_error_remove_page(mapping, &folio->page);
- folio_unlock(folio);
- folio_put(folio);
-
- } while (index = next, index <= last);
-
- _leave("");
-}
-
-/*
- * Redirty all the pages in a given range.
- */
-static void afs_redirty_pages(struct writeback_control *wbc,
- struct address_space *mapping,
- loff_t start, loff_t len)
-{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct folio *folio;
- pgoff_t index = start / PAGE_SIZE;
- pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
-
- _enter("{%llx:%llu},%llx @%llx",
- vnode->fid.vid, vnode->fid.vnode, len, start);
-
- do {
- _debug("redirty %llx @%llx", len, start);
-
- folio = filemap_get_folio(mapping, index);
- if (IS_ERR(folio)) {
- next = index + 1;
- continue;
- }
-
- next = index + folio_nr_pages(folio);
- folio_redirty_for_writepage(wbc, folio);
- folio_end_writeback(folio);
- folio_put(folio);
- } while (index = next, index <= last);
-
- _leave("");
-}
-
/*
* completion of write to server
*/
static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)
{
- struct address_space *mapping = vnode->netfs.inode.i_mapping;
- struct folio *folio;
- pgoff_t end;
-
- XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
-
_enter("{%llx:%llu},{%x @%llx}",
vnode->fid.vid, vnode->fid.vnode, len, start);
- rcu_read_lock();
-
- end = (start + len - 1) / PAGE_SIZE;
- xas_for_each(&xas, folio, end) {
- if (!folio_test_writeback(folio)) {
- kdebug("bad %x @%llx page %lx %lx",
- len, start, folio_index(folio), end);
- ASSERT(folio_test_writeback(folio));
- }
-
- trace_afs_folio_dirty(vnode, tracepoint_string("clear"), folio);
- folio_detach_private(folio);
- folio_end_writeback(folio);
- }
-
- rcu_read_unlock();
-
afs_prune_wb_keys(vnode);
_leave("");
}
@@ -366,7 +74,7 @@ static void afs_store_data_success(struct afs_operation *op)
op->ctime = op->file[0].scb.status.mtime_client;
afs_vnode_commit_status(op, &op->file[0]);
- if (op->error == 0) {
+ if (!afs_op_error(op)) {
if (!op->store.laundering)
afs_pages_written_back(vnode, op->store.pos, op->store.size);
afs_stat_v(vnode, n_stores);
@@ -428,7 +136,7 @@ try_next_key:
afs_wait_for_operation(op);
- switch (op->error) {
+ switch (afs_op_error(op)) {
case -EACCES:
case -EPERM:
case -ENOKEY:
@@ -447,369 +155,57 @@ try_next_key:
}
afs_put_wb_key(wbk);
- _leave(" = %d", op->error);
+ _leave(" = %d", afs_op_error(op));
return afs_put_operation(op);
}
-/*
- * Extend the region to be written back to include subsequent contiguously
- * dirty pages if possible, but don't sleep while doing so.
- *
- * If this page holds new content, then we can include filler zeros in the
- * writeback.
- */
-static void afs_extend_writeback(struct address_space *mapping,
- struct afs_vnode *vnode,
- long *_count,
- loff_t start,
- loff_t max_len,
- bool new_content,
- bool caching,
- unsigned int *_len)
+static void afs_upload_to_server(struct netfs_io_subrequest *subreq)
{
- struct folio_batch fbatch;
- struct folio *folio;
- unsigned long priv;
- unsigned int psize, filler = 0;
- unsigned int f, t;
- loff_t len = *_len;
- pgoff_t index = (start + len) / PAGE_SIZE;
- bool stop = true;
- unsigned int i;
-
- XA_STATE(xas, &mapping->i_pages, index);
- folio_batch_init(&fbatch);
-
- do {
- /* Firstly, we gather up a batch of contiguous dirty pages
- * under the RCU read lock - but we can't clear the dirty flags
- * there if any of those pages are mapped.
- */
- rcu_read_lock();
-
- xas_for_each(&xas, folio, ULONG_MAX) {
- stop = true;
- if (xas_retry(&xas, folio))
- continue;
- if (xa_is_value(folio))
- break;
- if (folio_index(folio) != index)
- break;
-
- if (!folio_try_get_rcu(folio)) {
- xas_reset(&xas);
- continue;
- }
-
- /* Has the page moved or been split? */
- if (unlikely(folio != xas_reload(&xas))) {
- folio_put(folio);
- break;
- }
-
- if (!folio_trylock(folio)) {
- folio_put(folio);
- break;
- }
- if (!folio_test_dirty(folio) ||
- folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- folio_put(folio);
- break;
- }
-
- psize = folio_size(folio);
- priv = (unsigned long)folio_get_private(folio);
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- if (f != 0 && !new_content) {
- folio_unlock(folio);
- folio_put(folio);
- break;
- }
-
- len += filler + t;
- filler = psize - t;
- if (len >= max_len || *_count <= 0)
- stop = true;
- else if (t == psize || new_content)
- stop = false;
-
- index += folio_nr_pages(folio);
- if (!folio_batch_add(&fbatch, folio))
- break;
- if (stop)
- break;
- }
-
- if (!stop)
- xas_pause(&xas);
- rcu_read_unlock();
-
- /* Now, if we obtained any folios, we can shift them to being
- * writable and mark them for caching.
- */
- if (!folio_batch_count(&fbatch))
- break;
-
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
- folio = fbatch.folios[i];
- trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio);
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- if (folio_start_writeback(folio))
- BUG();
- afs_folio_start_fscache(caching, folio);
-
- *_count -= folio_nr_pages(folio);
- folio_unlock(folio);
- }
+ struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
+ ssize_t ret;
- folio_batch_release(&fbatch);
- cond_resched();
- } while (!stop);
+ _enter("%x[%x],%zx",
+ subreq->rreq->debug_id, subreq->debug_index, subreq->io_iter.count);
- *_len = len;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+ ret = afs_store_data(vnode, &subreq->io_iter, subreq->start,
+ subreq->rreq->origin == NETFS_LAUNDER_WRITE);
+ netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len,
+ false);
}
-/*
- * Synchronously write back the locked page and any subsequent non-locked dirty
- * pages.
- */
-static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping,
- struct writeback_control *wbc,
- struct folio *folio,
- loff_t start, loff_t end)
+static void afs_upload_to_server_worker(struct work_struct *work)
{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct iov_iter iter;
- unsigned long priv;
- unsigned int offset, to, len, max_len;
- loff_t i_size = i_size_read(&vnode->netfs.inode);
- bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
- bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode));
- long count = wbc->nr_to_write;
- int ret;
-
- _enter(",%lx,%llx-%llx", folio_index(folio), start, end);
-
- if (folio_start_writeback(folio))
- BUG();
- afs_folio_start_fscache(caching, folio);
-
- count -= folio_nr_pages(folio);
-
- /* Find all consecutive lockable dirty pages that have contiguous
- * written regions, stopping when we find a page that is not
- * immediately lockable, is not dirty or is missing, or we reach the
- * end of the range.
- */
- priv = (unsigned long)folio_get_private(folio);
- offset = afs_folio_dirty_from(folio, priv);
- to = afs_folio_dirty_to(folio, priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("store"), folio);
-
- len = to - offset;
- start += offset;
- if (start < i_size) {
- /* Trim the write to the EOF; the extra data is ignored. Also
- * put an upper limit on the size of a single storedata op.
- */
- max_len = 65536 * 4096;
- max_len = min_t(unsigned long long, max_len, end - start + 1);
- max_len = min_t(unsigned long long, max_len, i_size - start);
-
- if (len < max_len &&
- (to == folio_size(folio) || new_content))
- afs_extend_writeback(mapping, vnode, &count,
- start, max_len, new_content,
- caching, &len);
- len = min_t(loff_t, len, max_len);
- }
-
- /* We now have a contiguous set of dirty pages, each with writeback
- * set; the first page is still locked at this point, but all the rest
- * have been unlocked.
- */
- folio_unlock(folio);
-
- if (start < i_size) {
- _debug("write back %x @%llx [%llx]", len, start, i_size);
-
- /* Speculatively write to the cache. We have to fix this up
- * later if the store fails.
- */
- afs_write_to_cache(vnode, start, len, i_size, caching);
-
- iov_iter_xarray(&iter, ITER_SOURCE, &mapping->i_pages, start, len);
- ret = afs_store_data(vnode, &iter, start, false);
- } else {
- _debug("write discard %x @%llx [%llx]", len, start, i_size);
-
- /* The dirty region was entirely beyond the EOF. */
- fscache_clear_page_bits(mapping, start, len, caching);
- afs_pages_written_back(vnode, start, len);
- ret = 0;
- }
-
- switch (ret) {
- case 0:
- wbc->nr_to_write = count;
- ret = len;
- break;
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
- default:
- pr_notice("kAFS: Unexpected error from FS.StoreData %d\n", ret);
- fallthrough;
- case -EACCES:
- case -EPERM:
- case -ENOKEY:
- case -EKEYEXPIRED:
- case -EKEYREJECTED:
- case -EKEYREVOKED:
- case -ENETRESET:
- afs_redirty_pages(wbc, mapping, start, len);
- mapping_set_error(mapping, ret);
- break;
-
- case -EDQUOT:
- case -ENOSPC:
- afs_redirty_pages(wbc, mapping, start, len);
- mapping_set_error(mapping, -ENOSPC);
- break;
-
- case -EROFS:
- case -EIO:
- case -EREMOTEIO:
- case -EFBIG:
- case -ENOENT:
- case -ENOMEDIUM:
- case -ENXIO:
- trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail);
- afs_kill_pages(mapping, start, len);
- mapping_set_error(mapping, ret);
- break;
- }
-
- _leave(" = %d", ret);
- return ret;
+ afs_upload_to_server(subreq);
}
/*
- * write a region of pages back to the server
+ * Set up write requests for a writeback slice. We need to add a write request
+ * for each write we want to make.
*/
-static int afs_writepages_region(struct address_space *mapping,
- struct writeback_control *wbc,
- loff_t start, loff_t end, loff_t *_next,
- bool max_one_loop)
+void afs_create_write_requests(struct netfs_io_request *wreq, loff_t start, size_t len)
{
- struct folio *folio;
- struct folio_batch fbatch;
- ssize_t ret;
- unsigned int i;
- int n, skips = 0;
-
- _enter("%llx,%llx,", start, end);
- folio_batch_init(&fbatch);
-
- do {
- pgoff_t index = start / PAGE_SIZE;
-
- n = filemap_get_folios_tag(mapping, &index, end / PAGE_SIZE,
- PAGECACHE_TAG_DIRTY, &fbatch);
-
- if (!n)
- break;
- for (i = 0; i < n; i++) {
- folio = fbatch.folios[i];
- start = folio_pos(folio); /* May regress with THPs */
-
- _debug("wback %lx", folio_index(folio));
-
- /* At this point we hold neither the i_pages lock nor the
- * page lock: the page may be truncated or invalidated
- * (changing page->mapping to NULL), or even swizzled
- * back from swapper_space to tmpfs file mapping
- */
-try_again:
- if (wbc->sync_mode != WB_SYNC_NONE) {
- ret = folio_lock_killable(folio);
- if (ret < 0) {
- folio_batch_release(&fbatch);
- return ret;
- }
- } else {
- if (!folio_trylock(folio))
- continue;
- }
-
- if (folio->mapping != mapping ||
- !folio_test_dirty(folio)) {
- start += folio_size(folio);
- folio_unlock(folio);
- continue;
- }
-
- if (folio_test_writeback(folio) ||
- folio_test_fscache(folio)) {
- folio_unlock(folio);
- if (wbc->sync_mode != WB_SYNC_NONE) {
- folio_wait_writeback(folio);
-#ifdef CONFIG_AFS_FSCACHE
- folio_wait_fscache(folio);
-#endif
- goto try_again;
- }
-
- start += folio_size(folio);
- if (wbc->sync_mode == WB_SYNC_NONE) {
- if (skips >= 5 || need_resched()) {
- *_next = start;
- folio_batch_release(&fbatch);
- _leave(" = 0 [%llx]", *_next);
- return 0;
- }
- skips++;
- }
- continue;
- }
-
- if (!folio_clear_dirty_for_io(folio))
- BUG();
- ret = afs_write_back_from_locked_folio(mapping, wbc,
- folio, start, end);
- if (ret < 0) {
- _leave(" = %zd", ret);
- folio_batch_release(&fbatch);
- return ret;
- }
-
- start += ret;
- }
+ struct netfs_io_subrequest *subreq;
- folio_batch_release(&fbatch);
- cond_resched();
- } while (wbc->nr_to_write > 0);
+ _enter("%x,%llx-%llx", wreq->debug_id, start, start + len);
- *_next = start;
- _leave(" = 0 [%llx]", *_next);
- return 0;
+ subreq = netfs_create_write_request(wreq, NETFS_UPLOAD_TO_SERVER,
+ start, len, afs_upload_to_server_worker);
+ if (subreq)
+ netfs_queue_write_request(subreq);
}
/*
* write some of the pending data back to the server
*/
-int afs_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
+int afs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- loff_t start, next;
int ret;
- _enter("");
-
/* We have to be careful as we can end up racing with setattr()
* truncating the pagecache since the caller doesn't take a lock here
* to prevent it.
@@ -819,69 +215,12 @@ int afs_writepages(struct address_space *mapping,
else if (!down_read_trylock(&vnode->validate_lock))
return 0;
- if (wbc->range_cyclic) {
- start = mapping->writeback_index * PAGE_SIZE;
- ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX,
- &next, false);
- if (ret == 0) {
- mapping->writeback_index = next / PAGE_SIZE;
- if (start > 0 && wbc->nr_to_write > 0) {
- ret = afs_writepages_region(mapping, wbc, 0,
- start, &next, false);
- if (ret == 0)
- mapping->writeback_index =
- next / PAGE_SIZE;
- }
- }
- } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX,
- &next, false);
- if (wbc->nr_to_write > 0 && ret == 0)
- mapping->writeback_index = next / PAGE_SIZE;
- } else {
- ret = afs_writepages_region(mapping, wbc,
- wbc->range_start, wbc->range_end,
- &next, false);
- }
-
+ ret = netfs_writepages(mapping, wbc);
up_read(&vnode->validate_lock);
- _leave(" = %d", ret);
return ret;
}
/*
- * write to an AFS file
- */
-ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from)
-{
- struct afs_vnode *vnode = AFS_FS_I(file_inode(iocb->ki_filp));
- struct afs_file *af = iocb->ki_filp->private_data;
- ssize_t result;
- size_t count = iov_iter_count(from);
-
- _enter("{%llx:%llu},{%zu},",
- vnode->fid.vid, vnode->fid.vnode, count);
-
- if (IS_SWAPFILE(&vnode->netfs.inode)) {
- printk(KERN_INFO
- "AFS: Attempt to write to active swap file!\n");
- return -EBUSY;
- }
-
- if (!count)
- return 0;
-
- result = afs_validate(vnode, af->key);
- if (result < 0)
- return result;
-
- result = generic_file_write_iter(iocb, from);
-
- _leave(" = %zd", result);
- return result;
-}
-
-/*
* flush any dirty pages for this process, and check for write errors.
* - the return status from this call provides a reliable indication of
* whether any write errors occurred for this process.
@@ -909,59 +248,11 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
{
- struct folio *folio = page_folio(vmf->page);
struct file *file = vmf->vma->vm_file;
- struct inode *inode = file_inode(file);
- struct afs_vnode *vnode = AFS_FS_I(inode);
- struct afs_file *af = file->private_data;
- unsigned long priv;
- vm_fault_t ret = VM_FAULT_RETRY;
-
- _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio));
-
- afs_validate(vnode, af->key);
- sb_start_pagefault(inode->i_sb);
-
- /* Wait for the page to be written to the cache before we allow it to
- * be modified. We then assume the entire page will need writing back.
- */
-#ifdef CONFIG_AFS_FSCACHE
- if (folio_test_fscache(folio) &&
- folio_wait_fscache_killable(folio) < 0)
- goto out;
-#endif
-
- if (folio_wait_writeback_killable(folio))
- goto out;
-
- if (folio_lock_killable(folio) < 0)
- goto out;
-
- /* We mustn't change folio->private until writeback is complete as that
- * details the portion of the page we need to write back and we might
- * need to redirty the page if there's a problem.
- */
- if (folio_wait_writeback_killable(folio) < 0) {
- folio_unlock(folio);
- goto out;
- }
-
- priv = afs_folio_dirty(folio, 0, folio_size(folio));
- priv = afs_folio_dirty_mmapped(priv);
- if (folio_test_private(folio)) {
- folio_change_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite+"), folio);
- } else {
- folio_attach_private(folio, (void *)priv);
- trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite"), folio);
- }
- file_update_time(file);
-
- ret = VM_FAULT_LOCKED;
-out:
- sb_end_pagefault(inode->i_sb);
- return ret;
+ if (afs_validate(AFS_FS_I(file_inode(file)), afs_file_key(file)) < 0)
+ return VM_FAULT_SIGBUS;
+ return netfs_page_mkwrite(vmf, NULL);
}
/*
@@ -991,64 +282,3 @@ void afs_prune_wb_keys(struct afs_vnode *vnode)
afs_put_wb_key(wbk);
}
}
-
-/*
- * Clean up a page during invalidation.
- */
-int afs_launder_folio(struct folio *folio)
-{
- struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio));
- struct iov_iter iter;
- struct bio_vec bv;
- unsigned long priv;
- unsigned int f, t;
- int ret = 0;
-
- _enter("{%lx}", folio->index);
-
- priv = (unsigned long)folio_get_private(folio);
- if (folio_clear_dirty_for_io(folio)) {
- f = 0;
- t = folio_size(folio);
- if (folio_test_private(folio)) {
- f = afs_folio_dirty_from(folio, priv);
- t = afs_folio_dirty_to(folio, priv);
- }
-
- bvec_set_folio(&bv, folio, t - f, f);
- iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, bv.bv_len);
-
- trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio);
- ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true);
- }
-
- trace_afs_folio_dirty(vnode, tracepoint_string("laundered"), folio);
- folio_detach_private(folio);
- folio_wait_fscache(folio);
- return ret;
-}
-
-/*
- * Deal with the completion of writing the data to the cache.
- */
-static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error,
- bool was_async)
-{
- struct afs_vnode *vnode = priv;
-
- if (IS_ERR_VALUE(transferred_or_error) &&
- transferred_or_error != -ENOBUFS)
- afs_invalidate_cache(vnode, 0);
-}
-
-/*
- * Save the write to the cache also.
- */
-static void afs_write_to_cache(struct afs_vnode *vnode,
- loff_t start, size_t len, loff_t i_size,
- bool caching)
-{
- fscache_write_to_cache(afs_vnode_cache(vnode),
- vnode->netfs.inode.i_mapping, start, len, i_size,
- afs_write_to_cache_done, vnode, caching);
-}
diff --git a/fs/afs/xattr.c b/fs/afs/xattr.c
index 64b2c0224f62..e19f396aa370 100644
--- a/fs/afs/xattr.c
+++ b/fs/afs/xattr.c
@@ -75,7 +75,7 @@ static bool afs_make_acl(struct afs_operation *op,
{
struct afs_acl *acl;
- acl = kmalloc(sizeof(*acl) + size, GFP_KERNEL);
+ acl = kmalloc(struct_size(acl, data, size), GFP_KERNEL);
if (!acl) {
afs_op_nomem(op);
return false;
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index 11571cca86c1..f521e66d3bf6 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -245,12 +245,15 @@ static void xdr_decode_YFSVolSync(const __be32 **_bp,
struct afs_volsync *volsync)
{
struct yfs_xdr_YFSVolSync *x = (void *)*_bp;
- u64 creation;
+ u64 creation, update;
if (volsync) {
creation = xdr_to_u64(x->vol_creation_date);
do_div(creation, 10 * 1000 * 1000);
volsync->creation = creation;
+ update = xdr_to_u64(x->vol_update_date);
+ do_div(update, 10 * 1000 * 1000);
+ volsync->update = update;
}
*_bp += xdr_size(x);
@@ -490,6 +493,7 @@ void yfs_fs_fetch_data(struct afs_operation *op)
bp = xdr_encode_u64(bp, req->len);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -572,6 +576,7 @@ void yfs_fs_create_file(struct afs_operation *op)
bp = xdr_encode_u32(bp, yfs_LockNone); /* ViceLockType */
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -620,6 +625,7 @@ void yfs_fs_make_dir(struct afs_operation *op)
bp = xdr_encode_YFSStoreStatus(bp, &op->create.mode, &op->mtime);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -704,6 +710,7 @@ void yfs_fs_remove_file2(struct afs_operation *op)
bp = xdr_encode_name(bp, name);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -773,6 +780,7 @@ void yfs_fs_remove_file(struct afs_operation *op)
bp = xdr_encode_name(bp, name);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -814,6 +822,7 @@ void yfs_fs_remove_dir(struct afs_operation *op)
bp = xdr_encode_name(bp, name);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -887,6 +896,7 @@ void yfs_fs_link(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call1(call, &vp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -968,6 +978,7 @@ void yfs_fs_symlink(struct afs_operation *op)
bp = xdr_encode_YFSStoreStatus(bp, &mode, &op->mtime);
yfs_check_req(call, bp);
+ call->fid = dvp->fid;
trace_afs_make_fs_call1(call, &dvp->fid, name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1047,6 +1058,7 @@ void yfs_fs_rename(struct afs_operation *op)
bp = xdr_encode_name(bp, new_name);
yfs_check_req(call, bp);
+ call->fid = orig_dvp->fid;
trace_afs_make_fs_call2(call, &orig_dvp->fid, orig_name, new_name);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1102,6 +1114,7 @@ void yfs_fs_store_data(struct afs_operation *op)
bp = xdr_encode_u64(bp, op->store.i_size);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1158,6 +1171,7 @@ static void yfs_fs_setattr_size(struct afs_operation *op)
bp = xdr_encode_u64(bp, attr->ia_size); /* new file length */
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1196,6 +1210,7 @@ void yfs_fs_setattr(struct afs_operation *op)
bp = xdr_encode_YFS_StoreStatus(bp, attr);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1366,6 +1381,7 @@ void yfs_fs_get_volume_status(struct afs_operation *op)
bp = xdr_encode_u64(bp, vp->fid.vid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1430,6 +1446,7 @@ void yfs_fs_set_lock(struct afs_operation *op)
bp = xdr_encode_u32(bp, op->lock.type);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_calli(call, &vp->fid, op->lock.type);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1460,6 +1477,7 @@ void yfs_fs_extend_lock(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1490,6 +1508,7 @@ void yfs_fs_release_lock(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1556,6 +1575,7 @@ void yfs_fs_fetch_status(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1736,6 +1756,7 @@ void yfs_fs_inline_bulk_status(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &op->more_files[i].fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
@@ -1898,6 +1919,7 @@ void yfs_fs_fetch_opaque_acl(struct afs_operation *op)
bp = xdr_encode_YFSFid(bp, &vp->fid);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_KERNEL);
}
@@ -1948,6 +1970,7 @@ void yfs_fs_store_opaque_acl2(struct afs_operation *op)
bp += size / sizeof(__be32);
yfs_check_req(call, bp);
+ call->fid = vp->fid;
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_KERNEL);
}
diff --git a/fs/aio.c b/fs/aio.c
index f8589caef9c1..9cdaa2faa536 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -239,7 +239,6 @@ static struct ctl_table aio_sysctls[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
- {}
};
static void __init aio_sysctl_init(void)
@@ -266,7 +265,7 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
return ERR_CAST(inode);
inode->i_mapping->a_ops = &aio_ctx_aops;
- inode->i_mapping->private_data = ctx;
+ inode->i_mapping->i_private_data = ctx;
inode->i_size = PAGE_SIZE * nr_pages;
file = alloc_file_pseudo(inode, aio_mnt, "[aio]",
@@ -316,10 +315,10 @@ static void put_aio_ring_file(struct kioctx *ctx)
/* Prevent further access to the kioctx from migratepages */
i_mapping = aio_ring_file->f_mapping;
- spin_lock(&i_mapping->private_lock);
- i_mapping->private_data = NULL;
+ spin_lock(&i_mapping->i_private_lock);
+ i_mapping->i_private_data = NULL;
ctx->aio_ring_file = NULL;
- spin_unlock(&i_mapping->private_lock);
+ spin_unlock(&i_mapping->i_private_lock);
fput(aio_ring_file);
}
@@ -422,9 +421,9 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
rc = 0;
- /* mapping->private_lock here protects against the kioctx teardown. */
- spin_lock(&mapping->private_lock);
- ctx = mapping->private_data;
+ /* mapping->i_private_lock here protects against the kioctx teardown. */
+ spin_lock(&mapping->i_private_lock);
+ ctx = mapping->i_private_data;
if (!ctx) {
rc = -EINVAL;
goto out;
@@ -476,7 +475,7 @@ static int aio_migrate_folio(struct address_space *mapping, struct folio *dst,
out_unlock:
mutex_unlock(&ctx->ring_lock);
out:
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return rc;
}
#else
@@ -590,13 +589,24 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
void kiocb_set_cancel_fn(struct kiocb *iocb, kiocb_cancel_fn *cancel)
{
- struct aio_kiocb *req = container_of(iocb, struct aio_kiocb, rw);
- struct kioctx *ctx = req->ki_ctx;
+ struct aio_kiocb *req;
+ struct kioctx *ctx;
unsigned long flags;
+ /*
+ * kiocb didn't come from aio or is neither a read nor a write, hence
+ * ignore it.
+ */
+ if (!(iocb->ki_flags & IOCB_AIO_RW))
+ return;
+
+ req = container_of(iocb, struct aio_kiocb, rw);
+
if (WARN_ON_ONCE(!list_empty(&req->ki_list)))
return;
+ ctx = req->ki_ctx;
+
spin_lock_irqsave(&ctx->ctx_lock, flags);
list_add_tail(&req->ki_list, &ctx->active_reqs);
req->ki_cancel = cancel;
@@ -1106,6 +1116,11 @@ static inline void iocb_destroy(struct aio_kiocb *iocb)
kmem_cache_free(kiocb_cachep, iocb);
}
+struct aio_waiter {
+ struct wait_queue_entry w;
+ size_t min_nr;
+};
+
/* aio_complete
* Called when the io request on the given iocb is complete.
*/
@@ -1114,7 +1129,7 @@ static void aio_complete(struct aio_kiocb *iocb)
struct kioctx *ctx = iocb->ki_ctx;
struct aio_ring *ring;
struct io_event *ev_page, *event;
- unsigned tail, pos, head;
+ unsigned tail, pos, head, avail;
unsigned long flags;
/*
@@ -1156,6 +1171,10 @@ static void aio_complete(struct aio_kiocb *iocb)
ctx->completed_events++;
if (ctx->completed_events > 1)
refill_reqs_available(ctx, head, tail);
+
+ avail = tail > head
+ ? tail - head
+ : tail + ctx->nr_events - head;
spin_unlock_irqrestore(&ctx->completion_lock, flags);
pr_debug("added to ring %p at [%u]\n", iocb, tail);
@@ -1166,7 +1185,7 @@ static void aio_complete(struct aio_kiocb *iocb)
* from IRQ context.
*/
if (iocb->ki_eventfd)
- eventfd_signal(iocb->ki_eventfd, 1);
+ eventfd_signal(iocb->ki_eventfd);
/*
* We have to order our ring_info tail store above and test
@@ -1176,8 +1195,18 @@ static void aio_complete(struct aio_kiocb *iocb)
*/
smp_mb();
- if (waitqueue_active(&ctx->wait))
- wake_up(&ctx->wait);
+ if (waitqueue_active(&ctx->wait)) {
+ struct aio_waiter *curr, *next;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->wait.lock, flags);
+ list_for_each_entry_safe(curr, next, &ctx->wait.head, w.entry)
+ if (avail >= curr->min_nr) {
+ list_del_init_careful(&curr->w.entry);
+ wake_up_process(curr->w.private);
+ }
+ spin_unlock_irqrestore(&ctx->wait.lock, flags);
+ }
}
static inline void iocb_put(struct aio_kiocb *iocb)
@@ -1290,7 +1319,9 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
struct io_event __user *event,
ktime_t until)
{
- long ret = 0;
+ struct hrtimer_sleeper t;
+ struct aio_waiter w;
+ long ret = 0, ret2 = 0;
/*
* Note that aio_read_events() is being called as the conditional - i.e.
@@ -1306,12 +1337,38 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
* the ringbuffer empty. So in practice we should be ok, but it's
* something to be aware of when touching this code.
*/
- if (until == 0)
- aio_read_events(ctx, min_nr, nr, event, &ret);
- else
- wait_event_interruptible_hrtimeout(ctx->wait,
- aio_read_events(ctx, min_nr, nr, event, &ret),
- until);
+ aio_read_events(ctx, min_nr, nr, event, &ret);
+ if (until == 0 || ret < 0 || ret >= min_nr)
+ return ret;
+
+ hrtimer_init_sleeper_on_stack(&t, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ if (until != KTIME_MAX) {
+ hrtimer_set_expires_range_ns(&t.timer, until, current->timer_slack_ns);
+ hrtimer_sleeper_start_expires(&t, HRTIMER_MODE_REL);
+ }
+
+ init_wait(&w.w);
+
+ while (1) {
+ unsigned long nr_got = ret;
+
+ w.min_nr = min_nr - ret;
+
+ ret2 = prepare_to_wait_event(&ctx->wait, &w.w, TASK_INTERRUPTIBLE);
+ if (!ret2 && !t.task)
+ ret2 = -ETIME;
+
+ if (aio_read_events(ctx, min_nr, nr, event, &ret) || ret2)
+ break;
+
+ if (nr_got == ret)
+ schedule();
+ }
+
+ finish_wait(&ctx->wait, &w.w);
+ hrtimer_cancel(&t.timer);
+ destroy_hrtimer_on_stack(&t.timer);
+
return ret;
}
@@ -1463,7 +1520,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
req->ki_complete = aio_complete_rw;
req->private = NULL;
req->ki_pos = iocb->aio_offset;
- req->ki_flags = req->ki_filp->f_iocb_flags;
+ req->ki_flags = req->ki_filp->f_iocb_flags | IOCB_AIO_RW;
if (iocb->aio_flags & IOCB_FLAG_RESFD)
req->ki_flags |= IOCB_EVENTFD;
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
@@ -1498,7 +1555,7 @@ static ssize_t aio_setup_rw(int rw, const struct iocb *iocb,
size_t len = iocb->aio_nbytes;
if (!vectored) {
- ssize_t ret = import_single_range(rw, buf, len, *iovec, iter);
+ ssize_t ret = import_ubuf(rw, buf, len, iter);
*iovec = NULL;
return ret;
}
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index d26222b7eefe..0496cb5b6eab 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,7 +79,7 @@ static struct file *__anon_inode_getfile(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode,
- bool secure)
+ bool make_inode)
{
struct inode *inode;
struct file *file;
@@ -87,7 +87,7 @@ static struct file *__anon_inode_getfile(const char *name,
if (fops->owner && !try_module_get(fops->owner))
return ERR_PTR(-ENOENT);
- if (secure) {
+ if (make_inode) {
inode = anon_inode_make_secure_inode(name, context_inode);
if (IS_ERR(inode)) {
file = ERR_CAST(inode);
@@ -149,13 +149,10 @@ struct file *anon_inode_getfile(const char *name,
EXPORT_SYMBOL_GPL(anon_inode_getfile);
/**
- * anon_inode_getfile_secure - Like anon_inode_getfile(), but creates a new
+ * anon_inode_create_getfile - Like anon_inode_getfile(), but creates a new
* !S_PRIVATE anon inode rather than reuse the
* singleton anon inode and calls the
- * inode_init_security_anon() LSM hook. This
- * allows for both the inode to have its own
- * security context and for the LSM to enforce
- * policy on the inode's creation.
+ * inode_init_security_anon() LSM hook.
*
* @name: [in] name of the "class" of the new file
* @fops: [in] file operations for the new file
@@ -164,11 +161,21 @@ EXPORT_SYMBOL_GPL(anon_inode_getfile);
* @context_inode:
* [in] the logical relationship with the new inode (optional)
*
+ * Create a new anonymous inode and file pair. This can be done for two
+ * reasons:
+ *
+ * - for the inode to have its own security context, so that LSMs can enforce
+ * policy on the inode's creation;
+ *
+ * - if the caller needs a unique inode, for example in order to customize
+ * the size returned by fstat()
+ *
* The LSM may use @context_inode in inode_init_security_anon(), but a
- * reference to it is not held. Returns the newly created file* or an error
- * pointer. See the anon_inode_getfile() documentation for more information.
+ * reference to it is not held.
+ *
+ * Returns the newly created file* or an error pointer.
*/
-struct file *anon_inode_getfile_secure(const char *name,
+struct file *anon_inode_create_getfile(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode)
@@ -176,12 +183,13 @@ struct file *anon_inode_getfile_secure(const char *name,
return __anon_inode_getfile(name, fops, priv, flags,
context_inode, true);
}
+EXPORT_SYMBOL_GPL(anon_inode_create_getfile);
static int __anon_inode_getfd(const char *name,
const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode,
- bool secure)
+ bool make_inode)
{
int error, fd;
struct file *file;
@@ -192,7 +200,7 @@ static int __anon_inode_getfd(const char *name,
fd = error;
file = __anon_inode_getfile(name, fops, priv, flags, context_inode,
- secure);
+ make_inode);
if (IS_ERR(file)) {
error = PTR_ERR(file);
goto err_put_unused_fd;
@@ -231,10 +239,9 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
EXPORT_SYMBOL_GPL(anon_inode_getfd);
/**
- * anon_inode_getfd_secure - Like anon_inode_getfd(), but creates a new
+ * anon_inode_create_getfd - Like anon_inode_getfd(), but creates a new
* !S_PRIVATE anon inode rather than reuse the singleton anon inode, and calls
- * the inode_init_security_anon() LSM hook. This allows the inode to have its
- * own security context and for a LSM to reject creation of the inode.
+ * the inode_init_security_anon() LSM hook.
*
* @name: [in] name of the "class" of the new file
* @fops: [in] file operations for the new file
@@ -243,16 +250,26 @@ EXPORT_SYMBOL_GPL(anon_inode_getfd);
* @context_inode:
* [in] the logical relationship with the new inode (optional)
*
+ * Create a new anonymous inode and file pair. This can be done for two
+ * reasons:
+ *
+ * - for the inode to have its own security context, so that LSMs can enforce
+ * policy on the inode's creation;
+ *
+ * - if the caller needs a unique inode, for example in order to customize
+ * the size returned by fstat()
+ *
* The LSM may use @context_inode in inode_init_security_anon(), but a
* reference to it is not held.
+ *
+ * Returns a newly created file descriptor or an error code.
*/
-int anon_inode_getfd_secure(const char *name, const struct file_operations *fops,
+int anon_inode_create_getfd(const char *name, const struct file_operations *fops,
void *priv, int flags,
const struct inode *context_inode)
{
return __anon_inode_getfd(name, fops, priv, flags, context_inode, true);
}
-EXPORT_SYMBOL_GPL(anon_inode_getfd_secure);
static int __init anon_inode_init(void)
{
diff --git a/fs/attr.c b/fs/attr.c
index bdf5deb06ea9..5a13f0c8495f 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -157,7 +157,7 @@ static bool chgrp_ok(struct mnt_idmap *idmap,
* the vfsmount must be passed through @idmap. This function will then
* take care to map the inode according to @idmap before checking
* permissions. On non-idmapped mounts or if permission checking is to be
- * performed on the raw inode simply passs @nop_mnt_idmap.
+ * performed on the raw inode simply pass @nop_mnt_idmap.
*
* Should be called as the first thing in ->setattr implementations,
* possibly after taking additional locks.
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index 038b3d2d9f57..39d8c84c16f4 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -73,12 +73,9 @@ done:
/* p->d_lock held */
static struct dentry *positive_after(struct dentry *p, struct dentry *child)
{
- if (child)
- child = list_next_entry(child, d_child);
- else
- child = list_first_entry(&p->d_subdirs, struct dentry, d_child);
+ child = child ? d_next_sibling(child) : d_first_child(p);
- list_for_each_entry_from(child, &p->d_subdirs, d_child) {
+ hlist_for_each_entry_from(child, d_sib) {
spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(child)) {
dget_dlock(child);
diff --git a/fs/backing-file.c b/fs/backing-file.c
new file mode 100644
index 000000000000..a681f38d84d8
--- /dev/null
+++ b/fs/backing-file.c
@@ -0,0 +1,336 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Common helpers for stackable filesystems and backing files.
+ *
+ * Forked from fs/overlayfs/file.c.
+ *
+ * Copyright (C) 2017 Red Hat, Inc.
+ * Copyright (C) 2023 CTERA Networks.
+ */
+
+#include <linux/fs.h>
+#include <linux/backing-file.h>
+#include <linux/splice.h>
+#include <linux/mm.h>
+
+#include "internal.h"
+
+/**
+ * backing_file_open - open a backing file for kernel internal use
+ * @user_path: path that the user reuqested to open
+ * @flags: open flags
+ * @real_path: path of the backing file
+ * @cred: credentials for open
+ *
+ * Open a backing file for a stackable filesystem (e.g., overlayfs).
+ * @user_path may be on the stackable filesystem and @real_path on the
+ * underlying filesystem. In this case, we want to be able to return the
+ * @user_path of the stackable filesystem. This is done by embedding the
+ * returned file into a container structure that also stores the stacked
+ * file's path, which can be retrieved using backing_file_user_path().
+ */
+struct file *backing_file_open(const struct path *user_path, int flags,
+ const struct path *real_path,
+ const struct cred *cred)
+{
+ struct file *f;
+ int error;
+
+ f = alloc_empty_backing_file(flags, cred);
+ if (IS_ERR(f))
+ return f;
+
+ path_get(user_path);
+ *backing_file_user_path(f) = *user_path;
+ error = vfs_open(real_path, f);
+ if (error) {
+ fput(f);
+ f = ERR_PTR(error);
+ }
+
+ return f;
+}
+EXPORT_SYMBOL_GPL(backing_file_open);
+
+struct backing_aio {
+ struct kiocb iocb;
+ refcount_t ref;
+ struct kiocb *orig_iocb;
+ /* used for aio completion */
+ void (*end_write)(struct file *);
+ struct work_struct work;
+ long res;
+};
+
+static struct kmem_cache *backing_aio_cachep;
+
+#define BACKING_IOCB_MASK \
+ (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
+
+static rwf_t iocb_to_rw_flags(int flags)
+{
+ return (__force rwf_t)(flags & BACKING_IOCB_MASK);
+}
+
+static void backing_aio_put(struct backing_aio *aio)
+{
+ if (refcount_dec_and_test(&aio->ref)) {
+ fput(aio->iocb.ki_filp);
+ kmem_cache_free(backing_aio_cachep, aio);
+ }
+}
+
+static void backing_aio_cleanup(struct backing_aio *aio, long res)
+{
+ struct kiocb *iocb = &aio->iocb;
+ struct kiocb *orig_iocb = aio->orig_iocb;
+
+ if (aio->end_write)
+ aio->end_write(orig_iocb->ki_filp);
+
+ orig_iocb->ki_pos = iocb->ki_pos;
+ backing_aio_put(aio);
+}
+
+static void backing_aio_rw_complete(struct kiocb *iocb, long res)
+{
+ struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb);
+ struct kiocb *orig_iocb = aio->orig_iocb;
+
+ if (iocb->ki_flags & IOCB_WRITE)
+ kiocb_end_write(iocb);
+
+ backing_aio_cleanup(aio, res);
+ orig_iocb->ki_complete(orig_iocb, res);
+}
+
+static void backing_aio_complete_work(struct work_struct *work)
+{
+ struct backing_aio *aio = container_of(work, struct backing_aio, work);
+
+ backing_aio_rw_complete(&aio->iocb, aio->res);
+}
+
+static void backing_aio_queue_completion(struct kiocb *iocb, long res)
+{
+ struct backing_aio *aio = container_of(iocb, struct backing_aio, iocb);
+
+ /*
+ * Punt to a work queue to serialize updates of mtime/size.
+ */
+ aio->res = res;
+ INIT_WORK(&aio->work, backing_aio_complete_work);
+ queue_work(file_inode(aio->orig_iocb->ki_filp)->i_sb->s_dio_done_wq,
+ &aio->work);
+}
+
+static int backing_aio_init_wq(struct kiocb *iocb)
+{
+ struct super_block *sb = file_inode(iocb->ki_filp)->i_sb;
+
+ if (sb->s_dio_done_wq)
+ return 0;
+
+ return sb_init_dio_done_wq(sb);
+}
+
+
+ssize_t backing_file_read_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags,
+ struct backing_file_ctx *ctx)
+{
+ struct backing_aio *aio = NULL;
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ if (!iov_iter_count(iter))
+ return 0;
+
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ !(file->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
+
+ old_cred = override_creds(ctx->cred);
+ if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(flags);
+
+ ret = vfs_iter_read(file, iter, &iocb->ki_pos, rwf);
+ } else {
+ ret = -ENOMEM;
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ goto out;
+
+ aio->orig_iocb = iocb;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_complete = backing_aio_rw_complete;
+ refcount_set(&aio->ref, 2);
+ ret = vfs_iocb_iter_read(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
+ }
+out:
+ revert_creds(old_cred);
+
+ if (ctx->accessed)
+ ctx->accessed(ctx->user_file);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_read_iter);
+
+ssize_t backing_file_write_iter(struct file *file, struct iov_iter *iter,
+ struct kiocb *iocb, int flags,
+ struct backing_file_ctx *ctx)
+{
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ if (!iov_iter_count(iter))
+ return 0;
+
+ ret = file_remove_privs(ctx->user_file);
+ if (ret)
+ return ret;
+
+ if (iocb->ki_flags & IOCB_DIRECT &&
+ !(file->f_mode & FMODE_CAN_ODIRECT))
+ return -EINVAL;
+
+ /*
+ * Stacked filesystems don't support deferred completions, don't copy
+ * this property in case it is set by the issuer.
+ */
+ flags &= ~IOCB_DIO_CALLER_COMP;
+
+ old_cred = override_creds(ctx->cred);
+ if (is_sync_kiocb(iocb)) {
+ rwf_t rwf = iocb_to_rw_flags(flags);
+
+ ret = vfs_iter_write(file, iter, &iocb->ki_pos, rwf);
+ if (ctx->end_write)
+ ctx->end_write(ctx->user_file);
+ } else {
+ struct backing_aio *aio;
+
+ ret = backing_aio_init_wq(iocb);
+ if (ret)
+ goto out;
+
+ ret = -ENOMEM;
+ aio = kmem_cache_zalloc(backing_aio_cachep, GFP_KERNEL);
+ if (!aio)
+ goto out;
+
+ aio->orig_iocb = iocb;
+ aio->end_write = ctx->end_write;
+ kiocb_clone(&aio->iocb, iocb, get_file(file));
+ aio->iocb.ki_flags = flags;
+ aio->iocb.ki_complete = backing_aio_queue_completion;
+ refcount_set(&aio->ref, 2);
+ ret = vfs_iocb_iter_write(file, &aio->iocb, iter);
+ backing_aio_put(aio);
+ if (ret != -EIOCBQUEUED)
+ backing_aio_cleanup(aio, ret);
+ }
+out:
+ revert_creds(old_cred);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_write_iter);
+
+ssize_t backing_file_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags,
+ struct backing_file_ctx *ctx)
+{
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(in->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ old_cred = override_creds(ctx->cred);
+ ret = vfs_splice_read(in, ppos, pipe, len, flags);
+ revert_creds(old_cred);
+
+ if (ctx->accessed)
+ ctx->accessed(ctx->user_file);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_splice_read);
+
+ssize_t backing_file_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos, size_t len,
+ unsigned int flags,
+ struct backing_file_ctx *ctx)
+{
+ const struct cred *old_cred;
+ ssize_t ret;
+
+ if (WARN_ON_ONCE(!(out->f_mode & FMODE_BACKING)))
+ return -EIO;
+
+ ret = file_remove_privs(ctx->user_file);
+ if (ret)
+ return ret;
+
+ old_cred = override_creds(ctx->cred);
+ file_start_write(out);
+ ret = iter_file_splice_write(pipe, out, ppos, len, flags);
+ file_end_write(out);
+ revert_creds(old_cred);
+
+ if (ctx->end_write)
+ ctx->end_write(ctx->user_file);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_splice_write);
+
+int backing_file_mmap(struct file *file, struct vm_area_struct *vma,
+ struct backing_file_ctx *ctx)
+{
+ const struct cred *old_cred;
+ int ret;
+
+ if (WARN_ON_ONCE(!(file->f_mode & FMODE_BACKING)) ||
+ WARN_ON_ONCE(ctx->user_file != vma->vm_file))
+ return -EIO;
+
+ if (!file->f_op->mmap)
+ return -ENODEV;
+
+ vma_set_file(vma, file);
+
+ old_cred = override_creds(ctx->cred);
+ ret = call_mmap(vma->vm_file, vma);
+ revert_creds(old_cred);
+
+ if (ctx->accessed)
+ ctx->accessed(ctx->user_file);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(backing_file_mmap);
+
+static int __init backing_aio_init(void)
+{
+ backing_aio_cachep = kmem_cache_create("backing_aio",
+ sizeof(struct backing_aio),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!backing_aio_cachep)
+ return -ENOMEM;
+
+ return 0;
+}
+fs_initcall(backing_aio_init);
diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig
index fddc7be58022..5cdfef3b551a 100644
--- a/fs/bcachefs/Kconfig
+++ b/fs/bcachefs/Kconfig
@@ -50,14 +50,6 @@ config BCACHEFS_POSIX_ACL
depends on BCACHEFS_FS
select FS_POSIX_ACL
-config BCACHEFS_DEBUG_TRANSACTIONS
- bool "bcachefs runtime info"
- depends on BCACHEFS_FS
- help
- This makes the list of running btree transactions available in debugfs.
-
- This is a highly useful debugging feature but does add a small amount of overhead.
-
config BCACHEFS_DEBUG
bool "bcachefs debugging"
depends on BCACHEFS_FS
@@ -85,6 +77,16 @@ config BCACHEFS_NO_LATENCY_ACCT
help
This disables device latency tracking and time stats, only for performance testing
+config BCACHEFS_SIX_OPTIMISTIC_SPIN
+ bool "Optimistic spinning for six locks"
+ depends on BCACHEFS_FS
+ depends on SMP
+ default y
+ help
+ Instead of immediately sleeping when attempting to take a six lock that
+ is held by another thread, spin for a short while, as long as the
+ thread owning the lock is running.
+
config MEAN_AND_VARIANCE_UNIT_TEST
tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS
depends on KUNIT
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile
index b81268418174..1a05cecda7cc 100644
--- a/fs/bcachefs/Makefile
+++ b/fs/bcachefs/Makefile
@@ -27,7 +27,6 @@ bcachefs-y := \
checksum.o \
clock.o \
compress.o \
- counters.o \
darray.o \
debug.o \
dirent.o \
@@ -71,6 +70,7 @@ bcachefs-y := \
reflink.o \
replicas.o \
sb-clean.o \
+ sb-counters.o \
sb-downgrade.o \
sb-errors.o \
sb-members.o \
@@ -82,6 +82,7 @@ bcachefs-y := \
super-io.o \
sysfs.o \
tests.o \
+ thread_with_file.o \
trace.o \
two_state_shared_lock.o \
util.o \
diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c
index 1fec0e67891f..fd3e175d8342 100644
--- a/fs/bcachefs/alloc_background.c
+++ b/fs/bcachefs/alloc_background.c
@@ -261,10 +261,8 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
case BCH_DATA_free:
case BCH_DATA_need_gc_gens:
case BCH_DATA_need_discard:
- bkey_fsck_err_on(a.v->dirty_sectors ||
- a.v->cached_sectors ||
- a.v->stripe, c, err,
- alloc_key_empty_but_have_data,
+ bkey_fsck_err_on(bch2_bucket_sectors(*a.v) || a.v->stripe,
+ c, err, alloc_key_empty_but_have_data,
"empty data type free but have data");
break;
case BCH_DATA_sb:
@@ -272,22 +270,21 @@ int bch2_alloc_v4_invalid(struct bch_fs *c, struct bkey_s_c k,
case BCH_DATA_btree:
case BCH_DATA_user:
case BCH_DATA_parity:
- bkey_fsck_err_on(!a.v->dirty_sectors, c, err,
- alloc_key_dirty_sectors_0,
+ bkey_fsck_err_on(!bch2_bucket_sectors_dirty(*a.v),
+ c, err, alloc_key_dirty_sectors_0,
"data_type %s but dirty_sectors==0",
- bch2_data_types[a.v->data_type]);
+ bch2_data_type_str(a.v->data_type));
break;
case BCH_DATA_cached:
bkey_fsck_err_on(!a.v->cached_sectors ||
- a.v->dirty_sectors ||
- a.v->stripe, c, err,
- alloc_key_cached_inconsistency,
+ bch2_bucket_sectors_dirty(*a.v) ||
+ a.v->stripe,
+ c, err, alloc_key_cached_inconsistency,
"data type inconsistency");
bkey_fsck_err_on(!a.v->io_time[READ] &&
c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs,
- c, err,
- alloc_key_cached_but_read_time_zero,
+ c, err, alloc_key_cached_but_read_time_zero,
"cached bucket with read_time == 0");
break;
case BCH_DATA_stripe:
@@ -324,16 +321,12 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
{
struct bch_alloc_v4 _a;
const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a);
- unsigned i;
prt_newline(out);
printbuf_indent_add(out, 2);
- prt_printf(out, "gen %u oldest_gen %u data_type %s",
- a->gen, a->oldest_gen,
- a->data_type < BCH_DATA_NR
- ? bch2_data_types[a->data_type]
- : "(invalid data type)");
+ prt_printf(out, "gen %u oldest_gen %u data_type ", a->gen, a->oldest_gen);
+ bch2_prt_data_type(out, a->data_type);
prt_newline(out);
prt_printf(out, "journal_seq %llu", a->journal_seq);
prt_newline(out);
@@ -356,23 +349,6 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c
prt_printf(out, "fragmentation %llu", a->fragmentation_lru);
prt_newline(out);
prt_printf(out, "bp_start %llu", BCH_ALLOC_V4_BACKPOINTERS_START(a));
- prt_newline(out);
-
- if (BCH_ALLOC_V4_NR_BACKPOINTERS(a)) {
- struct bkey_s_c_alloc_v4 a_raw = bkey_s_c_to_alloc_v4(k);
- const struct bch_backpointer *bps = alloc_v4_backpointers_c(a_raw.v);
-
- prt_printf(out, "backpointers: %llu", BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v));
- printbuf_indent_add(out, 2);
-
- for (i = 0; i < BCH_ALLOC_V4_NR_BACKPOINTERS(a_raw.v); i++) {
- prt_newline(out);
- bch2_backpointer_to_text(out, &bps[i]);
- }
-
- printbuf_indent_sub(out, 2);
- }
-
printbuf_indent_sub(out, 2);
}
@@ -537,18 +513,12 @@ void bch2_bucket_gens_to_text(struct printbuf *out, struct bch_fs *c, struct bke
int bch2_bucket_gens_init(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_alloc_v4 a;
struct bkey_i_bucket_gens g;
bool have_bucket_gens_key = false;
- unsigned offset;
- struct bpos pos;
- u8 gen;
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
@@ -556,13 +526,14 @@ int bch2_bucket_gens_init(struct bch_fs *c)
if (!bch2_dev_bucket_exists(c, k.k->p))
continue;
- gen = bch2_alloc_to_v4(k, &a)->gen;
- pos = alloc_gens_pos(iter.pos, &offset);
+ struct bch_alloc_v4 a;
+ u8 gen = bch2_alloc_to_v4(k, &a)->gen;
+ unsigned offset;
+ struct bpos pos = alloc_gens_pos(iter.pos, &offset);
if (have_bucket_gens_key && bkey_cmp(iter.pos, pos)) {
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc,
bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
if (ret)
break;
@@ -576,45 +547,37 @@ int bch2_bucket_gens_init(struct bch_fs *c)
}
g.v.gens[offset] = gen;
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
if (have_bucket_gens_key && !ret)
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ BCH_TRANS_COMMIT_no_enospc,
bch2_btree_insert_trans(trans, BTREE_ID_bucket_gens, &g.k_i, 0));
bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
int bch2_alloc_read(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_dev *ca;
int ret;
down_read(&c->gc_lock);
if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_bucket_gens) {
- const struct bch_bucket_gens *g;
- u64 b;
-
- for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_bucket_gens, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
u64 start = bucket_gens_pos_to_alloc(k.k->p, 0).offset;
u64 end = bucket_gens_pos_to_alloc(bpos_nosnap_successor(k.k->p), 0).offset;
if (k.k->type != KEY_TYPE_bucket_gens)
continue;
- g = bkey_s_c_to_bucket_gens(k).v;
+ const struct bch_bucket_gens *g = bkey_s_c_to_bucket_gens(k).v;
/*
* Not a fsck error because this is checked/repaired by
@@ -623,19 +586,17 @@ int bch2_alloc_read(struct bch_fs *c)
if (!bch2_dev_exists2(c, k.k->p.inode))
continue;
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
- for (b = max_t(u64, ca->mi.first_bucket, start);
+ for (u64 b = max_t(u64, ca->mi.first_bucket, start);
b < min_t(u64, ca->mi.nbuckets, end);
b++)
*bucket_gen(ca, b) = g->gens[b & KEY_TYPE_BUCKET_GENS_MASK];
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
} else {
- struct bch_alloc_v4 a;
-
- for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
/*
* Not a fsck error because this is checked/repaired by
* bch2_check_alloc_key() which runs later:
@@ -643,19 +604,18 @@ int bch2_alloc_read(struct bch_fs *c)
if (!bch2_dev_bucket_exists(c, k.k->p))
continue;
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ struct bch_alloc_v4 a;
*bucket_gen(ca, k.k->p.offset) = bch2_alloc_to_v4(k, &a)->gen;
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
}
bch2_trans_put(trans);
up_read(&c->gc_lock);
- if (ret)
- bch_err_fn(c, ret);
-
+ bch_err_fn(c, ret);
return ret;
}
@@ -768,83 +728,177 @@ static noinline int bch2_bucket_gen_update(struct btree_trans *trans,
return ret;
}
-int bch2_trans_mark_alloc(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+int bch2_trigger_alloc(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
struct bch_fs *c = trans->c;
- struct bch_alloc_v4 old_a_convert, *new_a;
- const struct bch_alloc_v4 *old_a;
- u64 old_lru, new_lru;
int ret = 0;
- /*
- * Deletion only happens in the device removal path, with
- * BTREE_TRIGGER_NORUN:
- */
- BUG_ON(new->k.type != KEY_TYPE_alloc_v4);
+ if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
+ "alloc key for invalid device or bucket"))
+ return -EIO;
- old_a = bch2_alloc_to_v4(old, &old_a_convert);
- new_a = &bkey_i_to_alloc_v4(new)->v;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, new.k->p.inode);
- new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
+ struct bch_alloc_v4 old_a_convert;
+ const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert);
- if (new_a->dirty_sectors > old_a->dirty_sectors ||
- new_a->cached_sectors > old_a->cached_sectors) {
- new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
- SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
- SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
- }
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
- if (data_type_is_empty(new_a->data_type) &&
- BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
- !bch2_bucket_is_open_safe(c, new->k.p.inode, new->k.p.offset)) {
- new_a->gen++;
- SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
- }
+ new_a->data_type = alloc_data_type(*new_a, new_a->data_type);
- if (old_a->data_type != new_a->data_type ||
- (new_a->data_type == BCH_DATA_free &&
- alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
- ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
- bch2_bucket_do_index(trans, bkey_i_to_s_c(new), new_a, true);
- if (ret)
- return ret;
- }
+ if (bch2_bucket_sectors(*new_a) > bch2_bucket_sectors(*old_a)) {
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+ new_a->io_time[WRITE]= max_t(u64, 1, atomic64_read(&c->io_clock[WRITE].now));
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, true);
+ SET_BCH_ALLOC_V4_NEED_DISCARD(new_a, true);
+ }
- if (new_a->data_type == BCH_DATA_cached &&
- !new_a->io_time[READ])
- new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
+ if (data_type_is_empty(new_a->data_type) &&
+ BCH_ALLOC_V4_NEED_INC_GEN(new_a) &&
+ !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) {
+ new_a->gen++;
+ SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false);
+ }
+
+ if (old_a->data_type != new_a->data_type ||
+ (new_a->data_type == BCH_DATA_free &&
+ alloc_freespace_genbits(*old_a) != alloc_freespace_genbits(*new_a))) {
+ ret = bch2_bucket_do_index(trans, old, old_a, false) ?:
+ bch2_bucket_do_index(trans, new.s_c, new_a, true);
+ if (ret)
+ return ret;
+ }
- old_lru = alloc_lru_idx_read(*old_a);
- new_lru = alloc_lru_idx_read(*new_a);
+ if (new_a->data_type == BCH_DATA_cached &&
+ !new_a->io_time[READ])
+ new_a->io_time[READ] = max_t(u64, 1, atomic64_read(&c->io_clock[READ].now));
- if (old_lru != new_lru) {
- ret = bch2_lru_change(trans, new->k.p.inode,
- bucket_to_u64(new->k.p),
- old_lru, new_lru);
- if (ret)
- return ret;
+ u64 old_lru = alloc_lru_idx_read(*old_a);
+ u64 new_lru = alloc_lru_idx_read(*new_a);
+ if (old_lru != new_lru) {
+ ret = bch2_lru_change(trans, new.k->p.inode,
+ bucket_to_u64(new.k->p),
+ old_lru, new_lru);
+ if (ret)
+ return ret;
+ }
+
+ new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
+ bch_dev_bkey_exists(c, new.k->p.inode));
+ if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
+ ret = bch2_lru_change(trans,
+ BCH_LRU_FRAGMENTATION_START,
+ bucket_to_u64(new.k->p),
+ old_a->fragmentation_lru, new_a->fragmentation_lru);
+ if (ret)
+ return ret;
+ }
+
+ if (old_a->gen != new_a->gen) {
+ ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen);
+ if (ret)
+ return ret;
+ }
+
+ /*
+ * need to know if we're getting called from the invalidate path or
+ * not:
+ */
+
+ if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
+ old_a->cached_sectors) {
+ ret = bch2_update_cached_sectors_list(trans, new.k->p.inode,
+ -((s64) old_a->cached_sectors));
+ if (ret)
+ return ret;
+ }
}
- new_a->fragmentation_lru = alloc_lru_idx_fragmentation(*new_a,
- bch_dev_bkey_exists(c, new->k.p.inode));
+ if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
+ struct bch_alloc_v4 *new_a = bkey_s_to_alloc_v4(new).v;
+ u64 journal_seq = trans->journal_res.seq;
+ u64 bucket_journal_seq = new_a->journal_seq;
- if (old_a->fragmentation_lru != new_a->fragmentation_lru) {
- ret = bch2_lru_change(trans,
- BCH_LRU_FRAGMENTATION_START,
- bucket_to_u64(new->k.p),
- old_a->fragmentation_lru, new_a->fragmentation_lru);
- if (ret)
- return ret;
+ if ((flags & BTREE_TRIGGER_INSERT) &&
+ data_type_is_empty(old_a->data_type) !=
+ data_type_is_empty(new_a->data_type) &&
+ new.k->type == KEY_TYPE_alloc_v4) {
+ struct bch_alloc_v4 *v = bkey_s_to_alloc_v4(new).v;
+
+ /*
+ * If the btree updates referring to a bucket weren't flushed
+ * before the bucket became empty again, then the we don't have
+ * to wait on a journal flush before we can reuse the bucket:
+ */
+ v->journal_seq = bucket_journal_seq =
+ data_type_is_empty(new_a->data_type) &&
+ (journal_seq == v->journal_seq ||
+ bch2_journal_noflush_seq(&c->journal, v->journal_seq))
+ ? 0 : journal_seq;
+ }
+
+ if (!data_type_is_empty(old_a->data_type) &&
+ data_type_is_empty(new_a->data_type) &&
+ bucket_journal_seq) {
+ ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
+ c->journal.flushed_seq_ondisk,
+ new.k->p.inode, new.k->p.offset,
+ bucket_journal_seq);
+ if (ret) {
+ bch2_fs_fatal_error(c,
+ "error setting bucket_needs_journal_commit: %i", ret);
+ return ret;
+ }
+ }
+
+ percpu_down_read(&c->mark_lock);
+ if (new_a->gen != old_a->gen)
+ *bucket_gen(ca, new.k->p.offset) = new_a->gen;
+
+ bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, false);
+
+ if (new_a->data_type == BCH_DATA_free &&
+ (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
+ closure_wake_up(&c->freelist_wait);
+
+ if (new_a->data_type == BCH_DATA_need_discard &&
+ (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
+ bch2_do_discards(c);
+
+ if (old_a->data_type != BCH_DATA_cached &&
+ new_a->data_type == BCH_DATA_cached &&
+ should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
+ bch2_do_invalidates(c);
+
+ if (new_a->data_type == BCH_DATA_need_gc_gens)
+ bch2_do_gc_gens(c);
+ percpu_up_read(&c->mark_lock);
}
- if (old_a->gen != new_a->gen) {
- ret = bch2_bucket_gen_update(trans, new->k.p, new_a->gen);
- if (ret)
- return ret;
+ if ((flags & BTREE_TRIGGER_GC) &&
+ (flags & BTREE_TRIGGER_BUCKET_INVALIDATE)) {
+ struct bch_alloc_v4 new_a_convert;
+ const struct bch_alloc_v4 *new_a = bch2_alloc_to_v4(new.s_c, &new_a_convert);
+
+ percpu_down_read(&c->mark_lock);
+ struct bucket *g = gc_bucket(ca, new.k->p.offset);
+
+ bucket_lock(g);
+
+ g->gen_valid = 1;
+ g->gen = new_a->gen;
+ g->data_type = new_a->data_type;
+ g->stripe = new_a->stripe;
+ g->stripe_redundancy = new_a->stripe_redundancy;
+ g->dirty_sectors = new_a->dirty_sectors;
+ g->cached_sectors = new_a->cached_sectors;
+
+ bucket_unlock(g);
+ percpu_up_read(&c->mark_lock);
}
return 0;
@@ -869,8 +923,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
bch2_trans_copy_iter(&iter2, iter);
- if (!bpos_eq(iter->path->l[0].b->key.k.p, SPOS_MAX))
- end = bkey_min(end, bpos_nosnap_successor(iter->path->l[0].b->key.k.p));
+ struct btree_path *path = btree_iter_path(iter->trans, iter);
+ if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX))
+ end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p));
end = bkey_min(end, POS(iter->pos.inode, iter->pos.offset + U32_MAX - 1));
@@ -898,7 +953,6 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos
static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
{
struct bch_dev *ca;
- unsigned iter;
if (bch2_dev_bucket_exists(c, *bucket))
return true;
@@ -916,8 +970,7 @@ static bool next_bucket(struct bch_fs *c, struct bpos *bucket)
}
rcu_read_lock();
- iter = bucket->inode;
- ca = __bch2_next_dev(c, &iter, NULL);
+ ca = __bch2_next_dev_idx(c, bucket->inode, NULL);
if (ca)
*bucket = POS(ca->dev_idx, ca->mi.first_bucket);
rcu_read_unlock();
@@ -1158,9 +1211,6 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans,
unsigned i, gens_offset, gens_end_offset;
int ret;
- if (c->sb.version < bcachefs_metadata_version_bucket_gens)
- return 0;
-
bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset));
k = bch2_btree_iter_peek_slot(bucket_gens_iter);
@@ -1212,7 +1262,7 @@ fsck_err:
return ret;
}
-static noinline_for_stack int __bch2_check_discard_freespace_key(struct btree_trans *trans,
+static noinline_for_stack int bch2_check_discard_freespace_key(struct btree_trans *trans,
struct btree_iter *iter)
{
struct bch_fs *c = trans->c;
@@ -1267,28 +1317,10 @@ delete:
ret = bch2_btree_delete_extent_at(trans, iter,
iter->btree_id == BTREE_ID_freespace ? 1 : 0, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW);
+ BCH_TRANS_COMMIT_no_enospc);
goto out;
}
-static int bch2_check_discard_freespace_key(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end)
-{
- if (!btree_id_is_extents(iter->btree_id)) {
- return __bch2_check_discard_freespace_key(trans, iter);
- } else {
- int ret = 0;
-
- while (!bkey_eq(iter->pos, end) &&
- !(ret = btree_trans_too_many_iters(trans) ?:
- __bch2_check_discard_freespace_key(trans, iter)))
- bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos));
-
- return ret;
- }
-}
-
/*
* We've already checked that generation numbers in the bucket_gens btree are
* valid for buckets that exist; this just checks for keys for nonexistent
@@ -1422,8 +1454,7 @@ int bch2_check_alloc_info(struct bch_fs *c)
}
ret = bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
@@ -1442,23 +1473,50 @@ bkey_err:
if (ret < 0)
goto err;
- ret = for_each_btree_key2(trans, iter,
+ ret = for_each_btree_key(trans, iter,
BTREE_ID_need_discard, POS_MIN,
BTREE_ITER_PREFETCH, k,
- bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
- for_each_btree_key2(trans, iter,
- BTREE_ID_freespace, POS_MIN,
- BTREE_ITER_PREFETCH, k,
- bch2_check_discard_freespace_key(trans, &iter, k.k->p)) ?:
- for_each_btree_key_commit(trans, iter,
+ bch2_check_discard_freespace_key(trans, &iter));
+ if (ret)
+ goto err;
+
+ bch2_trans_iter_init(trans, &iter, BTREE_ID_freespace, POS_MIN,
+ BTREE_ITER_PREFETCH);
+ while (1) {
+ bch2_trans_begin(trans);
+ k = bch2_btree_iter_peek(&iter);
+ if (!k.k)
+ break;
+
+ ret = bkey_err(k) ?:
+ bch2_check_discard_freespace_key(trans, &iter);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ bch_err(c, "while checking %s", buf.buf);
+ printbuf_exit(&buf);
+ break;
+ }
+
+ bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos));
+ }
+ bch2_trans_iter_exit(trans, &iter);
+ if (ret)
+ goto err;
+
+ ret = for_each_btree_key_commit(trans, iter,
BTREE_ID_bucket_gens, POS_MIN,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_bucket_gens_key(trans, &iter, k));
err:
bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1486,6 +1544,27 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (a->data_type != BCH_DATA_cached)
return 0;
+ if (fsck_err_on(!a->io_time[READ], c,
+ alloc_key_cached_but_read_time_zero,
+ "cached bucket with read_time 0\n"
+ " %s",
+ (printbuf_reset(&buf),
+ bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
+ struct bkey_i_alloc_v4 *a_mut =
+ bch2_alloc_to_v4_mut(trans, alloc_k);
+ ret = PTR_ERR_OR_ZERO(a_mut);
+ if (ret)
+ goto err;
+
+ a_mut->v.io_time[READ] = atomic64_read(&c->io_clock[READ].now);
+ ret = bch2_trans_update(trans, alloc_iter,
+ &a_mut->k_i, BTREE_TRIGGER_NORUN);
+ if (ret)
+ goto err;
+
+ a = &a_mut->v;
+ }
+
lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru,
lru_pos(alloc_k.k->p.inode,
bucket_to_u64(alloc_k.k->p),
@@ -1494,41 +1573,18 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans,
if (ret)
return ret;
- if (fsck_err_on(!a->io_time[READ], c,
- alloc_key_cached_but_read_time_zero,
- "cached bucket with read_time 0\n"
- " %s",
- (printbuf_reset(&buf),
- bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf)) ||
- fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
+ if (fsck_err_on(lru_k.k->type != KEY_TYPE_set, c,
alloc_key_to_missing_lru_entry,
"missing lru entry\n"
" %s",
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) {
- u64 read_time = a->io_time[READ] ?:
- atomic64_read(&c->io_clock[READ].now);
-
ret = bch2_lru_set(trans,
alloc_k.k->p.inode,
bucket_to_u64(alloc_k.k->p),
- read_time);
+ a->io_time[READ]);
if (ret)
goto err;
-
- if (a->io_time[READ] != read_time) {
- struct bkey_i_alloc_v4 *a_mut =
- bch2_alloc_to_v4_mut(trans, alloc_k);
- ret = PTR_ERR_OR_ZERO(a_mut);
- if (ret)
- goto err;
-
- a_mut->v.io_time[READ] = read_time;
- ret = bch2_trans_update(trans, alloc_iter,
- &a_mut->k_i, BTREE_TRIGGER_NORUN);
- if (ret)
- goto err;
- }
}
err:
fsck_err:
@@ -1539,27 +1595,45 @@ fsck_err:
int bch2_check_alloc_to_lru_refs(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_alloc_to_lru_ref(trans, &iter)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
+struct discard_buckets_state {
+ u64 seen;
+ u64 open;
+ u64 need_journal_commit;
+ u64 discarded;
+ struct bch_dev *ca;
+ u64 need_journal_commit_this_dev;
+};
+
+static void discard_buckets_next_dev(struct bch_fs *c, struct discard_buckets_state *s, struct bch_dev *ca)
+{
+ if (s->ca == ca)
+ return;
+
+ if (s->ca && s->need_journal_commit_this_dev >
+ bch2_dev_usage_read(s->ca).d[BCH_DATA_free].buckets)
+ bch2_journal_flush_async(&c->journal, NULL);
+
+ if (s->ca)
+ percpu_ref_put(&s->ca->ref);
+ if (ca)
+ percpu_ref_get(&ca->ref);
+ s->ca = ca;
+ s->need_journal_commit_this_dev = 0;
+}
+
static int bch2_discard_one_bucket(struct btree_trans *trans,
struct btree_iter *need_discard_iter,
struct bpos *discard_pos_done,
- u64 *seen,
- u64 *open,
- u64 *need_journal_commit,
- u64 *discarded)
+ struct discard_buckets_state *s)
{
struct bch_fs *c = trans->c;
struct bpos pos = need_discard_iter->pos;
@@ -1571,20 +1645,24 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
int ret = 0;
ca = bch_dev_bkey_exists(c, pos.inode);
+
if (!percpu_ref_tryget(&ca->io_ref)) {
bch2_btree_iter_set_pos(need_discard_iter, POS(pos.inode + 1, 0));
return 0;
}
+ discard_buckets_next_dev(c, s, ca);
+
if (bch2_bucket_is_open_safe(c, pos.inode, pos.offset)) {
- (*open)++;
+ s->open++;
goto out;
}
if (bch2_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
c->journal.flushed_seq_ondisk,
pos.inode, pos.offset)) {
- (*need_journal_commit)++;
+ s->need_journal_commit++;
+ s->need_journal_commit_this_dev++;
goto out;
}
@@ -1637,7 +1715,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans,
* This works without any other locks because this is the only
* thread that removes items from the need_discard tree
*/
- bch2_trans_unlock(trans);
+ bch2_trans_unlock_long(trans);
blkdev_issue_discard(ca->disk_sb.bdev,
k.k->p.offset * ca->mi.bucket_size,
ca->mi.bucket_size,
@@ -1655,14 +1733,14 @@ write:
ret = bch2_trans_update(trans, &iter, &a->k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
- this_cpu_inc(c->counters[BCH_COUNTER_bucket_discard]);
- (*discarded)++;
+ count_event(c, bucket_discard);
+ s->discarded++;
out:
- (*seen)++;
+ s->seen++;
bch2_trans_iter_exit(trans, &iter);
percpu_ref_put(&ca->io_ref);
printbuf_exit(&buf);
@@ -1672,9 +1750,7 @@ out:
static void bch2_do_discards_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, discard_work);
- struct btree_iter iter;
- struct bkey_s_c k;
- u64 seen = 0, open = 0, need_journal_commit = 0, discarded = 0;
+ struct discard_buckets_state s = {};
struct bpos discard_pos_done = POS_MAX;
int ret;
@@ -1684,21 +1760,16 @@ static void bch2_do_discards_work(struct work_struct *work)
* successful commit:
*/
ret = bch2_trans_run(c,
- for_each_btree_key2(trans, iter,
- BTREE_ID_need_discard, POS_MIN, 0, k,
- bch2_discard_one_bucket(trans, &iter, &discard_pos_done,
- &seen,
- &open,
- &need_journal_commit,
- &discarded)));
-
- if (need_journal_commit * 2 > seen)
- bch2_journal_flush_async(&c->journal, NULL);
+ for_each_btree_key(trans, iter,
+ BTREE_ID_need_discard, POS_MIN, 0, k,
+ bch2_discard_one_bucket(trans, &iter, &discard_pos_done, &s)));
- bch2_write_ref_put(c, BCH_WRITE_REF_discard);
+ discard_buckets_next_dev(c, &s, NULL);
- trace_discard_buckets(c, seen, open, need_journal_commit, discarded,
+ trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded,
bch2_err_str(ret));
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_discard);
}
void bch2_do_discards(struct bch_fs *c)
@@ -1760,7 +1831,7 @@ static int invalidate_one_bucket(struct btree_trans *trans,
BTREE_TRIGGER_BUCKET_INVALIDATE) ?:
bch2_trans_commit(trans, NULL, NULL,
BCH_WATERMARK_btree|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto out;
@@ -1795,22 +1866,18 @@ err:
static void bch2_do_invalidates_work(struct work_struct *work)
{
struct bch_fs *c = container_of(work, struct bch_fs, invalidate_work);
- struct bch_dev *ca;
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- unsigned i;
int ret = 0;
- ret = bch2_btree_write_buffer_flush(trans);
+ ret = bch2_btree_write_buffer_tryflush(trans);
if (ret)
goto err;
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
s64 nr_to_invalidate =
should_invalidate_buckets(ca, bch2_dev_usage_read(ca));
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+ ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
lru_pos(ca->dev_idx, 0, 0),
lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX),
BTREE_ITER_INTENT, k,
@@ -1884,8 +1951,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_bucket_do_index(trans, k, a, true) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
@@ -1905,8 +1971,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca,
ret = bch2_btree_insert_trans(trans, BTREE_ID_freespace, freespace, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
if (ret)
goto bkey_err;
@@ -1937,8 +2002,6 @@ bkey_err:
int bch2_fs_freespace_init(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
int ret = 0;
bool doing_init = false;
@@ -1947,7 +2010,7 @@ int bch2_fs_freespace_init(struct bch_fs *c)
* every mount:
*/
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
if (ca->mi.freespace_initialized)
continue;
@@ -2007,15 +2070,13 @@ out:
void bch2_recalc_capacity(struct bch_fs *c)
{
- struct bch_dev *ca;
u64 capacity = 0, reserved_sectors = 0, gc_reserve;
unsigned bucket_size_max = 0;
unsigned long ra_pages = 0;
- unsigned i;
lockdep_assert_held(&c->state_lock);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
struct backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi;
ra_pages += bdi->ra_pages;
@@ -2023,7 +2084,7 @@ void bch2_recalc_capacity(struct bch_fs *c)
bch2_set_ra_pages(c, ra_pages);
- for_each_rw_member(ca, c, i) {
+ for_each_rw_member(c, ca) {
u64 dev_reserve = 0;
/*
@@ -2079,11 +2140,9 @@ void bch2_recalc_capacity(struct bch_fs *c)
u64 bch2_min_rw_member_capacity(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
u64 ret = U64_MAX;
- for_each_rw_member(ca, c, i)
+ for_each_rw_member(c, ca)
ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size);
return ret;
}
diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h
index 73faf99a222a..e7f7e842ee1b 100644
--- a/fs/bcachefs/alloc_background.h
+++ b/fs/bcachefs/alloc_background.h
@@ -71,6 +71,24 @@ static inline enum bch_data_type bucket_data_type(enum bch_data_type data_type)
return data_type == BCH_DATA_stripe ? BCH_DATA_user : data_type;
}
+static inline unsigned bch2_bucket_sectors(struct bch_alloc_v4 a)
+{
+ return a.dirty_sectors + a.cached_sectors;
+}
+
+static inline unsigned bch2_bucket_sectors_dirty(struct bch_alloc_v4 a)
+{
+ return a.dirty_sectors;
+}
+
+static inline unsigned bch2_bucket_sectors_fragmented(struct bch_dev *ca,
+ struct bch_alloc_v4 a)
+{
+ int d = bch2_bucket_sectors_dirty(a);
+
+ return d ? max(0, ca->mi.bucket_size - d) : 0;
+}
+
static inline u64 alloc_lru_idx_read(struct bch_alloc_v4 a)
{
return a.data_type == BCH_DATA_cached ? a.io_time[READ] : 0;
@@ -90,10 +108,11 @@ static inline u64 alloc_lru_idx_fragmentation(struct bch_alloc_v4 a,
struct bch_dev *ca)
{
if (!data_type_movable(a.data_type) ||
- a.dirty_sectors >= ca->mi.bucket_size)
+ !bch2_bucket_sectors_fragmented(ca, a))
return 0;
- return div_u64((u64) a.dirty_sectors * (1ULL << 31), ca->mi.bucket_size);
+ u64 d = bch2_bucket_sectors_dirty(a);
+ return div_u64(d * (1ULL << 31), ca->mi.bucket_size);
}
static inline u64 alloc_freespace_genbits(struct bch_alloc_v4 a)
@@ -163,24 +182,21 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
#define bch2_bkey_ops_alloc ((struct bkey_ops) { \
.key_invalid = bch2_alloc_v1_invalid, \
.val_to_text = bch2_alloc_to_text, \
- .trans_trigger = bch2_trans_mark_alloc, \
- .atomic_trigger = bch2_mark_alloc, \
+ .trigger = bch2_trigger_alloc, \
.min_val_size = 8, \
})
#define bch2_bkey_ops_alloc_v2 ((struct bkey_ops) { \
.key_invalid = bch2_alloc_v2_invalid, \
.val_to_text = bch2_alloc_to_text, \
- .trans_trigger = bch2_trans_mark_alloc, \
- .atomic_trigger = bch2_mark_alloc, \
+ .trigger = bch2_trigger_alloc, \
.min_val_size = 8, \
})
#define bch2_bkey_ops_alloc_v3 ((struct bkey_ops) { \
.key_invalid = bch2_alloc_v3_invalid, \
.val_to_text = bch2_alloc_to_text, \
- .trans_trigger = bch2_trans_mark_alloc, \
- .atomic_trigger = bch2_mark_alloc, \
+ .trigger = bch2_trigger_alloc, \
.min_val_size = 16, \
})
@@ -188,8 +204,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
.key_invalid = bch2_alloc_v4_invalid, \
.val_to_text = bch2_alloc_to_text, \
.swab = bch2_alloc_v4_swab, \
- .trans_trigger = bch2_trans_mark_alloc, \
- .atomic_trigger = bch2_mark_alloc, \
+ .trigger = bch2_trigger_alloc, \
.min_val_size = 48, \
})
@@ -213,8 +228,8 @@ static inline bool bkey_is_alloc(const struct bkey *k)
int bch2_alloc_read(struct bch_fs *);
-int bch2_trans_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trigger_alloc(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
int bch2_check_alloc_info(struct bch_fs *);
int bch2_check_alloc_to_lru_refs(struct bch_fs *);
void bch2_do_discards(struct bch_fs *);
diff --git a/fs/bcachefs/alloc_background_format.h b/fs/bcachefs/alloc_background_format.h
new file mode 100644
index 000000000000..b4ec20be93b8
--- /dev/null
+++ b/fs/bcachefs/alloc_background_format.h
@@ -0,0 +1,92 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+#define _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H
+
+struct bch_alloc {
+ struct bch_val v;
+ __u8 fields;
+ __u8 gen;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V1() \
+ x(read_time, 16) \
+ x(write_time, 16) \
+ x(data_type, 8) \
+ x(dirty_sectors, 16) \
+ x(cached_sectors, 16) \
+ x(oldest_gen, 8) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+enum {
+#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
+ BCH_ALLOC_FIELDS_V1()
+#undef x
+};
+
+struct bch_alloc_v2 {
+ struct bch_val v;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+#define BCH_ALLOC_FIELDS_V2() \
+ x(read_time, 64) \
+ x(write_time, 64) \
+ x(dirty_sectors, 32) \
+ x(cached_sectors, 32) \
+ x(stripe, 32) \
+ x(stripe_redundancy, 8)
+
+struct bch_alloc_v3 {
+ struct bch_val v;
+ __le64 journal_seq;
+ __le32 flags;
+ __u8 nr_fields;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 data[];
+} __packed __aligned(8);
+
+LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
+LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
+
+struct bch_alloc_v4 {
+ struct bch_val v;
+ __u64 journal_seq;
+ __u32 flags;
+ __u8 gen;
+ __u8 oldest_gen;
+ __u8 data_type;
+ __u8 stripe_redundancy;
+ __u32 dirty_sectors;
+ __u32 cached_sectors;
+ __u64 io_time[2];
+ __u32 stripe;
+ __u32 nr_external_backpointers;
+ __u64 fragmentation_lru;
+} __packed __aligned(8);
+
+#define BCH_ALLOC_V4_U64s_V0 6
+#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
+
+BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
+BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
+BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
+BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
+
+#define KEY_TYPE_BUCKET_GENS_BITS 8
+#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
+#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
+
+struct bch_bucket_gens {
+ struct bch_val v;
+ u8 gens[KEY_TYPE_BUCKET_GENS_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_ALLOC_BACKGROUND_FORMAT_H */
diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c
index 0e6157982607..633d3223b353 100644
--- a/fs/bcachefs/alloc_foreground.c
+++ b/fs/bcachefs/alloc_foreground.c
@@ -69,11 +69,8 @@ const char * const bch2_watermarks[] = {
void bch2_reset_alloc_cursors(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL)
+ for_each_member_device_rcu(c, ca, NULL)
ca->alloc_cursor = 0;
rcu_read_unlock();
}
@@ -239,9 +236,8 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
if (cl)
closure_wait(&c->open_buckets_wait, cl);
- if (!c->blocked_allocate_open_bucket)
- c->blocked_allocate_open_bucket = local_clock();
-
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
+ &c->blocked_allocate_open_bucket, true);
spin_unlock(&c->freelist_lock);
return ERR_PTR(-BCH_ERR_open_buckets_empty);
}
@@ -267,19 +263,11 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev *
ca->nr_open_buckets++;
bch2_open_bucket_hash_add(c, ob);
- if (c->blocked_allocate_open_bucket) {
- bch2_time_stats_update(
- &c->times[BCH_TIME_blocked_allocate_open_bucket],
- c->blocked_allocate_open_bucket);
- c->blocked_allocate_open_bucket = 0;
- }
+ track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket],
+ &c->blocked_allocate_open_bucket, false);
- if (c->blocked_allocate) {
- bch2_time_stats_update(
- &c->times[BCH_TIME_blocked_allocate],
- c->blocked_allocate);
- c->blocked_allocate = 0;
- }
+ track_event_change(&c->times[BCH_TIME_blocked_allocate],
+ &c->blocked_allocate, false);
spin_unlock(&c->freelist_lock);
return ob;
@@ -377,9 +365,9 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc
ob = __try_alloc_bucket(c, ca, b, watermark, a, s, cl);
if (!ob)
- iter.path->preserve = false;
+ set_btree_iter_dontneed(&iter);
err:
- if (iter.trans && iter.path)
+ if (iter.path)
set_btree_iter_dontneed(&iter);
bch2_trans_iter_exit(trans, &iter);
printbuf_exit(&buf);
@@ -447,7 +435,7 @@ again:
ob = __try_alloc_bucket(trans->c, ca, k.k->p.offset, watermark, a, s, cl);
next:
- citer.path->preserve = false;
+ set_btree_iter_dontneed(&citer);
bch2_trans_iter_exit(trans, &citer);
if (ob)
break;
@@ -502,7 +490,7 @@ again:
ob = try_alloc_bucket(trans, ca, watermark,
alloc_cursor, s, k, cl);
if (ob) {
- iter.path->preserve = false;
+ set_btree_iter_dontneed(&iter);
break;
}
}
@@ -567,8 +555,8 @@ again:
goto again;
}
- if (!c->blocked_allocate)
- c->blocked_allocate = local_clock();
+ track_event_change(&c->times[BCH_TIME_blocked_allocate],
+ &c->blocked_allocate, true);
ob = ERR_PTR(-BCH_ERR_freelist_empty);
goto err;
@@ -697,11 +685,9 @@ static int add_new_bucket(struct bch_fs *c,
bch_dev_bkey_exists(c, ob->dev)->mi.durability;
BUG_ON(*nr_effective >= nr_replicas);
- BUG_ON(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS);
__clear_bit(ob->dev, devs_may_alloc->d);
- *nr_effective += (flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)
- ? durability : 1;
+ *nr_effective += durability;
*have_cache |= !durability;
ob_push(c, ptrs, ob);
@@ -972,8 +958,8 @@ static int __open_bucket_add_buckets(struct btree_trans *trans,
devs = target_rw_devs(c, wp->data_type, target);
/* Don't allocate from devices we already have pointers to: */
- for (i = 0; i < devs_have->nr; i++)
- __clear_bit(devs_have->devs[i], devs.d);
+ darray_for_each(*devs_have, i)
+ __clear_bit(*i, devs.d);
open_bucket_for_each(c, ptrs, ob, i)
__clear_bit(ob->dev, devs.d);
@@ -1539,10 +1525,11 @@ static void bch2_open_bucket_to_text(struct printbuf *out, struct bch_fs *c, str
unsigned data_type = ob->data_type;
barrier(); /* READ_ONCE() doesn't work on bitfields */
- prt_printf(out, "%zu ref %u %s %u:%llu gen %u allocated %u/%u",
+ prt_printf(out, "%zu ref %u ",
ob - c->open_buckets,
- atomic_read(&ob->pin),
- data_type < BCH_DATA_NR ? bch2_data_types[data_type] : "invalid data type",
+ atomic_read(&ob->pin));
+ bch2_prt_data_type(out, data_type);
+ prt_printf(out, " %u:%llu gen %u allocated %u/%u",
ob->dev, ob->bucket, ob->gen,
ca->mi.bucket_size - ob->sectors_free, ca->mi.bucket_size);
if (ob->ec)
diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c
index 23c0834a97a4..569b97904da4 100644
--- a/fs/bcachefs/backpointers.c
+++ b/fs/bcachefs/backpointers.c
@@ -3,6 +3,7 @@
#include "bbpos.h"
#include "alloc_background.h"
#include "backpointers.h"
+#include "bkey_buf.h"
#include "btree_cache.h"
#include "btree_update.h"
#include "btree_update_interior.h"
@@ -67,9 +68,11 @@ void bch2_backpointer_to_text(struct printbuf *out, const struct bch_backpointer
void bch2_backpointer_k_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k)
{
- prt_str(out, "bucket=");
- bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
- prt_str(out, " ");
+ if (bch2_dev_exists2(c, k.k->p.inode)) {
+ prt_str(out, "bucket=");
+ bch2_bpos_to_text(out, bp_pos_to_bucket(c, k.k->p));
+ prt_str(out, " ");
+ }
bch2_backpointer_to_text(out, bkey_s_c_to_backpointer(k).v);
}
@@ -136,15 +139,30 @@ static noinline int backpointer_mod_err(struct btree_trans *trans,
}
int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans,
- struct bkey_i_backpointer *bp_k,
+ struct bpos bucket,
struct bch_backpointer bp,
struct bkey_s_c orig_k,
bool insert)
{
struct btree_iter bp_iter;
struct bkey_s_c k;
+ struct bkey_i_backpointer *bp_k;
int ret;
+ bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
+ ret = PTR_ERR_OR_ZERO(bp_k);
+ if (ret)
+ return ret;
+
+ bkey_backpointer_init(&bp_k->k_i);
+ bp_k->k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ bp_k->v = bp;
+
+ if (!insert) {
+ bp_k->k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp_k->k, 0);
+ }
+
k = bch2_bkey_get_iter(trans, &bp_iter, BTREE_ID_backpointers,
bp_k->k.p,
BTREE_ITER_INTENT|
@@ -375,41 +393,45 @@ fsck_err:
/* verify that every backpointer has a corresponding alloc key */
int bch2_check_btree_backpointers(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_backpointers, POS_MIN, 0, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_check_btree_backpointer(trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
-struct bpos_level {
- unsigned level;
- struct bpos pos;
+static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r)
+{
+ return bpos_eq(l.k->p, r.k->p) &&
+ bkey_bytes(l.k) == bkey_bytes(r.k) &&
+ !memcmp(l.v, r.v, bkey_val_bytes(l.k));
+}
+
+struct extents_to_bp_state {
+ struct bpos bucket_start;
+ struct bpos bucket_end;
+ struct bkey_buf last_flushed;
};
static int check_bp_exists(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
struct bpos bucket,
struct bch_backpointer bp,
- struct bkey_s_c orig_k,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bpos_level *last_flushed)
+ struct bkey_s_c orig_k)
{
struct bch_fs *c = trans->c;
struct btree_iter bp_iter = { NULL };
struct printbuf buf = PRINTBUF;
struct bkey_s_c bp_k;
+ struct bkey_buf tmp;
int ret;
- if (bpos_lt(bucket, bucket_start) ||
- bpos_gt(bucket, bucket_end))
+ bch2_bkey_buf_init(&tmp);
+
+ if (bpos_lt(bucket, s->bucket_start) ||
+ bpos_gt(bucket, s->bucket_end))
return 0;
if (!bch2_dev_bucket_exists(c, bucket))
@@ -424,13 +446,20 @@ static int check_bp_exists(struct btree_trans *trans,
if (bp_k.k->type != KEY_TYPE_backpointer ||
memcmp(bkey_s_c_to_backpointer(bp_k).v, &bp, sizeof(bp))) {
- if (last_flushed->level != bp.level ||
- !bpos_eq(last_flushed->pos, orig_k.k->p)) {
- last_flushed->level = bp.level;
- last_flushed->pos = orig_k.k->p;
+ bch2_bkey_buf_reassemble(&tmp, c, orig_k);
+
+ if (!bkey_and_val_eq(orig_k, bkey_i_to_s_c(s->last_flushed.k))) {
+ if (bp.level) {
+ bch2_trans_unlock(trans);
+ bch2_btree_interior_updates_flush(c);
+ }
- ret = bch2_btree_write_buffer_flush_sync(trans) ?:
- -BCH_ERR_transaction_restart_write_buffer_flush;
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
+
+ bch2_bkey_buf_copy(&s->last_flushed, c, tmp.k);
+ ret = -BCH_ERR_transaction_restart_write_buffer_flush;
goto out;
}
goto missing;
@@ -439,6 +468,7 @@ out:
err:
fsck_err:
bch2_trans_iter_exit(trans, &bp_iter);
+ bch2_bkey_buf_exit(&tmp, c);
printbuf_exit(&buf);
return ret;
missing:
@@ -448,8 +478,7 @@ missing:
prt_printf(&buf, "\nbp pos ");
bch2_bpos_to_text(&buf, bp_iter.pos);
- if (c->sb.version_upgrade_complete < bcachefs_metadata_version_backpointers ||
- c->opts.reconstruct_alloc ||
+ if (c->opts.reconstruct_alloc ||
fsck_err(c, ptr_to_missing_backpointer, "%s", buf.buf))
ret = bch2_bucket_backpointer_mod(trans, bucket, bp, orig_k, true);
@@ -457,25 +486,16 @@ missing:
}
static int check_extent_to_backpointers(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bpos_level *last_flushed)
+ struct extents_to_bp_state *s,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c k)
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs;
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- struct bkey_s_c k;
int ret;
- k = bch2_btree_iter_peek_all_levels(iter);
- ret = bkey_err(k);
- if (ret)
- return ret;
- if (!k.k)
- return 0;
-
ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
struct bpos bucket_pos;
@@ -484,12 +504,10 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
if (p.ptr.cached)
continue;
- bch2_extent_ptr_to_bp(c, iter->btree_id, iter->path->level,
+ bch2_extent_ptr_to_bp(c, btree, level,
k, p, &bucket_pos, &bp);
- ret = check_bp_exists(trans, bucket_pos, bp, k,
- bucket_start, bucket_end,
- last_flushed);
+ ret = check_bp_exists(trans, s, bucket_pos, bp, k);
if (ret)
return ret;
}
@@ -498,47 +516,32 @@ static int check_extent_to_backpointers(struct btree_trans *trans,
}
static int check_btree_root_to_backpointers(struct btree_trans *trans,
+ struct extents_to_bp_state *s,
enum btree_id btree_id,
- struct bpos bucket_start,
- struct bpos bucket_end,
- struct bpos_level *last_flushed)
+ int *level)
{
struct bch_fs *c = trans->c;
- struct btree_root *r = bch2_btree_id_root(c, btree_id);
struct btree_iter iter;
struct btree *b;
struct bkey_s_c k;
- struct bkey_ptrs_c ptrs;
- struct extent_ptr_decoded p;
- const union bch_extent_entry *entry;
int ret;
-
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, r->level, 0);
+retry:
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN,
+ 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0);
b = bch2_btree_iter_peek_node(&iter);
ret = PTR_ERR_OR_ZERO(b);
if (ret)
goto err;
- BUG_ON(b != btree_node_root(c, b));
-
- k = bkey_i_to_s_c(&b->key);
- ptrs = bch2_bkey_ptrs_c(k);
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- struct bpos bucket_pos;
- struct bch_backpointer bp;
-
- if (p.ptr.cached)
- continue;
+ if (b != btree_node_root(c, b)) {
+ bch2_trans_iter_exit(trans, &iter);
+ goto retry;
+ }
- bch2_extent_ptr_to_bp(c, iter.btree_id, b->c.level + 1,
- k, p, &bucket_pos, &bp);
+ *level = b->c.level;
- ret = check_bp_exists(trans, bucket_pos, bp, k,
- bucket_start, bucket_end,
- last_flushed);
- if (ret)
- goto err;
- }
+ k = bkey_i_to_s_c(&b->key);
+ ret = check_extent_to_backpointers(trans, s, btree_id, b->c.level + 1, k);
err:
bch2_trans_iter_exit(trans, &iter);
return ret;
@@ -559,7 +562,7 @@ static size_t btree_nodes_fit_in_ram(struct bch_fs *c)
si_meminfo(&i);
mem_bytes = i.totalram * i.mem_unit;
- return div_u64(mem_bytes >> 1, btree_bytes(c));
+ return div_u64(mem_bytes >> 1, c->opts.btree_node_size);
}
static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
@@ -610,49 +613,57 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans,
}
static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans,
- struct bpos bucket_start,
- struct bpos bucket_end)
+ struct extents_to_bp_state *s)
{
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- enum btree_id btree_id;
- struct bpos_level last_flushed = { UINT_MAX, POS_MIN };
int ret = 0;
- for (btree_id = 0; btree_id < btree_id_nr_alive(c); btree_id++) {
- unsigned depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
+ for (enum btree_id btree_id = 0;
+ btree_id < btree_id_nr_alive(c);
+ btree_id++) {
+ int level, depth = btree_type_has_ptrs(btree_id) ? 0 : 1;
- bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
- depth,
- BTREE_ITER_ALL_LEVELS|
- BTREE_ITER_PREFETCH);
-
- do {
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_extent_to_backpointers(trans, &iter,
- bucket_start, bucket_end,
- &last_flushed));
- if (ret)
- break;
- } while (!bch2_btree_iter_advance(&iter));
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ check_btree_root_to_backpointers(trans, s, btree_id, &level));
+ if (ret)
+ return ret;
- bch2_trans_iter_exit(trans, &iter);
+ while (level >= depth) {
+ struct btree_iter iter;
+ bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0,
+ level,
+ BTREE_ITER_PREFETCH);
+ while (1) {
+ bch2_trans_begin(trans);
+
+ struct bkey_s_c k = bch2_btree_iter_peek(&iter);
+ if (!k.k)
+ break;
+ ret = bkey_err(k) ?:
+ check_extent_to_backpointers(trans, s, btree_id, level, k) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc);
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
+ if (ret)
+ break;
+ if (bpos_eq(iter.pos, SPOS_MAX))
+ break;
+ bch2_btree_iter_advance(&iter);
+ }
+ bch2_trans_iter_exit(trans, &iter);
- if (ret)
- break;
+ if (ret)
+ return ret;
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- check_btree_root_to_backpointers(trans, btree_id,
- bucket_start, bucket_end,
- &last_flushed));
- if (ret)
- break;
+ --level;
+ }
}
- return ret;
+
+ return 0;
}
static struct bpos bucket_pos_to_bp_safe(const struct bch_fs *c,
@@ -714,40 +725,45 @@ static int bch2_get_alloc_in_memory_pos(struct btree_trans *trans,
int bch2_check_extents_to_backpointers(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct bpos start = POS_MIN, end;
+ struct extents_to_bp_state s = { .bucket_start = POS_MIN };
int ret;
+ bch2_bkey_buf_init(&s.last_flushed);
+ bkey_init(&s.last_flushed.k->k);
+
while (1) {
- ret = bch2_get_alloc_in_memory_pos(trans, start, &end);
+ ret = bch2_get_alloc_in_memory_pos(trans, s.bucket_start, &s.bucket_end);
if (ret)
break;
- if (bpos_eq(start, POS_MIN) && !bpos_eq(end, SPOS_MAX))
+ if ( bpos_eq(s.bucket_start, POS_MIN) &&
+ !bpos_eq(s.bucket_end, SPOS_MAX))
bch_verbose(c, "%s(): alloc info does not fit in ram, running in multiple passes with %zu nodes per pass",
__func__, btree_nodes_fit_in_ram(c));
- if (!bpos_eq(start, POS_MIN) || !bpos_eq(end, SPOS_MAX)) {
+ if (!bpos_eq(s.bucket_start, POS_MIN) ||
+ !bpos_eq(s.bucket_end, SPOS_MAX)) {
struct printbuf buf = PRINTBUF;
prt_str(&buf, "check_extents_to_backpointers(): ");
- bch2_bpos_to_text(&buf, start);
+ bch2_bpos_to_text(&buf, s.bucket_start);
prt_str(&buf, "-");
- bch2_bpos_to_text(&buf, end);
+ bch2_bpos_to_text(&buf, s.bucket_end);
bch_verbose(c, "%s", buf.buf);
printbuf_exit(&buf);
}
- ret = bch2_check_extents_to_backpointers_pass(trans, start, end);
- if (ret || bpos_eq(end, SPOS_MAX))
+ ret = bch2_check_extents_to_backpointers_pass(trans, &s);
+ if (ret || bpos_eq(s.bucket_end, SPOS_MAX))
break;
- start = bpos_successor(end);
+ s.bucket_start = bpos_successor(s.bucket_end);
}
bch2_trans_put(trans);
+ bch2_bkey_buf_exit(&s.last_flushed, c);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -801,13 +817,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans,
struct bbpos start,
struct bbpos end)
{
- struct btree_iter iter;
- struct bkey_s_c k;
struct bpos last_flushed_pos = SPOS_MAX;
return for_each_btree_key_commit(trans, iter, BTREE_ID_backpointers,
POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_one_backpointer(trans, start, end,
bkey_s_c_to_backpointer(k),
&last_flushed_pos));
@@ -854,7 +868,6 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c)
}
bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h
index ab866feeaf66..327365a9feac 100644
--- a/fs/bcachefs/backpointers.h
+++ b/fs/bcachefs/backpointers.h
@@ -2,6 +2,7 @@
#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H
#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H
+#include "btree_cache.h"
#include "btree_iter.h"
#include "btree_update.h"
#include "buckets.h"
@@ -63,7 +64,7 @@ static inline struct bpos bucket_pos_to_bp(const struct bch_fs *c,
return ret;
}
-int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bkey_i_backpointer *,
+int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *, struct bpos bucket,
struct bch_backpointer, struct bkey_s_c, bool);
static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
@@ -72,28 +73,21 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans,
struct bkey_s_c orig_k,
bool insert)
{
- struct bch_fs *c = trans->c;
- struct bkey_i_backpointer *bp_k;
- int ret;
+ if (unlikely(bch2_backpointers_no_use_write_buffer))
+ return bch2_bucket_backpointer_mod_nowritebuffer(trans, bucket, bp, orig_k, insert);
- bp_k = bch2_trans_kmalloc_nomemzero(trans, sizeof(struct bkey_i_backpointer));
- ret = PTR_ERR_OR_ZERO(bp_k);
- if (ret)
- return ret;
+ struct bkey_i_backpointer bp_k;
- bkey_backpointer_init(&bp_k->k_i);
- bp_k->k.p = bucket_pos_to_bp(c, bucket, bp.bucket_offset);
- bp_k->v = bp;
+ bkey_backpointer_init(&bp_k.k_i);
+ bp_k.k.p = bucket_pos_to_bp(trans->c, bucket, bp.bucket_offset);
+ bp_k.v = bp;
if (!insert) {
- bp_k->k.type = KEY_TYPE_deleted;
- set_bkey_val_u64s(&bp_k->k, 0);
+ bp_k.k.type = KEY_TYPE_deleted;
+ set_bkey_val_u64s(&bp_k.k, 0);
}
- if (unlikely(bch2_backpointers_no_use_write_buffer))
- return bch2_bucket_backpointer_mod_nowritebuffer(trans, bp_k, bp, orig_k, insert);
-
- return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k->k_i);
+ return bch2_trans_update_buffered(trans, BTREE_ID_backpointers, &bp_k.k_i);
}
static inline enum bch_data_type bkey_ptr_data_type(enum btree_id btree_id, unsigned level,
diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h
index b62737fdf5ab..69d0d60d50e3 100644
--- a/fs/bcachefs/bcachefs.h
+++ b/fs/bcachefs/bcachefs.h
@@ -193,6 +193,7 @@
#include <linux/mutex.h>
#include <linux/percpu-refcount.h>
#include <linux/percpu-rwsem.h>
+#include <linux/refcount.h>
#include <linux/rhashtable.h>
#include <linux/rwsem.h>
#include <linux/semaphore.h>
@@ -223,9 +224,11 @@
#define race_fault(...) dynamic_fault("bcachefs:race")
+#define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name])
+
#define trace_and_count(_c, _name, ...) \
do { \
- this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]); \
+ count_event(_c, _name); \
trace_##_name(__VA_ARGS__); \
} while (0)
@@ -262,46 +265,76 @@ do { \
#define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n")
+__printf(2, 3)
+void __bch2_print(struct bch_fs *c, const char *fmt, ...);
+
+#define maybe_dev_to_fs(_c) _Generic((_c), \
+ struct bch_dev *: ((struct bch_dev *) (_c))->fs, \
+ struct bch_fs *: (_c))
+
+#define bch2_print(_c, ...) __bch2_print(maybe_dev_to_fs(_c), __VA_ARGS__)
+
+#define bch2_print_ratelimited(_c, ...) \
+do { \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ \
+ if (__ratelimit(&_rs)) \
+ bch2_print(_c, __VA_ARGS__); \
+} while (0)
+
#define bch_info(c, fmt, ...) \
- printk(KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_notice(c, fmt, ...) \
- printk(KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_NOTICE bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn(c, fmt, ...) \
- printk(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_warn_ratelimited(c, fmt, ...) \
- printk_ratelimited(KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_WARNING bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err(c, fmt, ...) \
- printk(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err_dev(ca, fmt, ...) \
- printk(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
#define bch_err_dev_offset(ca, _offset, fmt, ...) \
- printk(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
#define bch_err_inum(c, _inum, fmt, ...) \
- printk(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
#define bch_err_inum_offset(c, _inum, _offset, fmt, ...) \
- printk(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+ bch2_print(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
#define bch_err_ratelimited(c, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt(c, fmt), ##__VA_ARGS__)
#define bch_err_dev_ratelimited(ca, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev(ca, fmt), ##__VA_ARGS__)
#define bch_err_dev_offset_ratelimited(ca, _offset, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(ca, KERN_ERR bch2_fmt_dev_offset(ca, _offset, fmt), ##__VA_ARGS__)
#define bch_err_inum_ratelimited(c, _inum, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum(c, _inum, fmt), ##__VA_ARGS__)
#define bch_err_inum_offset_ratelimited(c, _inum, _offset, fmt, ...) \
- printk_ratelimited(KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+ bch2_print_ratelimited(c, KERN_ERR bch2_fmt_inum_offset(c, _inum, _offset, fmt), ##__VA_ARGS__)
+
+static inline bool should_print_err(int err)
+{
+ return err && !bch2_err_matches(err, BCH_ERR_transaction_restart);
+}
#define bch_err_fn(_c, _ret) \
do { \
- if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ if (should_print_err(_ret)) \
bch_err(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
} while (0)
+#define bch_err_fn_ratelimited(_c, _ret) \
+do { \
+ if (should_print_err(_ret)) \
+ bch_err_ratelimited(_c, "%s(): error %s", __func__, bch2_err_str(_ret));\
+} while (0)
+
#define bch_err_msg(_c, _ret, _msg, ...) \
do { \
- if (_ret && !bch2_err_matches(_ret, BCH_ERR_transaction_restart))\
+ if (should_print_err(_ret)) \
bch_err(_c, "%s(): error " _msg " %s", __func__, \
##__VA_ARGS__, bch2_err_str(_ret)); \
} while (0)
@@ -392,6 +425,7 @@ BCH_DEBUG_PARAMS_DEBUG()
x(btree_node_merge) \
x(btree_node_sort) \
x(btree_node_read) \
+ x(btree_node_read_done) \
x(btree_interior_update_foreground) \
x(btree_interior_update_total) \
x(btree_gc) \
@@ -401,9 +435,12 @@ BCH_DEBUG_PARAMS_DEBUG()
x(journal_flush_write) \
x(journal_noflush_write) \
x(journal_flush_seq) \
- x(blocked_journal) \
+ x(blocked_journal_low_on_space) \
+ x(blocked_journal_low_on_pin) \
+ x(blocked_journal_max_in_flight) \
x(blocked_allocate) \
x(blocked_allocate_open_bucket) \
+ x(blocked_write_buffer_full) \
x(nocow_lock_contended)
enum bch_time_stats {
@@ -428,6 +465,7 @@ enum bch_time_stats {
#include "replicas_types.h"
#include "subvolume_types.h"
#include "super_types.h"
+#include "thread_with_file_types.h"
/* Number of nodes btree coalesce will try to coalesce at once */
#define GC_MERGE_NODES 4U
@@ -564,32 +602,35 @@ struct bch_dev {
struct io_count __percpu *io_done;
};
-enum {
- /* startup: */
- BCH_FS_STARTED,
- BCH_FS_MAY_GO_RW,
- BCH_FS_RW,
- BCH_FS_WAS_RW,
-
- /* shutdown: */
- BCH_FS_STOPPING,
- BCH_FS_EMERGENCY_RO,
- BCH_FS_GOING_RO,
- BCH_FS_WRITE_DISABLE_COMPLETE,
- BCH_FS_CLEAN_SHUTDOWN,
-
- /* fsck passes: */
- BCH_FS_FSCK_DONE,
- BCH_FS_INITIAL_GC_UNFIXED, /* kill when we enumerate fsck errors */
- BCH_FS_NEED_ANOTHER_GC,
-
- BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS,
-
- /* errors: */
- BCH_FS_ERROR,
- BCH_FS_TOPOLOGY_ERROR,
- BCH_FS_ERRORS_FIXED,
- BCH_FS_ERRORS_NOT_FIXED,
+/*
+ * initial_gc_unfixed
+ * error
+ * topology error
+ */
+
+#define BCH_FS_FLAGS() \
+ x(started) \
+ x(may_go_rw) \
+ x(rw) \
+ x(was_rw) \
+ x(stopping) \
+ x(emergency_ro) \
+ x(going_ro) \
+ x(write_disable_complete) \
+ x(clean_shutdown) \
+ x(fsck_running) \
+ x(initial_gc_unfixed) \
+ x(need_another_gc) \
+ x(need_delete_dead_snapshots) \
+ x(error) \
+ x(topology_error) \
+ x(errors_fixed) \
+ x(errors_not_fixed)
+
+enum bch_fs_flags {
+#define x(n) BCH_FS_##n,
+ BCH_FS_FLAGS()
+#undef x
};
struct btree_debug {
@@ -599,10 +640,11 @@ struct btree_debug {
#define BCH_TRANSACTIONS_NR 128
struct btree_transaction_stats {
+ struct bch2_time_stats duration;
struct bch2_time_stats lock_hold_times;
struct mutex lock;
unsigned nr_max_paths;
- unsigned wb_updates_size;
+ unsigned journal_entries_size;
unsigned max_mem;
char *max_paths_text;
};
@@ -664,7 +706,8 @@ struct btree_trans_buf {
x(invalidate) \
x(delete_dead_snapshots) \
x(snapshot_delete_pagecache) \
- x(sysfs)
+ x(sysfs) \
+ x(btree_write_buffer)
enum bch_write_ref {
#define x(n) BCH_WRITE_REF_##n,
@@ -689,6 +732,8 @@ struct bch_fs {
struct super_block *vfs_sb;
dev_t dev;
char name[40];
+ struct stdio_redirect *stdio;
+ struct task_struct *stdio_filter;
/* ro/rw, add/remove/resize devices: */
struct rw_semaphore state_lock;
@@ -699,6 +744,13 @@ struct bch_fs {
#else
struct percpu_ref writes;
#endif
+ /*
+ * Analagous to c->writes, for asynchronous ops that don't necessarily
+ * need fs to be read-write
+ */
+ refcount_t ro_ref;
+ wait_queue_head_t ro_ref_wait;
+
struct work_struct read_only_work;
struct bch_dev __rcu *devs[BCH_SB_MEMBERS_MAX];
@@ -1002,10 +1054,21 @@ struct bch_fs {
/* RECOVERY */
u64 journal_replay_seq_start;
u64 journal_replay_seq_end;
+ /*
+ * Two different uses:
+ * "Has this fsck pass?" - i.e. should this type of error be an
+ * emergency read-only
+ * And, in certain situations fsck will rewind to an earlier pass: used
+ * for signaling to the toplevel code which pass we want to run now.
+ */
enum bch_recovery_pass curr_recovery_pass;
/* bitmap of explicitly enabled recovery passes: */
u64 recovery_passes_explicit;
+ /* bitmask of recovery passes that we actually ran */
u64 recovery_passes_complete;
+ /* never rewinds version of curr_recovery_pass */
+ enum bch_recovery_pass recovery_pass_done;
+ struct semaphore online_fsck_mutex;
/* DEBUG JUNK */
struct dentry *fs_debug_dir;
@@ -1065,10 +1128,20 @@ static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref)
#endif
}
+static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
+{
+#ifdef BCH_WRITE_REF_DEBUG
+ return !test_bit(BCH_FS_going_ro, &c->flags) &&
+ atomic_long_inc_not_zero(&c->writes[ref]);
+#else
+ return percpu_ref_tryget(&c->writes);
+#endif
+}
+
static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref)
{
#ifdef BCH_WRITE_REF_DEBUG
- return !test_bit(BCH_FS_GOING_RO, &c->flags) &&
+ return !test_bit(BCH_FS_going_ro, &c->flags) &&
atomic_long_inc_not_zero(&c->writes[ref]);
#else
return percpu_ref_tryget_live(&c->writes);
@@ -1087,13 +1160,27 @@ static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref)
if (atomic_long_read(&c->writes[i]))
return;
- set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ set_bit(BCH_FS_write_disable_complete, &c->flags);
wake_up(&bch2_read_only_wait);
#else
percpu_ref_put(&c->writes);
#endif
}
+static inline bool bch2_ro_ref_tryget(struct bch_fs *c)
+{
+ if (test_bit(BCH_FS_stopping, &c->flags))
+ return false;
+
+ return refcount_inc_not_zero(&c->ro_ref);
+}
+
+static inline void bch2_ro_ref_put(struct bch_fs *c)
+{
+ if (refcount_dec_and_test(&c->ro_ref))
+ wake_up(&c->ro_ref_wait);
+}
+
static inline void bch2_set_ra_pages(struct bch_fs *c, unsigned ra_pages)
{
#ifndef NO_BCACHEFS_FS
@@ -1117,11 +1204,6 @@ static inline unsigned block_sectors(const struct bch_fs *c)
return c->opts.block_size >> 9;
}
-static inline size_t btree_sectors(const struct bch_fs *c)
-{
- return c->opts.btree_node_size >> 9;
-}
-
static inline bool btree_id_cached(const struct bch_fs *c, enum btree_id btree)
{
return c->btree_key_cache_btrees & (1U << btree);
@@ -1158,6 +1240,27 @@ static inline bool bch2_dev_exists2(const struct bch_fs *c, unsigned dev)
return dev < c->sb.nr_devices && c->devs[dev];
}
+static inline struct stdio_redirect *bch2_fs_stdio_redirect(struct bch_fs *c)
+{
+ struct stdio_redirect *stdio = c->stdio;
+
+ if (c->stdio_filter && c->stdio_filter != current)
+ stdio = NULL;
+ return stdio;
+}
+
+static inline unsigned metadata_replicas_required(struct bch_fs *c)
+{
+ return min(c->opts.metadata_replicas,
+ c->opts.metadata_replicas_required);
+}
+
+static inline unsigned data_replicas_required(struct bch_fs *c)
+{
+ return min(c->opts.data_replicas,
+ c->opts.data_replicas_required);
+}
+
#define BKEY_PADDED_ONSTACK(key, pad) \
struct { struct bkey_i key; __u64 key ## _pad[pad]; }
diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h
index fe78e87603fc..0668b682a21c 100644
--- a/fs/bcachefs/bcachefs_format.h
+++ b/fs/bcachefs/bcachefs_format.h
@@ -307,6 +307,13 @@ struct bkey_i {
struct bch_val v;
};
+#define POS_KEY(_pos) \
+((struct bkey) { \
+ .u64s = BKEY_U64s, \
+ .format = KEY_FORMAT_CURRENT, \
+ .p = _pos, \
+})
+
#define KEY(_inode, _offset, _size) \
((struct bkey) { \
.u64s = BKEY_U64s, \
@@ -410,600 +417,12 @@ struct bch_set {
struct bch_val v;
};
-/* Extents */
-
-/*
- * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
- * preceded by checksum/compression information (bch_extent_crc32 or
- * bch_extent_crc64).
- *
- * One major determining factor in the format of extents is how we handle and
- * represent extents that have been partially overwritten and thus trimmed:
- *
- * If an extent is not checksummed or compressed, when the extent is trimmed we
- * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the data that is currently
- * live. The size field in struct bkey records the current (live) size of the
- * extent, and is also used to mean "size of region on disk that we point to" in
- * this case.
- *
- * Thus an extent that is not checksummed or compressed will consist only of a
- * list of bch_extent_ptrs, with none of the fields in
- * bch_extent_crc32/bch_extent_crc64.
- *
- * When an extent is checksummed or compressed, it's not possible to read only
- * the data that is currently live: we have to read the entire extent that was
- * originally written, and then return only the part of the extent that is
- * currently live.
- *
- * Thus, in addition to the current size of the extent in struct bkey, we need
- * to store the size of the originally allocated space - this is the
- * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
- * when the extent is trimmed, instead of modifying the offset field of the
- * pointer, we keep a second smaller offset field - "offset into the original
- * extent of the currently live region".
- *
- * The other major determining factor is replication and data migration:
- *
- * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
- * write, we will initially write all the replicas in the same format, with the
- * same checksum type and compression format - however, when copygc runs later (or
- * tiering/cache promotion, anything that moves data), it is not in general
- * going to rewrite all the pointers at once - one of the replicas may be in a
- * bucket on one device that has very little fragmentation while another lives
- * in a bucket that has become heavily fragmented, and thus is being rewritten
- * sooner than the rest.
- *
- * Thus it will only move a subset of the pointers (or in the case of
- * tiering/cache promotion perhaps add a single pointer without dropping any
- * current pointers), and if the extent has been partially overwritten it must
- * write only the currently live portion (or copygc would not be able to reduce
- * fragmentation!) - which necessitates a different bch_extent_crc format for
- * the new pointer.
- *
- * But in the interests of space efficiency, we don't want to store one
- * bch_extent_crc for each pointer if we don't have to.
- *
- * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
- * bch_extent_ptrs appended arbitrarily one after the other. We determine the
- * type of a given entry with a scheme similar to utf8 (except we're encoding a
- * type, not a size), encoding the type in the position of the first set bit:
- *
- * bch_extent_crc32 - 0b1
- * bch_extent_ptr - 0b10
- * bch_extent_crc64 - 0b100
- *
- * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
- * bch_extent_crc64 is the least constrained).
- *
- * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
- * until the next bch_extent_crc32/64.
- *
- * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
- * is neither checksummed nor compressed.
- */
-
/* 128 bits, sufficient for cryptographic MACs: */
struct bch_csum {
__le64 lo;
__le64 hi;
} __packed __aligned(8);
-#define BCH_EXTENT_ENTRY_TYPES() \
- x(ptr, 0) \
- x(crc32, 1) \
- x(crc64, 2) \
- x(crc128, 3) \
- x(stripe_ptr, 4) \
- x(rebalance, 5)
-#define BCH_EXTENT_ENTRY_MAX 6
-
-enum bch_extent_entry_type {
-#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-/* Compressed/uncompressed size are stored biased by 1: */
-struct bch_extent_crc32 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u32 type:2,
- _compressed_size:7,
- _uncompressed_size:7,
- offset:7,
- _unused:1,
- csum_type:4,
- compression_type:4;
- __u32 csum;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u32 csum;
- __u32 compression_type:4,
- csum_type:4,
- _unused:1,
- offset:7,
- _uncompressed_size:7,
- _compressed_size:7,
- type:2;
-#endif
-} __packed __aligned(8);
-
-#define CRC32_SIZE_MAX (1U << 7)
-#define CRC32_NONCE_MAX 0
-
-struct bch_extent_crc64 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:3,
- _compressed_size:9,
- _uncompressed_size:9,
- offset:9,
- nonce:10,
- csum_type:4,
- compression_type:4,
- csum_hi:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 csum_hi:16,
- compression_type:4,
- csum_type:4,
- nonce:10,
- offset:9,
- _uncompressed_size:9,
- _compressed_size:9,
- type:3;
-#endif
- __u64 csum_lo;
-} __packed __aligned(8);
-
-#define CRC64_SIZE_MAX (1U << 9)
-#define CRC64_NONCE_MAX ((1U << 10) - 1)
-
-struct bch_extent_crc128 {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:4,
- _compressed_size:13,
- _uncompressed_size:13,
- offset:13,
- nonce:13,
- csum_type:4,
- compression_type:4;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 compression_type:4,
- csum_type:4,
- nonce:13,
- offset:13,
- _uncompressed_size:13,
- _compressed_size:13,
- type:4;
-#endif
- struct bch_csum csum;
-} __packed __aligned(8);
-
-#define CRC128_SIZE_MAX (1U << 13)
-#define CRC128_NONCE_MAX ((1U << 13) - 1)
-
-/*
- * @reservation - pointer hasn't been written to, just reserved
- */
-struct bch_extent_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:1,
- cached:1,
- unused:1,
- unwritten:1,
- offset:44, /* 8 petabytes */
- dev:8,
- gen:8;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 gen:8,
- dev:8,
- offset:44,
- unwritten:1,
- unused:1,
- cached:1,
- type:1;
-#endif
-} __packed __aligned(8);
-
-struct bch_extent_stripe_ptr {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:5,
- block:8,
- redundancy:4,
- idx:47;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 idx:47,
- redundancy:4,
- block:8,
- type:5;
-#endif
-};
-
-struct bch_extent_rebalance {
-#if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 type:6,
- unused:34,
- compression:8, /* enum bch_compression_opt */
- target:16;
-#elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 target:16,
- compression:8,
- unused:34,
- type:6;
-#endif
-};
-
-union bch_extent_entry {
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
- unsigned long type;
-#elif __BITS_PER_LONG == 32
- struct {
- unsigned long pad;
- unsigned long type;
- };
-#else
-#error edit for your odd byteorder.
-#endif
-
-#define x(f, n) struct bch_extent_##f f;
- BCH_EXTENT_ENTRY_TYPES()
-#undef x
-};
-
-struct bch_btree_ptr {
- struct bch_val v;
-
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-struct bch_btree_ptr_v2 {
- struct bch_val v;
-
- __u64 mem_ptr;
- __le64 seq;
- __le16 sectors_written;
- __le16 flags;
- struct bpos min_key;
- __u64 _data[0];
- struct bch_extent_ptr start[];
-} __packed __aligned(8);
-
-LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
-
-struct bch_extent {
- struct bch_val v;
-
- __u64 _data[0];
- union bch_extent_entry start[];
-} __packed __aligned(8);
-
-struct bch_reservation {
- struct bch_val v;
-
- __le32 generation;
- __u8 nr_replicas;
- __u8 pad[3];
-} __packed __aligned(8);
-
-/* Maximum size (in u64s) a single pointer could be: */
-#define BKEY_EXTENT_PTR_U64s_MAX\
- ((sizeof(struct bch_extent_crc128) + \
- sizeof(struct bch_extent_ptr)) / sizeof(__u64))
-
-/* Maximum possible size of an entire extent value: */
-#define BKEY_EXTENT_VAL_U64s_MAX \
- (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
-
-/* * Maximum possible size of an entire extent, key + value: */
-#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
-
-/* Btree pointers don't carry around checksums: */
-#define BKEY_BTREE_PTR_VAL_U64s_MAX \
- ((sizeof(struct bch_btree_ptr_v2) + \
- sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
-#define BKEY_BTREE_PTR_U64s_MAX \
- (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
-
-/* Inodes */
-
-#define BLOCKDEV_INODE_MAX 4096
-
-#define BCACHEFS_ROOT_INO 4096
-
-struct bch_inode {
- struct bch_val v;
-
- __le64 bi_hash_seed;
- __le32 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v2 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le16 bi_mode;
- __u8 fields[];
-} __packed __aligned(8);
-
-struct bch_inode_v3 {
- struct bch_val v;
-
- __le64 bi_journal_seq;
- __le64 bi_hash_seed;
- __le64 bi_flags;
- __le64 bi_sectors;
- __le64 bi_size;
- __le64 bi_version;
- __u8 fields[];
-} __packed __aligned(8);
-
-#define INODEv3_FIELDS_START_INITIAL 6
-#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
-
-struct bch_inode_generation {
- struct bch_val v;
-
- __le32 bi_generation;
- __le32 pad;
-} __packed __aligned(8);
-
-/*
- * bi_subvol and bi_parent_subvol are only set for subvolume roots:
- */
-
-#define BCH_INODE_FIELDS_v2() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_size, 64) \
- x(bi_sectors, 64) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32)
-
-#define BCH_INODE_FIELDS_v3() \
- x(bi_atime, 96) \
- x(bi_ctime, 96) \
- x(bi_mtime, 96) \
- x(bi_otime, 96) \
- x(bi_uid, 32) \
- x(bi_gid, 32) \
- x(bi_nlink, 32) \
- x(bi_generation, 32) \
- x(bi_dev, 32) \
- x(bi_data_checksum, 8) \
- x(bi_compression, 8) \
- x(bi_project, 32) \
- x(bi_background_compression, 8) \
- x(bi_data_replicas, 8) \
- x(bi_promote_target, 16) \
- x(bi_foreground_target, 16) \
- x(bi_background_target, 16) \
- x(bi_erasure_code, 16) \
- x(bi_fields_set, 16) \
- x(bi_dir, 64) \
- x(bi_dir_offset, 64) \
- x(bi_subvol, 32) \
- x(bi_parent_subvol, 32) \
- x(bi_nocow, 8)
-
-/* subset of BCH_INODE_FIELDS */
-#define BCH_INODE_OPTS() \
- x(data_checksum, 8) \
- x(compression, 8) \
- x(project, 32) \
- x(background_compression, 8) \
- x(data_replicas, 8) \
- x(promote_target, 16) \
- x(foreground_target, 16) \
- x(background_target, 16) \
- x(erasure_code, 16) \
- x(nocow, 8)
-
-enum inode_opt_id {
-#define x(name, ...) \
- Inode_opt_##name,
- BCH_INODE_OPTS()
-#undef x
- Inode_opt_nr,
-};
-
-#define BCH_INODE_FLAGS() \
- x(sync, 0) \
- x(immutable, 1) \
- x(append, 2) \
- x(nodump, 3) \
- x(noatime, 4) \
- x(i_size_dirty, 5) \
- x(i_sectors_dirty, 6) \
- x(unlinked, 7) \
- x(backptr_untrusted, 8)
-
-/* bits 20+ reserved for packed fields below: */
-
-enum bch_inode_flags {
-#define x(t, n) BCH_INODE_##t = 1U << n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-enum __bch_inode_flags {
-#define x(t, n) __BCH_INODE_##t = n,
- BCH_INODE_FLAGS()
-#undef x
-};
-
-LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
-LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
-LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
-
-LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
-LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
-LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
-
-LE64_BITMASK(INODEv3_FIELDS_START,
- struct bch_inode_v3, bi_flags, 31, 36);
-LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
-
-/* Dirents */
-
-/*
- * Dirents (and xattrs) have to implement string lookups; since our b-tree
- * doesn't support arbitrary length strings for the key, we instead index by a
- * 64 bit hash (currently truncated sha1) of the string, stored in the offset
- * field of the key - using linear probing to resolve hash collisions. This also
- * provides us with the readdir cookie posix requires.
- *
- * Linear probing requires us to use whiteouts for deletions, in the event of a
- * collision:
- */
-
-struct bch_dirent {
- struct bch_val v;
-
- /* Target inode number: */
- union {
- __le64 d_inum;
- struct { /* DT_SUBVOL */
- __le32 d_child_subvol;
- __le32 d_parent_subvol;
- };
- };
-
- /*
- * Copy of mode bits 12-15 from the target inode - so userspace can get
- * the filetype without having to do a stat()
- */
- __u8 d_type;
-
- __u8 d_name[];
-} __packed __aligned(8);
-
-#define DT_SUBVOL 16
-#define BCH_DT_MAX 17
-
-#define BCH_NAME_MAX 512
-
-/* Xattrs */
-
-#define KEY_TYPE_XATTR_INDEX_USER 0
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
-#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
-#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
-#define KEY_TYPE_XATTR_INDEX_SECURITY 4
-
-struct bch_xattr {
- struct bch_val v;
- __u8 x_type;
- __u8 x_name_len;
- __le16 x_val_len;
- __u8 x_name[];
-} __packed __aligned(8);
-
-/* Bucket/allocation information: */
-
-struct bch_alloc {
- struct bch_val v;
- __u8 fields;
- __u8 gen;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V1() \
- x(read_time, 16) \
- x(write_time, 16) \
- x(data_type, 8) \
- x(dirty_sectors, 16) \
- x(cached_sectors, 16) \
- x(oldest_gen, 8) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-enum {
-#define x(name, _bits) BCH_ALLOC_FIELD_V1_##name,
- BCH_ALLOC_FIELDS_V1()
-#undef x
-};
-
-struct bch_alloc_v2 {
- struct bch_val v;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-#define BCH_ALLOC_FIELDS_V2() \
- x(read_time, 64) \
- x(write_time, 64) \
- x(dirty_sectors, 32) \
- x(cached_sectors, 32) \
- x(stripe, 32) \
- x(stripe_redundancy, 8)
-
-struct bch_alloc_v3 {
- struct bch_val v;
- __le64 journal_seq;
- __le32 flags;
- __u8 nr_fields;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 data[];
-} __packed __aligned(8);
-
-LE32_BITMASK(BCH_ALLOC_V3_NEED_DISCARD,struct bch_alloc_v3, flags, 0, 1)
-LE32_BITMASK(BCH_ALLOC_V3_NEED_INC_GEN,struct bch_alloc_v3, flags, 1, 2)
-
-struct bch_alloc_v4 {
- struct bch_val v;
- __u64 journal_seq;
- __u32 flags;
- __u8 gen;
- __u8 oldest_gen;
- __u8 data_type;
- __u8 stripe_redundancy;
- __u32 dirty_sectors;
- __u32 cached_sectors;
- __u64 io_time[2];
- __u32 stripe;
- __u32 nr_external_backpointers;
- __u64 fragmentation_lru;
-} __packed __aligned(8);
-
-#define BCH_ALLOC_V4_U64s_V0 6
-#define BCH_ALLOC_V4_U64s (sizeof(struct bch_alloc_v4) / sizeof(__u64))
-
-BITMASK(BCH_ALLOC_V4_NEED_DISCARD, struct bch_alloc_v4, flags, 0, 1)
-BITMASK(BCH_ALLOC_V4_NEED_INC_GEN, struct bch_alloc_v4, flags, 1, 2)
-BITMASK(BCH_ALLOC_V4_BACKPOINTERS_START,struct bch_alloc_v4, flags, 2, 8)
-BITMASK(BCH_ALLOC_V4_NR_BACKPOINTERS, struct bch_alloc_v4, flags, 8, 14)
-
-#define BCH_ALLOC_V4_NR_BACKPOINTERS_MAX 40
-
struct bch_backpointer {
struct bch_val v;
__u8 btree_id;
@@ -1014,154 +433,6 @@ struct bch_backpointer {
struct bpos pos;
} __packed __aligned(8);
-#define KEY_TYPE_BUCKET_GENS_BITS 8
-#define KEY_TYPE_BUCKET_GENS_NR (1U << KEY_TYPE_BUCKET_GENS_BITS)
-#define KEY_TYPE_BUCKET_GENS_MASK (KEY_TYPE_BUCKET_GENS_NR - 1)
-
-struct bch_bucket_gens {
- struct bch_val v;
- u8 gens[KEY_TYPE_BUCKET_GENS_NR];
-} __packed __aligned(8);
-
-/* Quotas: */
-
-enum quota_types {
- QTYP_USR = 0,
- QTYP_GRP = 1,
- QTYP_PRJ = 2,
- QTYP_NR = 3,
-};
-
-enum quota_counters {
- Q_SPC = 0,
- Q_INO = 1,
- Q_COUNTERS = 2,
-};
-
-struct bch_quota_counter {
- __le64 hardlimit;
- __le64 softlimit;
-};
-
-struct bch_quota {
- struct bch_val v;
- struct bch_quota_counter c[Q_COUNTERS];
-} __packed __aligned(8);
-
-/* Erasure coding */
-
-struct bch_stripe {
- struct bch_val v;
- __le16 sectors;
- __u8 algorithm;
- __u8 nr_blocks;
- __u8 nr_redundant;
-
- __u8 csum_granularity_bits;
- __u8 csum_type;
- __u8 pad;
-
- struct bch_extent_ptr ptrs[];
-} __packed __aligned(8);
-
-/* Reflink: */
-
-struct bch_reflink_p {
- struct bch_val v;
- __le64 idx;
- /*
- * A reflink pointer might point to an indirect extent which is then
- * later split (by copygc or rebalance). If we only pointed to part of
- * the original indirect extent, and then one of the fragments is
- * outside the range we point to, we'd leak a refcount: so when creating
- * reflink pointers, we need to store pad values to remember the full
- * range we were taking a reference on.
- */
- __le32 front_pad;
- __le32 back_pad;
-} __packed __aligned(8);
-
-struct bch_reflink_v {
- struct bch_val v;
- __le64 refcount;
- union bch_extent_entry start[0];
- __u64 _data[];
-} __packed __aligned(8);
-
-struct bch_indirect_inline_data {
- struct bch_val v;
- __le64 refcount;
- u8 data[];
-};
-
-/* Inline data */
-
-struct bch_inline_data {
- struct bch_val v;
- u8 data[];
-};
-
-/* Subvolumes: */
-
-#define SUBVOL_POS_MIN POS(0, 1)
-#define SUBVOL_POS_MAX POS(0, S32_MAX)
-#define BCACHEFS_ROOT_SUBVOL 1
-
-struct bch_subvolume {
- struct bch_val v;
- __le32 flags;
- __le32 snapshot;
- __le64 inode;
- /*
- * Snapshot subvolumes form a tree, separate from the snapshot nodes
- * tree - if this subvolume is a snapshot, this is the ID of the
- * subvolume it was created from:
- */
- __le32 parent;
- __le32 pad;
- bch_le128 otime;
-};
-
-LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
-/*
- * We need to know whether a subvolume is a snapshot so we can know whether we
- * can delete it (or whether it should just be rm -rf'd)
- */
-LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
-LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
-
-/* Snapshots */
-
-struct bch_snapshot {
- struct bch_val v;
- __le32 flags;
- __le32 parent;
- __le32 children[2];
- __le32 subvol;
- /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
- __le32 tree;
- __le32 depth;
- __le32 skip[3];
-};
-
-LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
-
-/* True if a subvolume points to this snapshot node: */
-LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
-
-/*
- * Snapshot trees:
- *
- * The snapshot_trees btree gives us persistent indentifier for each tree of
- * bch_snapshot nodes, and allow us to record and easily find the root/master
- * subvolume that other snapshots were created from:
- */
-struct bch_snapshot_tree {
- struct bch_val v;
- __le32 master_subvol;
- __le32 root_snapshot;
-};
-
/* LRU btree: */
struct bch_lru {
@@ -1171,33 +442,6 @@ struct bch_lru {
#define LRU_ID_STRIPES (1U << 16)
-/* Logged operations btree: */
-
-struct bch_logged_op_truncate {
- struct bch_val v;
- __le32 subvol;
- __le32 pad;
- __le64 inum;
- __le64 new_i_size;
-};
-
-enum logged_op_finsert_state {
- LOGGED_OP_FINSERT_start,
- LOGGED_OP_FINSERT_shift_extents,
- LOGGED_OP_FINSERT_finish,
-};
-
-struct bch_logged_op_finsert {
- struct bch_val v;
- __u8 state;
- __u8 pad[3];
- __le32 subvol;
- __le64 inum;
- __le64 dst_offset;
- __le64 src_offset;
- __le64 pos;
-};
-
/* Optional/variable size superblock sections: */
struct bch_sb_field {
@@ -1223,6 +467,19 @@ struct bch_sb_field {
x(ext, 13) \
x(downgrade, 14)
+#include "alloc_background_format.h"
+#include "extents_format.h"
+#include "reflink_format.h"
+#include "ec_format.h"
+#include "inode_format.h"
+#include "dirent_format.h"
+#include "xattr_format.h"
+#include "quota_format.h"
+#include "logged_ops_format.h"
+#include "snapshot_format.h"
+#include "subvolume_format.h"
+#include "sb-counters_format.h"
+
enum bch_sb_field_type {
#define x(f, nr) BCH_SB_FIELD_##f = nr,
BCH_SB_FIELDS()
@@ -1296,6 +553,7 @@ struct bch_member {
__le64 errors[BCH_MEMBER_ERROR_NR];
__le64 errors_at_reset[BCH_MEMBER_ERROR_NR];
__le64 errors_reset_time;
+ __le64 seq;
};
#define BCH_MEMBER_V1_BYTES 56
@@ -1442,7 +700,7 @@ struct bch_sb_field_replicas_v0 {
struct bch_replicas_entry_v0 entries[];
} __packed __aligned(8);
-struct bch_replicas_entry {
+struct bch_replicas_entry_v1 {
__u8 data_type;
__u8 nr_devs;
__u8 nr_required;
@@ -1454,24 +712,7 @@ struct bch_replicas_entry {
struct bch_sb_field_replicas {
struct bch_sb_field field;
- struct bch_replicas_entry entries[];
-} __packed __aligned(8);
-
-/* BCH_SB_FIELD_quota: */
-
-struct bch_sb_quota_counter {
- __le32 timelimit;
- __le32 warnlimit;
-};
-
-struct bch_sb_quota_type {
- __le64 flags;
- struct bch_sb_quota_counter c[Q_COUNTERS];
-};
-
-struct bch_sb_field_quota {
- struct bch_sb_field field;
- struct bch_sb_quota_type q[QTYP_NR];
+ struct bch_replicas_entry_v1 entries[];
} __packed __aligned(8);
/* BCH_SB_FIELD_disk_groups: */
@@ -1492,99 +733,6 @@ struct bch_sb_field_disk_groups {
struct bch_disk_group entries[];
} __packed __aligned(8);
-/* BCH_SB_FIELD_counters */
-
-#define BCH_PERSISTENT_COUNTERS() \
- x(io_read, 0) \
- x(io_write, 1) \
- x(io_move, 2) \
- x(bucket_invalidate, 3) \
- x(bucket_discard, 4) \
- x(bucket_alloc, 5) \
- x(bucket_alloc_fail, 6) \
- x(btree_cache_scan, 7) \
- x(btree_cache_reap, 8) \
- x(btree_cache_cannibalize, 9) \
- x(btree_cache_cannibalize_lock, 10) \
- x(btree_cache_cannibalize_lock_fail, 11) \
- x(btree_cache_cannibalize_unlock, 12) \
- x(btree_node_write, 13) \
- x(btree_node_read, 14) \
- x(btree_node_compact, 15) \
- x(btree_node_merge, 16) \
- x(btree_node_split, 17) \
- x(btree_node_rewrite, 18) \
- x(btree_node_alloc, 19) \
- x(btree_node_free, 20) \
- x(btree_node_set_root, 21) \
- x(btree_path_relock_fail, 22) \
- x(btree_path_upgrade_fail, 23) \
- x(btree_reserve_get_fail, 24) \
- x(journal_entry_full, 25) \
- x(journal_full, 26) \
- x(journal_reclaim_finish, 27) \
- x(journal_reclaim_start, 28) \
- x(journal_write, 29) \
- x(read_promote, 30) \
- x(read_bounce, 31) \
- x(read_split, 33) \
- x(read_retry, 32) \
- x(read_reuse_race, 34) \
- x(move_extent_read, 35) \
- x(move_extent_write, 36) \
- x(move_extent_finish, 37) \
- x(move_extent_fail, 38) \
- x(move_extent_start_fail, 39) \
- x(copygc, 40) \
- x(copygc_wait, 41) \
- x(gc_gens_end, 42) \
- x(gc_gens_start, 43) \
- x(trans_blocked_journal_reclaim, 44) \
- x(trans_restart_btree_node_reused, 45) \
- x(trans_restart_btree_node_split, 46) \
- x(trans_restart_fault_inject, 47) \
- x(trans_restart_iter_upgrade, 48) \
- x(trans_restart_journal_preres_get, 49) \
- x(trans_restart_journal_reclaim, 50) \
- x(trans_restart_journal_res_get, 51) \
- x(trans_restart_key_cache_key_realloced, 52) \
- x(trans_restart_key_cache_raced, 53) \
- x(trans_restart_mark_replicas, 54) \
- x(trans_restart_mem_realloced, 55) \
- x(trans_restart_memory_allocation_failure, 56) \
- x(trans_restart_relock, 57) \
- x(trans_restart_relock_after_fill, 58) \
- x(trans_restart_relock_key_cache_fill, 59) \
- x(trans_restart_relock_next_node, 60) \
- x(trans_restart_relock_parent_for_fill, 61) \
- x(trans_restart_relock_path, 62) \
- x(trans_restart_relock_path_intent, 63) \
- x(trans_restart_too_many_iters, 64) \
- x(trans_restart_traverse, 65) \
- x(trans_restart_upgrade, 66) \
- x(trans_restart_would_deadlock, 67) \
- x(trans_restart_would_deadlock_write, 68) \
- x(trans_restart_injected, 69) \
- x(trans_restart_key_cache_upgrade, 70) \
- x(trans_traverse_all, 71) \
- x(transaction_commit, 72) \
- x(write_super, 73) \
- x(trans_restart_would_deadlock_recursion_limit, 74) \
- x(trans_restart_write_buffer_flush, 75) \
- x(trans_restart_split_race, 76)
-
-enum bch_persistent_counters {
-#define x(t, n, ...) BCH_COUNTER_##t,
- BCH_PERSISTENT_COUNTERS()
-#undef x
- BCH_COUNTER_NR
-};
-
-struct bch_sb_field_counters {
- struct bch_sb_field field;
- __le64 d[];
-};
-
/*
* On clean shutdown, store btree roots and current journal sequence number in
* the superblock:
@@ -1662,69 +810,41 @@ struct bch_sb_field_downgrade {
#define BCH_VERSION_MINOR(_v) ((__u16) ((_v) & ~(~0U << 10)))
#define BCH_VERSION(_major, _minor) (((_major) << 10)|(_minor) << 0)
-#define RECOVERY_PASS_ALL_FSCK (1ULL << 63)
-
/*
* field 1: version name
* field 2: BCH_VERSION(major, minor)
* field 3: recovery passess required on upgrade
*/
#define BCH_METADATA_VERSIONS() \
- x(bkey_renumber, BCH_VERSION(0, 10), \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_btree_change, BCH_VERSION(0, 11), \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot, BCH_VERSION(0, 12), \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_backpointers, BCH_VERSION(0, 13), \
- RECOVERY_PASS_ALL_FSCK) \
- x(btree_ptr_sectors_written, BCH_VERSION(0, 14), \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot_2, BCH_VERSION(0, 15), \
- BIT_ULL(BCH_RECOVERY_PASS_fs_upgrade_for_subvolumes)| \
- BIT_ULL(BCH_RECOVERY_PASS_initialize_subvolumes)| \
- RECOVERY_PASS_ALL_FSCK) \
- x(reflink_p_fix, BCH_VERSION(0, 16), \
- BIT_ULL(BCH_RECOVERY_PASS_fix_reflink_p)) \
- x(subvol_dirent, BCH_VERSION(0, 17), \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_v2, BCH_VERSION(0, 18), \
- RECOVERY_PASS_ALL_FSCK) \
- x(freespace, BCH_VERSION(0, 19), \
- RECOVERY_PASS_ALL_FSCK) \
- x(alloc_v4, BCH_VERSION(0, 20), \
- RECOVERY_PASS_ALL_FSCK) \
- x(new_data_types, BCH_VERSION(0, 21), \
- RECOVERY_PASS_ALL_FSCK) \
- x(backpointers, BCH_VERSION(0, 22), \
- RECOVERY_PASS_ALL_FSCK) \
- x(inode_v3, BCH_VERSION(0, 23), \
- RECOVERY_PASS_ALL_FSCK) \
- x(unwritten_extents, BCH_VERSION(0, 24), \
- RECOVERY_PASS_ALL_FSCK) \
- x(bucket_gens, BCH_VERSION(0, 25), \
- BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \
- RECOVERY_PASS_ALL_FSCK) \
- x(lru_v2, BCH_VERSION(0, 26), \
- RECOVERY_PASS_ALL_FSCK) \
- x(fragmentation_lru, BCH_VERSION(0, 27), \
- RECOVERY_PASS_ALL_FSCK) \
- x(no_bps_in_alloc_keys, BCH_VERSION(0, 28), \
- RECOVERY_PASS_ALL_FSCK) \
- x(snapshot_trees, BCH_VERSION(0, 29), \
- RECOVERY_PASS_ALL_FSCK) \
- x(major_minor, BCH_VERSION(1, 0), \
- 0) \
- x(snapshot_skiplists, BCH_VERSION(1, 1), \
- BIT_ULL(BCH_RECOVERY_PASS_check_snapshots)) \
- x(deleted_inodes, BCH_VERSION(1, 2), \
- BIT_ULL(BCH_RECOVERY_PASS_check_inodes)) \
- x(rebalance_work, BCH_VERSION(1, 3), \
- BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
+ x(bkey_renumber, BCH_VERSION(0, 10)) \
+ x(inode_btree_change, BCH_VERSION(0, 11)) \
+ x(snapshot, BCH_VERSION(0, 12)) \
+ x(inode_backpointers, BCH_VERSION(0, 13)) \
+ x(btree_ptr_sectors_written, BCH_VERSION(0, 14)) \
+ x(snapshot_2, BCH_VERSION(0, 15)) \
+ x(reflink_p_fix, BCH_VERSION(0, 16)) \
+ x(subvol_dirent, BCH_VERSION(0, 17)) \
+ x(inode_v2, BCH_VERSION(0, 18)) \
+ x(freespace, BCH_VERSION(0, 19)) \
+ x(alloc_v4, BCH_VERSION(0, 20)) \
+ x(new_data_types, BCH_VERSION(0, 21)) \
+ x(backpointers, BCH_VERSION(0, 22)) \
+ x(inode_v3, BCH_VERSION(0, 23)) \
+ x(unwritten_extents, BCH_VERSION(0, 24)) \
+ x(bucket_gens, BCH_VERSION(0, 25)) \
+ x(lru_v2, BCH_VERSION(0, 26)) \
+ x(fragmentation_lru, BCH_VERSION(0, 27)) \
+ x(no_bps_in_alloc_keys, BCH_VERSION(0, 28)) \
+ x(snapshot_trees, BCH_VERSION(0, 29)) \
+ x(major_minor, BCH_VERSION(1, 0)) \
+ x(snapshot_skiplists, BCH_VERSION(1, 1)) \
+ x(deleted_inodes, BCH_VERSION(1, 2)) \
+ x(rebalance_work, BCH_VERSION(1, 3)) \
+ x(member_seq, BCH_VERSION(1, 4))
enum bcachefs_metadata_version {
bcachefs_metadata_version_min = 9,
-#define x(t, n, upgrade_passes) bcachefs_metadata_version_##t = n,
+#define x(t, n) bcachefs_metadata_version_##t = n,
BCH_METADATA_VERSIONS()
#undef x
bcachefs_metadata_version_max
@@ -1786,7 +906,8 @@ struct bch_sb {
__le32 time_base_hi;
__le32 time_precision;
- __le64 flags[8];
+ __le64 flags[7];
+ __le64 write_time;
__le64 features[2];
__le64 compat[2];
@@ -2153,7 +1274,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb)
x(clock, 7) \
x(dev_usage, 8) \
x(log, 9) \
- x(overwrite, 10)
+ x(overwrite, 10) \
+ x(write_buffer_keys, 11)
enum {
#define x(f, nr) BCH_JSET_ENTRY_##f = nr,
@@ -2162,6 +1284,19 @@ enum {
BCH_JSET_ENTRY_NR
};
+static inline bool jset_entry_is_key(struct jset_entry *e)
+{
+ switch (e->type) {
+ case BCH_JSET_ENTRY_btree_keys:
+ case BCH_JSET_ENTRY_btree_root:
+ case BCH_JSET_ENTRY_overwrite:
+ case BCH_JSET_ENTRY_write_buffer_keys:
+ return true;
+ }
+
+ return false;
+}
+
/*
* Journal sequence numbers can be blacklisted: bsets record the max sequence
* number of all the journal entries they contain updates for, so that on
@@ -2203,7 +1338,7 @@ struct jset_entry_usage {
struct jset_entry_data_usage {
struct jset_entry entry;
__le64 v;
- struct bch_replicas_entry r;
+ struct bch_replicas_entry_v1 r;
} __packed;
struct jset_entry_clock {
@@ -2224,8 +1359,8 @@ struct jset_entry_dev_usage {
__le32 dev;
__u32 pad;
- __le64 buckets_ec;
- __le64 _buckets_unavailable; /* No longer used */
+ __le64 _buckets_ec; /* No longer used */
+ __le64 _buckets_unavailable; /* No longer used */
struct jset_entry_dev_usage_type d[];
};
@@ -2239,7 +1374,7 @@ static inline unsigned jset_entry_dev_usage_nr_types(struct jset_entry_dev_usage
struct jset_entry_log {
struct jset_entry entry;
u8 d[];
-} __packed;
+} __packed __aligned(8);
/*
* On disk format for a journal entry:
diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h
index f05881f7e113..4b8fba754b1c 100644
--- a/fs/bcachefs/bcachefs_ioctl.h
+++ b/fs/bcachefs/bcachefs_ioctl.h
@@ -81,6 +81,11 @@ struct bch_ioctl_incremental {
#define BCH_IOCTL_SUBVOLUME_CREATE _IOW(0xbc, 16, struct bch_ioctl_subvolume)
#define BCH_IOCTL_SUBVOLUME_DESTROY _IOW(0xbc, 17, struct bch_ioctl_subvolume)
+#define BCH_IOCTL_DEV_USAGE_V2 _IOWR(0xbc, 18, struct bch_ioctl_dev_usage_v2)
+
+#define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline)
+#define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online)
+
/* ioctl below act on a particular file, not the filesystem as a whole: */
#define BCHFS_IOC_REINHERIT_ATTRS _IOR(0xbc, 64, const char __user *)
@@ -173,12 +178,18 @@ struct bch_ioctl_disk_set_state {
__u64 dev;
};
+#define BCH_DATA_OPS() \
+ x(scrub, 0) \
+ x(rereplicate, 1) \
+ x(migrate, 2) \
+ x(rewrite_old_nodes, 3) \
+ x(drop_extra_replicas, 4)
+
enum bch_data_ops {
- BCH_DATA_OP_SCRUB = 0,
- BCH_DATA_OP_REREPLICATE = 1,
- BCH_DATA_OP_MIGRATE = 2,
- BCH_DATA_OP_REWRITE_OLD_NODES = 3,
- BCH_DATA_OP_NR = 4,
+#define x(t, n) BCH_DATA_OP_##t = n,
+ BCH_DATA_OPS()
+#undef x
+ BCH_DATA_OP_NR
};
/*
@@ -237,7 +248,7 @@ struct bch_ioctl_data_event {
struct bch_replicas_usage {
__u64 sectors;
- struct bch_replicas_entry r;
+ struct bch_replicas_entry_v1 r;
} __packed;
static inline struct bch_replicas_usage *
@@ -268,7 +279,7 @@ struct bch_ioctl_fs_usage {
__u32 replica_entries_bytes;
__u32 pad;
- struct bch_replicas_usage replicas[0];
+ struct bch_replicas_usage replicas[];
};
/*
@@ -292,7 +303,20 @@ struct bch_ioctl_dev_usage {
__u64 buckets;
__u64 sectors;
__u64 fragmented;
- } d[BCH_DATA_NR];
+ } d[10];
+};
+
+struct bch_ioctl_dev_usage_v2 {
+ __u64 dev;
+ __u32 flags;
+ __u8 state;
+ __u8 nr_data_types;
+ __u8 pad[6];
+
+ __u32 bucket_size;
+ __u64 nr_buckets;
+
+ struct bch_ioctl_dev_usage_type d[];
};
/*
@@ -365,4 +389,24 @@ struct bch_ioctl_subvolume {
#define BCH_SUBVOL_SNAPSHOT_CREATE (1U << 0)
#define BCH_SUBVOL_SNAPSHOT_RO (1U << 1)
+/*
+ * BCH_IOCTL_FSCK_OFFLINE: run fsck from the 'bcachefs fsck' userspace command,
+ * but with the kernel's implementation of fsck:
+ */
+struct bch_ioctl_fsck_offline {
+ __u64 flags;
+ __u64 opts; /* string */
+ __u64 nr_devs;
+ __u64 devs[] __counted_by(nr_devs);
+};
+
+/*
+ * BCH_IOCTL_FSCK_ONLINE: run fsck from the 'bcachefs fsck' userspace command,
+ * but with the kernel's implementation of fsck:
+ */
+struct bch_ioctl_fsck_online {
+ __u64 flags;
+ __u64 opts; /* string */
+};
+
#endif /* _BCACHEFS_IOCTL_H */
diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c
index abdb05507d16..76e79a15ba08 100644
--- a/fs/bcachefs/bkey.c
+++ b/fs/bcachefs/bkey.c
@@ -33,7 +33,7 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out,
next_key_bits -= 64;
}
- bch2_prt_u64_binary(out, v, min(word_bits, nr_key_bits));
+ bch2_prt_u64_base2_nbits(out, v, min(word_bits, nr_key_bits));
if (!next_key_bits)
break;
diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c
index 761f5e33b1e6..5e52684764eb 100644
--- a/fs/bcachefs/bkey_methods.c
+++ b/fs/bcachefs/bkey_methods.c
@@ -63,8 +63,17 @@ static int key_type_cookie_invalid(struct bch_fs *c, struct bkey_s_c k,
return 0;
}
+static void key_type_cookie_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bkey_s_c k)
+{
+ struct bkey_s_c_cookie ck = bkey_s_c_to_cookie(k);
+
+ prt_printf(out, "%llu", le64_to_cpu(ck.v->cookie));
+}
+
#define bch2_bkey_ops_cookie ((struct bkey_ops) { \
.key_invalid = key_type_cookie_invalid, \
+ .val_to_text = key_type_cookie_to_text, \
.min_val_size = 8, \
})
diff --git a/fs/bcachefs/bkey_methods.h b/fs/bcachefs/bkey_methods.h
index 3a370b7087ac..03efe8ee565a 100644
--- a/fs/bcachefs/bkey_methods.h
+++ b/fs/bcachefs/bkey_methods.h
@@ -28,10 +28,8 @@ struct bkey_ops {
void (*swab)(struct bkey_s);
bool (*key_normalize)(struct bch_fs *, struct bkey_s);
bool (*key_merge)(struct bch_fs *, struct bkey_s, struct bkey_s_c);
- int (*trans_trigger)(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *, unsigned);
- int (*atomic_trigger)(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
+ int (*trigger)(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
void (*compat)(enum btree_id id, unsigned version,
unsigned big_endian, int write,
struct bkey_s);
@@ -78,84 +76,88 @@ static inline bool bch2_bkey_maybe_mergable(const struct bkey *l, const struct b
bool bch2_bkey_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
-static inline int bch2_mark_key(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
-
- return ops->atomic_trigger
- ? ops->atomic_trigger(trans, btree, level, old, new, flags)
- : 0;
-}
-
enum btree_update_flags {
__BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE = __BTREE_ITER_FLAGS_END,
__BTREE_UPDATE_NOJOURNAL,
- __BTREE_UPDATE_PREJOURNAL,
__BTREE_UPDATE_KEY_CACHE_RECLAIM,
- __BTREE_TRIGGER_NORUN, /* Don't run triggers at all */
-
+ __BTREE_TRIGGER_NORUN,
+ __BTREE_TRIGGER_TRANSACTIONAL,
+ __BTREE_TRIGGER_ATOMIC,
+ __BTREE_TRIGGER_GC,
__BTREE_TRIGGER_INSERT,
__BTREE_TRIGGER_OVERWRITE,
-
- __BTREE_TRIGGER_GC,
__BTREE_TRIGGER_BUCKET_INVALIDATE,
- __BTREE_TRIGGER_NOATOMIC,
};
#define BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE (1U << __BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE)
#define BTREE_UPDATE_NOJOURNAL (1U << __BTREE_UPDATE_NOJOURNAL)
-#define BTREE_UPDATE_PREJOURNAL (1U << __BTREE_UPDATE_PREJOURNAL)
#define BTREE_UPDATE_KEY_CACHE_RECLAIM (1U << __BTREE_UPDATE_KEY_CACHE_RECLAIM)
+/* Don't run triggers at all */
#define BTREE_TRIGGER_NORUN (1U << __BTREE_TRIGGER_NORUN)
+/*
+ * If set, we're running transactional triggers as part of a transaction commit:
+ * triggers may generate new updates
+ *
+ * If cleared, and either BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE are set,
+ * we're running atomic triggers during a transaction commit: we have our
+ * journal reservation, we're holding btree node write locks, and we know the
+ * transaction is going to commit (returning an error here is a fatal error,
+ * causing us to go emergency read-only)
+ */
+#define BTREE_TRIGGER_TRANSACTIONAL (1U << __BTREE_TRIGGER_TRANSACTIONAL)
+#define BTREE_TRIGGER_ATOMIC (1U << __BTREE_TRIGGER_ATOMIC)
+
+/* We're in gc/fsck: running triggers to recalculate e.g. disk usage */
+#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
+
+/* @new is entering the btree */
#define BTREE_TRIGGER_INSERT (1U << __BTREE_TRIGGER_INSERT)
+
+/* @old is leaving the btree */
#define BTREE_TRIGGER_OVERWRITE (1U << __BTREE_TRIGGER_OVERWRITE)
-#define BTREE_TRIGGER_GC (1U << __BTREE_TRIGGER_GC)
+/* signal from bucket invalidate path to alloc trigger */
#define BTREE_TRIGGER_BUCKET_INVALIDATE (1U << __BTREE_TRIGGER_BUCKET_INVALIDATE)
-#define BTREE_TRIGGER_NOATOMIC (1U << __BTREE_TRIGGER_NOATOMIC)
-static inline int bch2_trans_mark_key(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+static inline int bch2_key_trigger(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
- const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new->k.type);
+ const struct bkey_ops *ops = bch2_bkey_type_ops(old.k->type ?: new.k->type);
- return ops->trans_trigger
- ? ops->trans_trigger(trans, btree_id, level, old, new, flags)
+ return ops->trigger
+ ? ops->trigger(trans, btree, level, old, new, flags)
: 0;
}
-static inline int bch2_trans_mark_old(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, unsigned flags)
+static inline int bch2_key_trigger_old(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, unsigned flags)
{
struct bkey_i deleted;
bkey_init(&deleted.k);
deleted.k.p = old.k->p;
- return bch2_trans_mark_key(trans, btree_id, level, old, &deleted,
- BTREE_TRIGGER_OVERWRITE|flags);
+ return bch2_key_trigger(trans, btree_id, level, old, bkey_i_to_s(&deleted),
+ BTREE_TRIGGER_OVERWRITE|flags);
}
-static inline int bch2_trans_mark_new(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_i *new, unsigned flags)
+static inline int bch2_key_trigger_new(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s new, unsigned flags)
{
struct bkey_i deleted;
bkey_init(&deleted.k);
- deleted.k.p = new->k.p;
+ deleted.k.p = new.k->p;
- return bch2_trans_mark_key(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
- BTREE_TRIGGER_INSERT|flags);
+ return bch2_key_trigger(trans, btree_id, level, bkey_i_to_s_c(&deleted), new,
+ BTREE_TRIGGER_INSERT|flags);
}
void bch2_bkey_renumber(enum btree_node_type, struct bkey_packed *, int);
diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c
index bb73ba9017b0..3fd1085b6c61 100644
--- a/fs/bcachefs/bset.c
+++ b/fs/bcachefs/bset.c
@@ -68,6 +68,12 @@ void bch2_dump_bset(struct bch_fs *c, struct btree *b,
_k = _n) {
_n = bkey_p_next(_k);
+ if (!_k->u64s) {
+ printk(KERN_ERR "block %u key %5zu - u64s 0? aieee!\n", set,
+ _k->_data - i->_data);
+ break;
+ }
+
k = bkey_disassemble(b, _k, &uk);
printbuf_reset(&buf);
@@ -714,7 +720,7 @@ static noinline void __build_ro_aux_tree(struct btree *b, struct bset_tree *t)
{
struct bkey_packed *prev = NULL, *k = btree_bkey_first(b, t);
struct bkey_i min_key, max_key;
- unsigned j, cacheline = 1;
+ unsigned cacheline = 1;
t->size = min(bkey_to_cacheline(b, t, btree_bkey_last(b, t)),
bset_ro_tree_capacity(b, t));
@@ -817,13 +823,12 @@ void bch2_bset_init_first(struct btree *b, struct bset *i)
set_btree_bset(b, t, i);
}
-void bch2_bset_init_next(struct bch_fs *c, struct btree *b,
- struct btree_node_entry *bne)
+void bch2_bset_init_next(struct btree *b, struct btree_node_entry *bne)
{
struct bset *i = &bne->keys;
struct bset_tree *t;
- BUG_ON(bset_byte_offset(b, bne) >= btree_bytes(c));
+ BUG_ON(bset_byte_offset(b, bne) >= btree_buf_bytes(b));
BUG_ON((void *) bne < (void *) btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(b->nsets >= MAX_BSETS);
diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h
index 632c2b8c5460..79c77baaa383 100644
--- a/fs/bcachefs/bset.h
+++ b/fs/bcachefs/bset.h
@@ -264,8 +264,7 @@ static inline struct bset *bset_next_set(struct btree *b,
void bch2_btree_keys_init(struct btree *);
void bch2_bset_init_first(struct btree *, struct bset *);
-void bch2_bset_init_next(struct bch_fs *, struct btree *,
- struct btree_node_entry *);
+void bch2_bset_init_next(struct btree *, struct btree_node_entry *);
void bch2_bset_build_aux_tree(struct btree *, struct bset_tree *, bool);
void bch2_bset_insert(struct btree *, struct btree_node_iter *,
diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c
index 79495cd7a794..d7c81beac14a 100644
--- a/fs/bcachefs/btree_cache.c
+++ b/fs/bcachefs/btree_cache.c
@@ -60,7 +60,7 @@ static void btree_node_data_free(struct bch_fs *c, struct btree *b)
clear_btree_node_just_written(b);
- kvpfree(b->data, btree_bytes(c));
+ kvpfree(b->data, btree_buf_bytes(b));
b->data = NULL;
#ifdef __KERNEL__
kvfree(b->aux_data);
@@ -94,7 +94,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
{
BUG_ON(b->data || b->aux_data);
- b->data = kvpmalloc(btree_bytes(c), gfp);
+ b->data = kvpmalloc(btree_buf_bytes(b), gfp);
if (!b->data)
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
#ifdef __KERNEL__
@@ -107,7 +107,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp)
b->aux_data = NULL;
#endif
if (!b->aux_data) {
- kvpfree(b->data, btree_bytes(c));
+ kvpfree(b->data, btree_buf_bytes(b));
b->data = NULL;
return -BCH_ERR_ENOMEM_btree_node_mem_alloc;
}
@@ -126,7 +126,7 @@ static struct btree *__btree_node_mem_alloc(struct bch_fs *c, gfp_t gfp)
bkey_btree_ptr_init(&b->key);
INIT_LIST_HEAD(&b->list);
INIT_LIST_HEAD(&b->write_blocked);
- b->byte_order = ilog2(btree_bytes(c));
+ b->byte_order = ilog2(c->opts.btree_node_size);
return b;
}
@@ -408,7 +408,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c)
if (c->verify_data)
list_move(&c->verify_data->list, &bc->live);
- kvpfree(c->verify_ondisk, btree_bytes(c));
+ kvpfree(c->verify_ondisk, c->opts.btree_node_size);
for (i = 0; i < btree_id_nr_alive(c); i++) {
struct btree_root *r = bch2_btree_id_root(c, i);
@@ -500,19 +500,21 @@ void bch2_fs_btree_cache_init_early(struct btree_cache *bc)
* cannibalize_bucket() will take. This means every time we unlock the root of
* the btree, we need to release this lock if we have it held.
*/
-void bch2_btree_cache_cannibalize_unlock(struct bch_fs *c)
+void bch2_btree_cache_cannibalize_unlock(struct btree_trans *trans)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
if (bc->alloc_lock == current) {
- trace_and_count(c, btree_cache_cannibalize_unlock, c);
+ trace_and_count(c, btree_cache_cannibalize_unlock, trans);
bc->alloc_lock = NULL;
closure_wake_up(&bc->alloc_wait);
}
}
-int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
+int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure *cl)
{
+ struct bch_fs *c = trans->c;
struct btree_cache *bc = &c->btree_cache;
struct task_struct *old;
@@ -521,7 +523,7 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
if (!cl) {
- trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock;
}
@@ -535,11 +537,11 @@ int bch2_btree_cache_cannibalize_lock(struct bch_fs *c, struct closure *cl)
goto success;
}
- trace_and_count(c, btree_cache_cannibalize_lock_fail, c);
+ trace_and_count(c, btree_cache_cannibalize_lock_fail, trans);
return -BCH_ERR_btree_cache_cannibalize_lock_blocked;
success:
- trace_and_count(c, btree_cache_cannibalize_lock, c);
+ trace_and_count(c, btree_cache_cannibalize_lock, trans);
return 0;
}
@@ -673,7 +675,7 @@ err:
mutex_unlock(&bc->lock);
- trace_and_count(c, btree_cache_cannibalize, c);
+ trace_and_count(c, btree_cache_cannibalize, trans);
goto out;
}
@@ -717,12 +719,6 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
if (IS_ERR(b))
return b;
- /*
- * Btree nodes read in from disk should not have the accessed bit set
- * initially, so that linear scans don't thrash the cache:
- */
- clear_btree_node_accessed(b);
-
bkey_copy(&b->key, k);
if (bch2_btree_node_hash_insert(bc, b, level, btree_id)) {
/* raced with another fill: */
@@ -749,7 +745,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans,
if (path && sync)
bch2_trans_unlock_noassert(trans);
- bch2_btree_node_read(c, b, sync);
+ bch2_btree_node_read(trans, b, sync);
if (!sync)
return NULL;
@@ -1039,7 +1035,7 @@ retry:
goto retry;
if (IS_ERR(b) &&
- !bch2_btree_cache_cannibalize_lock(c, NULL))
+ !bch2_btree_cache_cannibalize_lock(trans, NULL))
goto retry;
if (IS_ERR(b))
@@ -1087,7 +1083,7 @@ lock_node:
EBUG_ON(BTREE_NODE_LEVEL(b->data) != level);
btree_check_header(c, b);
out:
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
return b;
}
@@ -1196,7 +1192,7 @@ void bch2_btree_node_to_text(struct printbuf *out, struct bch_fs *c, const struc
" failed unpacked %zu\n",
b->unpack_fn_len,
b->nr.live_u64s * sizeof(u64),
- btree_bytes(c) - sizeof(struct btree_node),
+ btree_buf_bytes(b) - sizeof(struct btree_node),
b->nr.live_u64s * 100 / btree_max_u64s(c),
b->sib_u64s[0],
b->sib_u64s[1],
diff --git a/fs/bcachefs/btree_cache.h b/fs/bcachefs/btree_cache.h
index cfb80b201d61..6d33885fdbde 100644
--- a/fs/bcachefs/btree_cache.h
+++ b/fs/bcachefs/btree_cache.h
@@ -17,8 +17,8 @@ int __bch2_btree_node_hash_insert(struct btree_cache *, struct btree *);
int bch2_btree_node_hash_insert(struct btree_cache *, struct btree *,
unsigned, enum btree_id);
-void bch2_btree_cache_cannibalize_unlock(struct bch_fs *);
-int bch2_btree_cache_cannibalize_lock(struct bch_fs *, struct closure *);
+void bch2_btree_cache_cannibalize_unlock(struct btree_trans *);
+int bch2_btree_cache_cannibalize_lock(struct btree_trans *, struct closure *);
struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *);
struct btree *bch2_btree_node_mem_alloc(struct btree_trans *, bool);
@@ -74,22 +74,27 @@ static inline bool btree_node_hashed(struct btree *b)
_iter = 0; _iter < (_tbl)->size; _iter++) \
rht_for_each_entry_rcu((_b), (_pos), _tbl, _iter, hash)
-static inline size_t btree_bytes(struct bch_fs *c)
+static inline size_t btree_buf_bytes(const struct btree *b)
{
- return c->opts.btree_node_size;
+ return 1UL << b->byte_order;
}
-static inline size_t btree_max_u64s(struct bch_fs *c)
+static inline size_t btree_buf_max_u64s(const struct btree *b)
{
- return (btree_bytes(c) - sizeof(struct btree_node)) / sizeof(u64);
+ return (btree_buf_bytes(b) - sizeof(struct btree_node)) / sizeof(u64);
}
-static inline size_t btree_pages(struct bch_fs *c)
+static inline size_t btree_max_u64s(const struct bch_fs *c)
{
- return btree_bytes(c) / PAGE_SIZE;
+ return (c->opts.btree_node_size - sizeof(struct btree_node)) / sizeof(u64);
}
-static inline unsigned btree_blocks(struct bch_fs *c)
+static inline size_t btree_sectors(const struct bch_fs *c)
+{
+ return c->opts.btree_node_size >> SECTOR_SHIFT;
+}
+
+static inline unsigned btree_blocks(const struct bch_fs *c)
{
return btree_sectors(c) >> c->block_bits;
}
diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c
index 30ab78a24517..1102995643b1 100644
--- a/fs/bcachefs/btree_gc.c
+++ b/fs/bcachefs/btree_gc.c
@@ -41,6 +41,14 @@
#define DROP_THIS_NODE 10
#define DROP_PREV_NODE 11
+static struct bkey_s unsafe_bkey_s_c_to_s(struct bkey_s_c k)
+{
+ return (struct bkey_s) {{{
+ (struct bkey *) k.k,
+ (struct bch_val *) k.v
+ }}};
+}
+
static bool should_restart_for_topology_repair(struct bch_fs *c)
{
return c->opts.fix_errors != FSCK_FIX_no &&
@@ -108,7 +116,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
goto err;
} else {
- set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
}
}
}
@@ -134,7 +142,7 @@ static int bch2_gc_check_topology(struct bch_fs *c,
ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology);
goto err;
} else {
- set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
}
}
@@ -414,10 +422,9 @@ again:
continue;
}
- if (ret) {
- bch_err_msg(c, ret, "getting btree node");
+ bch_err_msg(c, ret, "getting btree node");
+ if (ret)
break;
- }
ret = btree_repair_node_boundaries(c, b, prev, cur);
@@ -482,10 +489,9 @@ again:
false);
ret = PTR_ERR_OR_ZERO(cur);
- if (ret) {
- bch_err_msg(c, ret, "getting btree node");
+ bch_err_msg(c, ret, "getting btree node");
+ if (ret)
goto err;
- }
ret = bch2_btree_repair_topology_recurse(trans, cur);
six_unlock_read(&cur->c.lock);
@@ -591,7 +597,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s ptr gen %u missing in alloc btree\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@@ -609,7 +615,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s ptr gen in the future: %u > %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf)))) {
@@ -619,7 +625,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
g->data_type = 0;
g->dirty_sectors = 0;
g->cached_sectors = 0;
- set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ set_bit(BCH_FS_need_another_gc, &c->flags);
} else {
do_update = true;
}
@@ -631,7 +637,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr), g->gen,
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@@ -643,7 +649,7 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu data type %s stale dirty ptr: %u < %u\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[ptr_data_type(k->k, &p.ptr)],
+ bch2_data_type_str(ptr_data_type(k->k, &p.ptr)),
p.ptr.gen, g->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))))
@@ -658,13 +664,13 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
"bucket %u:%zu different types of data in same bucket: %s, %s\n"
"while marking %s",
p.ptr.dev, PTR_BUCKET_NR(ca, &p.ptr),
- bch2_data_types[g->data_type],
- bch2_data_types[data_type],
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, *k), buf.buf))) {
if (data_type == BCH_DATA_btree) {
g->data_type = data_type;
- set_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ set_bit(BCH_FS_need_another_gc, &c->flags);
} else {
do_update = true;
}
@@ -707,8 +713,8 @@ static int bch2_check_fix_ptrs(struct btree_trans *trans, enum btree_id btree_id
new = kmalloc(bkey_bytes(k->k), GFP_KERNEL);
if (!new) {
- bch_err_msg(c, ret, "allocating new key");
ret = -BCH_ERR_ENOMEM_gc_repair_key;
+ bch_err_msg(c, ret, "allocating new key");
goto err;
}
@@ -807,9 +813,6 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
struct bch_fs *c = trans->c;
struct bkey deleted = KEY(0, 0, 0);
struct bkey_s_c old = (struct bkey_s_c) { &deleted, NULL };
- unsigned flags =
- BTREE_TRIGGER_GC|
- (initial ? BTREE_TRIGGER_NOATOMIC : 0);
int ret = 0;
deleted.p = k->k->p;
@@ -831,11 +834,10 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id,
}
ret = commit_do(trans, NULL, NULL, 0,
- bch2_mark_key(trans, btree_id, level, old, *k, flags));
+ bch2_key_trigger(trans, btree_id, level, old, unsafe_bkey_s_c_to_s(*k), BTREE_TRIGGER_GC));
fsck_err:
err:
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -996,7 +998,7 @@ static int bch2_gc_btree_init_recurse(struct btree_trans *trans, struct btree *b
/* Continue marking when opted to not
* fix the error: */
ret = 0;
- set_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags);
+ set_bit(BCH_FS_initial_gc_unfixed, &c->flags);
continue;
}
} else if (ret) {
@@ -1068,8 +1070,7 @@ static int bch2_gc_btree_init(struct btree_trans *trans,
fsck_err:
six_unlock_read(&b->c.lock);
- if (ret < 0)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
printbuf_exit(&buf);
return ret;
}
@@ -1105,10 +1106,8 @@ static int bch2_gc_btrees(struct bch_fs *c, bool initial, bool metadata_only)
: bch2_gc_btree(trans, i, initial, metadata_only);
}
- if (ret < 0)
- bch_err_fn(c, ret);
-
bch2_trans_put(trans);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1159,13 +1158,10 @@ static void bch2_mark_dev_superblock(struct bch_fs *c, struct bch_dev *ca,
static void bch2_mark_superblocks(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
mutex_lock(&c->sb_lock);
gc_pos_set(c, gc_phase(GC_PHASE_SB));
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
bch2_mark_dev_superblock(c, ca, BTREE_TRIGGER_GC);
mutex_unlock(&c->sb_lock);
}
@@ -1190,13 +1186,10 @@ static void bch2_mark_pending_btree_node_frees(struct bch_fs *c)
static void bch2_gc_free(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
genradix_free(&c->reflink_gc_table);
genradix_free(&c->gc_stripes);
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
kvpfree(rcu_dereference_protected(ca->buckets_gc, 1),
sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket));
@@ -1218,7 +1211,7 @@ static int bch2_gc_done(struct bch_fs *c,
bool verify = !metadata_only &&
!c->opts.reconstruct_alloc &&
(!initial || (c->sb.compat & (1ULL << BCH_COMPAT_alloc_info)));
- unsigned i, dev;
+ unsigned i;
int ret = 0;
percpu_down_write(&c->mark_lock);
@@ -1230,14 +1223,14 @@ static int bch2_gc_done(struct bch_fs *c,
, ##__VA_ARGS__, dst->_f, src->_f))) \
dst->_f = src->_f
#define copy_dev_field(_err, _f, _msg, ...) \
- copy_field(_err, _f, "dev %u has wrong " _msg, dev, ##__VA_ARGS__)
+ copy_field(_err, _f, "dev %u has wrong " _msg, ca->dev_idx, ##__VA_ARGS__)
#define copy_fs_field(_err, _f, _msg, ...) \
copy_field(_err, _f, "fs has wrong " _msg, ##__VA_ARGS__)
for (i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
- for_each_member_device(ca, c, dev) {
+ __for_each_member_device(c, ca) {
struct bch_dev_usage *dst = ca->usage_base;
struct bch_dev_usage *src = (void *)
bch2_acc_percpu_u64s((u64 __percpu *) ca->usage_gc,
@@ -1245,15 +1238,12 @@ static int bch2_gc_done(struct bch_fs *c,
for (i = 0; i < BCH_DATA_NR; i++) {
copy_dev_field(dev_usage_buckets_wrong,
- d[i].buckets, "%s buckets", bch2_data_types[i]);
+ d[i].buckets, "%s buckets", bch2_data_type_str(i));
copy_dev_field(dev_usage_sectors_wrong,
- d[i].sectors, "%s sectors", bch2_data_types[i]);
+ d[i].sectors, "%s sectors", bch2_data_type_str(i));
copy_dev_field(dev_usage_fragmented_wrong,
- d[i].fragmented, "%s fragmented", bch2_data_types[i]);
+ d[i].fragmented, "%s fragmented", bch2_data_type_str(i));
}
-
- copy_dev_field(dev_usage_buckets_ec_wrong,
- buckets_ec, "buckets_ec");
}
{
@@ -1263,19 +1253,19 @@ static int bch2_gc_done(struct bch_fs *c,
bch2_acc_percpu_u64s((u64 __percpu *) c->usage_gc, nr);
copy_fs_field(fs_usage_hidden_wrong,
- hidden, "hidden");
+ b.hidden, "hidden");
copy_fs_field(fs_usage_btree_wrong,
- btree, "btree");
+ b.btree, "btree");
if (!metadata_only) {
copy_fs_field(fs_usage_data_wrong,
- data, "data");
+ b.data, "data");
copy_fs_field(fs_usage_cached_wrong,
- cached, "cached");
+ b.cached, "cached");
copy_fs_field(fs_usage_reserved_wrong,
- reserved, "reserved");
+ b.reserved, "reserved");
copy_fs_field(fs_usage_nr_inodes_wrong,
- nr_inodes,"nr_inodes");
+ b.nr_inodes,"nr_inodes");
for (i = 0; i < BCH_REPLICAS_MAX; i++)
copy_fs_field(fs_usage_persistent_reserved_wrong,
@@ -1284,7 +1274,7 @@ static int bch2_gc_done(struct bch_fs *c,
}
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
if (metadata_only &&
@@ -1307,8 +1297,7 @@ static int bch2_gc_done(struct bch_fs *c,
fsck_err:
if (ca)
percpu_ref_put(&ca->ref);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
percpu_up_write(&c->mark_lock);
printbuf_exit(&buf);
@@ -1317,9 +1306,6 @@ fsck_err:
static int bch2_gc_start(struct bch_fs *c)
{
- struct bch_dev *ca = NULL;
- unsigned i;
-
BUG_ON(c->usage_gc);
c->usage_gc = __alloc_percpu_gfp(fs_usage_u64s(c) * sizeof(u64),
@@ -1329,7 +1315,7 @@ static int bch2_gc_start(struct bch_fs *c)
return -BCH_ERR_ENOMEM_gc_start;
}
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
BUG_ON(ca->usage_gc);
ca->usage_gc = alloc_percpu(struct bch_dev_usage);
@@ -1348,10 +1334,7 @@ static int bch2_gc_start(struct bch_fs *c)
static int bch2_gc_reset(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
free_percpu(ca->usage_gc);
ca->usage_gc = NULL;
}
@@ -1389,9 +1372,6 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
enum bch_data_type type;
int ret;
- if (bkey_ge(iter->pos, POS(ca->dev_idx, ca->mi.nbuckets)))
- return 1;
-
old = bch2_alloc_to_v4(k, &old_convert);
new = *old;
@@ -1437,8 +1417,8 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
": got %s, should be %s",
iter->pos.inode, iter->pos.offset,
gc.gen,
- bch2_data_types[new.data_type],
- bch2_data_types[gc.data_type]))
+ bch2_data_type_str(new.data_type),
+ bch2_data_type_str(gc.data_type)))
new.data_type = gc.data_type;
#define copy_bucket_field(_errtype, _f) \
@@ -1448,7 +1428,7 @@ static int bch2_alloc_write_key(struct btree_trans *trans,
": got %u, should be %u", \
iter->pos.inode, iter->pos.offset, \
gc.gen, \
- bch2_data_types[gc.data_type], \
+ bch2_data_type_str(gc.data_type), \
new._f, gc._f)) \
new._f = gc._f; \
@@ -1488,52 +1468,36 @@ fsck_err:
static int bch2_gc_alloc_done(struct bch_fs *c, bool metadata_only)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_dev *ca;
- unsigned i;
int ret = 0;
- for_each_member_device(ca, c, i) {
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS(ca->dev_idx, ca->mi.first_bucket),
- BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW,
- bch2_alloc_write_key(trans, &iter, k, metadata_only));
-
- if (ret < 0) {
- bch_err_fn(c, ret);
+ for_each_member_device(c, ca) {
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto_commit(trans, iter, BTREE_ID_alloc,
+ POS(ca->dev_idx, ca->mi.first_bucket),
+ POS(ca->dev_idx, ca->mi.nbuckets - 1),
+ BTREE_ITER_SLOTS|BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
+ bch2_alloc_write_key(trans, &iter, k, metadata_only)));
+ if (ret) {
percpu_ref_put(&ca->ref);
break;
}
}
- bch2_trans_put(trans);
- return ret < 0 ? ret : 0;
+ bch_err_fn(c, ret);
+ return ret;
}
static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
{
- struct bch_dev *ca;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bucket *g;
- struct bch_alloc_v4 a_convert;
- const struct bch_alloc_v4 *a;
- unsigned i;
- int ret;
-
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bucket_array *buckets = kvpmalloc(sizeof(struct bucket_array) +
ca->mi.nbuckets * sizeof(struct bucket),
GFP_KERNEL|__GFP_ZERO);
if (!buckets) {
percpu_ref_put(&ca->ref);
bch_err(c, "error allocating ca->buckets[gc]");
- ret = -BCH_ERR_ENOMEM_gc_alloc_start;
- goto err;
+ return -BCH_ERR_ENOMEM_gc_alloc_start;
}
buckets->first_bucket = ca->mi.first_bucket;
@@ -1541,42 +1505,38 @@ static int bch2_gc_alloc_start(struct bch_fs *c, bool metadata_only)
rcu_assign_pointer(ca->buckets_gc, buckets);
}
- ret = for_each_btree_key2(trans, iter, BTREE_ID_alloc, POS_MIN,
- BTREE_ITER_PREFETCH, k, ({
- ca = bch_dev_bkey_exists(c, k.k->p.inode);
- g = gc_bucket(ca, k.k->p.offset);
-
- a = bch2_alloc_to_v4(k, &a_convert);
-
- g->gen_valid = 1;
- g->gen = a->gen;
-
- if (metadata_only &&
- (a->data_type == BCH_DATA_user ||
- a->data_type == BCH_DATA_cached ||
- a->data_type == BCH_DATA_parity)) {
- g->data_type = a->data_type;
- g->dirty_sectors = a->dirty_sectors;
- g->cached_sectors = a->cached_sectors;
- g->stripe = a->stripe;
- g->stripe_redundancy = a->stripe_redundancy;
- }
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
+ struct bch_dev *ca = bch_dev_bkey_exists(c, k.k->p.inode);
+ struct bucket *g = gc_bucket(ca, k.k->p.offset);
- 0;
- }));
-err:
- bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ struct bch_alloc_v4 a_convert;
+ const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &a_convert);
+
+ g->gen_valid = 1;
+ g->gen = a->gen;
+
+ if (metadata_only &&
+ (a->data_type == BCH_DATA_user ||
+ a->data_type == BCH_DATA_cached ||
+ a->data_type == BCH_DATA_parity)) {
+ g->data_type = a->data_type;
+ g->dirty_sectors = a->dirty_sectors;
+ g->cached_sectors = a->cached_sectors;
+ g->stripe = a->stripe;
+ g->stripe_redundancy = a->stripe_redundancy;
+ }
+
+ 0;
+ })));
+ bch_err_fn(c, ret);
return ret;
}
static void bch2_gc_alloc_reset(struct bch_fs *c, bool metadata_only)
{
- struct bch_dev *ca;
- unsigned i;
-
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bucket_array *buckets = gc_bucket_array(ca);
struct bucket *g;
@@ -1634,7 +1594,7 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans,
if (!r->refcount)
new->k.type = KEY_TYPE_deleted;
else
- *bkey_refcount(new) = cpu_to_le64(r->refcount);
+ *bkey_refcount(bkey_i_to_s(new)) = cpu_to_le64(r->refcount);
}
fsck_err:
printbuf_exit(&buf);
@@ -1643,64 +1603,52 @@ fsck_err:
static int bch2_gc_reflink_done(struct bch_fs *c, bool metadata_only)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
size_t idx = 0;
- int ret = 0;
if (metadata_only)
return 0;
- trans = bch2_trans_get(c);
-
- ret = for_each_btree_key_commit(trans, iter,
- BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
- bch2_gc_write_reflink_key(trans, &iter, k, &idx));
-
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_gc_write_reflink_key(trans, &iter, k, &idx)));
c->reflink_gc_nr = 0;
- bch2_trans_put(trans);
return ret;
}
static int bch2_gc_reflink_start(struct bch_fs *c,
bool metadata_only)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct reflink_gc *r;
- int ret = 0;
if (metadata_only)
return 0;
- trans = bch2_trans_get(c);
c->reflink_gc_nr = 0;
- for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- const __le64 *refcount = bkey_refcount_c(k);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_reflink, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
+ const __le64 *refcount = bkey_refcount_c(k);
- if (!refcount)
- continue;
+ if (!refcount)
+ continue;
- r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++,
- GFP_KERNEL);
- if (!r) {
- ret = -BCH_ERR_ENOMEM_gc_reflink_start;
- break;
- }
+ struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table,
+ c->reflink_gc_nr++, GFP_KERNEL);
+ if (!r) {
+ ret = -BCH_ERR_ENOMEM_gc_reflink_start;
+ break;
+ }
- r->offset = k.k->p.offset;
- r->size = k.k->size;
- r->refcount = 0;
- }
- bch2_trans_iter_exit(trans, &iter);
+ r->offset = k.k->p.offset;
+ r->size = k.k->size;
+ r->refcount = 0;
+ 0;
+ })));
- bch2_trans_put(trans);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1768,24 +1716,15 @@ fsck_err:
static int bch2_gc_stripes_done(struct bch_fs *c, bool metadata_only)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
if (metadata_only)
return 0;
- trans = bch2_trans_get(c);
-
- ret = for_each_btree_key_commit(trans, iter,
- BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
- bch2_gc_write_stripes_key(trans, &iter, k));
-
- bch2_trans_put(trans);
- return ret;
+ return bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter,
+ BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ bch2_gc_write_stripes_key(trans, &iter, k)));
}
static void bch2_gc_stripes_reset(struct bch_fs *c, bool metadata_only)
@@ -1848,7 +1787,7 @@ again:
#endif
c->gc_count++;
- if (test_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags) ||
+ if (test_bit(BCH_FS_need_another_gc, &c->flags) ||
(!iter && bch2_test_restart_gc)) {
if (iter++ > 2) {
bch_info(c, "Unable to fix bucket gens, looping");
@@ -1860,7 +1799,7 @@ again:
* XXX: make sure gens we fixed got saved
*/
bch_info(c, "Second GC pass needed, restarting:");
- clear_bit(BCH_FS_NEED_ANOTHER_GC, &c->flags);
+ clear_bit(BCH_FS_need_another_gc, &c->flags);
__gc_pos_set(c, gc_phase(GC_PHASE_NOT_RUNNING));
bch2_gc_stripes_reset(c, metadata_only);
@@ -1900,9 +1839,7 @@ out:
* allocator thread - issue wakeup in case they blocked on gc_lock:
*/
closure_wake_up(&c->freelist_wait);
-
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1912,7 +1849,6 @@ static int gc_btree_gens_key(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
struct bkey_i *u;
int ret;
@@ -1970,12 +1906,7 @@ static int bch2_alloc_write_oldest_gen(struct btree_trans *trans, struct btree_i
int bch2_gc_gens(struct bch_fs *c)
{
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_dev *ca;
u64 b, start_time = local_clock();
- unsigned i;
int ret;
/*
@@ -1988,9 +1919,8 @@ int bch2_gc_gens(struct bch_fs *c)
trace_and_count(c, gc_gens_start, c);
down_read(&c->gc_lock);
- trans = bch2_trans_get(c);
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bucket_gens *gens = bucket_gens(ca);
BUG_ON(ca->oldest_gen);
@@ -2007,33 +1937,31 @@ int bch2_gc_gens(struct bch_fs *c)
ca->oldest_gen[b] = gens->b[b];
}
- for (i = 0; i < BTREE_ID_NR; i++)
+ for (unsigned i = 0; i < BTREE_ID_NR; i++)
if (btree_type_has_ptrs(i)) {
c->gc_gens_btree = i;
c->gc_gens_pos = POS_MIN;
- ret = for_each_btree_key_commit(trans, iter, i,
- POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
- k,
- NULL, NULL,
- BTREE_INSERT_NOFAIL,
- gc_btree_gens_key(trans, &iter, k));
- if (ret && !bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, i,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ gc_btree_gens_key(trans, &iter, k)));
if (ret)
goto err;
}
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
- POS_MIN,
- BTREE_ITER_PREFETCH,
- k,
- NULL, NULL,
- BTREE_INSERT_NOFAIL,
- bch2_alloc_write_oldest_gen(trans, &iter, k));
- if (ret && !bch2_err_matches(ret, EROFS))
- bch_err_fn(c, ret);
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_alloc,
+ POS_MIN,
+ BTREE_ITER_PREFETCH,
+ k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ bch2_alloc_write_oldest_gen(trans, &iter, k)));
if (ret)
goto err;
@@ -2045,14 +1973,15 @@ int bch2_gc_gens(struct bch_fs *c)
bch2_time_stats_update(&c->times[BCH_TIME_btree_gc], start_time);
trace_and_count(c, gc_gens_end, c);
err:
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
kvfree(ca->oldest_gen);
ca->oldest_gen = NULL;
}
- bch2_trans_put(trans);
up_read(&c->gc_lock);
mutex_unlock(&c->gc_gens_lock);
+ if (!bch2_err_matches(ret, EROFS))
+ bch_err_fn(c, ret);
return ret;
}
@@ -2062,7 +1991,6 @@ static int bch2_gc_thread(void *arg)
struct io_clock *clock = &c->io_clock[WRITE];
unsigned long last = atomic64_read(&clock->now);
unsigned last_kick = atomic_read(&c->kick_gc);
- int ret;
set_freezable();
@@ -2102,11 +2030,8 @@ static int bch2_gc_thread(void *arg)
#if 0
ret = bch2_gc(c, false, false);
#else
- ret = bch2_gc_gens(c);
+ bch2_gc_gens(c);
#endif
- if (ret < 0)
- bch_err_fn(c, ret);
-
debug_check_no_locks_held();
}
diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c
index 5a720f0cd5a6..aa9b6cbe3226 100644
--- a/fs/bcachefs/btree_io.c
+++ b/fs/bcachefs/btree_io.c
@@ -112,7 +112,7 @@ static void *btree_bounce_alloc(struct bch_fs *c, size_t size,
unsigned flags = memalloc_nofs_save();
void *p;
- BUG_ON(size > btree_bytes(c));
+ BUG_ON(size > c->opts.btree_node_size);
*used_mempool = false;
p = vpmalloc(size, __GFP_NOWARN|GFP_NOWAIT);
@@ -174,8 +174,8 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
ptrs = ptrs_end = ((void *) new_whiteouts + bytes);
- for (k = unwritten_whiteouts_start(c, b);
- k != unwritten_whiteouts_end(c, b);
+ for (k = unwritten_whiteouts_start(b);
+ k != unwritten_whiteouts_end(b);
k = bkey_p_next(k))
*--ptrs = k;
@@ -192,7 +192,7 @@ static void bch2_sort_whiteouts(struct bch_fs *c, struct btree *b)
verify_no_dups(b, new_whiteouts,
(void *) ((u64 *) new_whiteouts + b->whiteout_u64s));
- memcpy_u64s(unwritten_whiteouts_start(c, b),
+ memcpy_u64s(unwritten_whiteouts_start(b),
new_whiteouts, b->whiteout_u64s);
btree_bounce_free(c, bytes, used_mempool, new_whiteouts);
@@ -313,7 +313,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
}
bytes = sorting_entire_node
- ? btree_bytes(c)
+ ? btree_buf_bytes(b)
: __vstruct_bytes(struct btree_node, u64s);
out = btree_bounce_alloc(c, bytes, &used_mempool);
@@ -338,7 +338,7 @@ static void btree_node_sort(struct bch_fs *c, struct btree *b,
if (sorting_entire_node) {
u64s = le16_to_cpu(out->keys.u64s);
- BUG_ON(bytes != btree_bytes(c));
+ BUG_ON(bytes != btree_buf_bytes(b));
/*
* Our temporary buffer is the same size as the btree node's
@@ -502,7 +502,7 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(c, b, bne);
+ bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);
@@ -524,7 +524,8 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c,
prt_printf(out, "at btree ");
bch2_btree_pos_to_text(out, c, b);
- prt_printf(out, "\n node offset %u", b->written);
+ prt_printf(out, "\n node offset %u/%u",
+ b->written, btree_ptr_sectors_written(&b->key));
if (i)
prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s));
prt_str(out, ": ");
@@ -830,6 +831,23 @@ static int bset_key_invalid(struct bch_fs *c, struct btree *b,
(rw == WRITE ? bch2_bkey_val_invalid(c, k, READ, err) : 0);
}
+static bool __bkey_valid(struct bch_fs *c, struct btree *b,
+ struct bset *i, struct bkey_packed *k)
+{
+ if (bkey_p_next(k) > vstruct_last(i))
+ return false;
+
+ if (k->format > KEY_FORMAT_CURRENT)
+ return false;
+
+ struct printbuf buf = PRINTBUF;
+ struct bkey tmp;
+ struct bkey_s u = __bkey_disassemble(b, k, &tmp);
+ bool ret = __bch2_bkey_invalid(c, u.s_c, btree_node_type(b), READ, &buf);
+ printbuf_exit(&buf);
+ return ret;
+}
+
static int validate_bset_keys(struct bch_fs *c, struct btree *b,
struct bset *i, int write,
bool have_retry, bool *saw_error)
@@ -845,6 +863,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
k != vstruct_last(i);) {
struct bkey_s u;
struct bkey tmp;
+ unsigned next_good_key;
if (btree_err_on(bkey_p_next(k) > vstruct_last(i),
-BCH_ERR_btree_node_read_err_fixable,
@@ -859,12 +878,8 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_bad_format,
- "invalid bkey format %u", k->format)) {
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_p_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- continue;
- }
+ "invalid bkey format %u", k->format))
+ goto drop_this_key;
/* XXX: validate k->u64s */
if (!write)
@@ -885,11 +900,7 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
c, NULL, b, i,
btree_node_bad_bkey,
"invalid bkey: %s", buf.buf);
-
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_p_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- continue;
+ goto drop_this_key;
}
if (write)
@@ -906,21 +917,45 @@ static int validate_bset_keys(struct bch_fs *c, struct btree *b,
prt_printf(&buf, " > ");
bch2_bkey_to_text(&buf, u.k);
- bch2_dump_bset(c, b, i, 0);
-
if (btree_err(-BCH_ERR_btree_node_read_err_fixable,
c, NULL, b, i,
btree_node_bkey_out_of_order,
- "%s", buf.buf)) {
- i->u64s = cpu_to_le16(le16_to_cpu(i->u64s) - k->u64s);
- memmove_u64s_down(k, bkey_p_next(k),
- (u64 *) vstruct_end(i) - (u64 *) k);
- continue;
- }
+ "%s", buf.buf))
+ goto drop_this_key;
}
prev = k;
k = bkey_p_next(k);
+ continue;
+drop_this_key:
+ next_good_key = k->u64s;
+
+ if (!next_good_key ||
+ (BSET_BIG_ENDIAN(i) == CPU_BIG_ENDIAN &&
+ version >= bcachefs_metadata_version_snapshot)) {
+ /*
+ * only do scanning if bch2_bkey_compat() has nothing to
+ * do
+ */
+
+ if (!__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key))) {
+ for (next_good_key = 1;
+ next_good_key < (u64 *) vstruct_last(i) - (u64 *) k;
+ next_good_key++)
+ if (__bkey_valid(c, b, i, (void *) ((u64 *) k + next_good_key)))
+ goto got_good_key;
+
+ }
+
+ /*
+ * didn't find a good key, have to truncate the rest of
+ * the bset
+ */
+ next_good_key = (u64 *) vstruct_last(i) - (u64 *) k;
+ }
+got_good_key:
+ le16_add_cpu(&i->u64s, -next_good_key);
+ memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k);
}
fsck_err:
printbuf_exit(&buf);
@@ -934,7 +969,6 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
struct sort_iter *iter;
struct btree_node *sorted;
struct bkey_packed *k;
- struct bch_extent_ptr *ptr;
struct bset *i;
bool used_mempool, blacklisted;
bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 &&
@@ -943,6 +977,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
unsigned ptr_written = btree_ptr_sectors_written(&b->key);
struct printbuf buf = PRINTBUF;
int ret = 0, retry_read = 0, write = READ;
+ u64 start_time = local_clock();
b->version_ondisk = U16_MAX;
/* We might get called multiple times on read retry: */
@@ -968,12 +1003,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
struct bch_btree_ptr_v2 *bp =
&bkey_i_to_btree_ptr_v2(&b->key)->v;
+ bch2_bpos_to_text(&buf, b->data->min_key);
+ prt_str(&buf, "-");
+ bch2_bpos_to_text(&buf, b->data->max_key);
+
btree_err_on(b->data->keys.seq != bp->seq,
-BCH_ERR_btree_node_read_err_must_retry,
c, ca, b, NULL,
btree_node_bad_seq,
- "got wrong btree node (seq %llx want %llx)",
- b->data->keys.seq, bp->seq);
+ "got wrong btree node (want %llx got %llx)\n"
+ "got btree %s level %llu pos %s",
+ bp->seq, b->data->keys.seq,
+ bch2_btree_id_str(BTREE_NODE_ID(b->data)),
+ BTREE_NODE_LEVEL(b->data),
+ buf.buf);
} else {
btree_err_on(!b->data->keys.seq,
-BCH_ERR_btree_node_read_err_must_retry,
@@ -999,8 +1042,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
nonce = btree_nonce(i, b->written << 9);
- csum_bad = bch2_crc_cmp(b->data->csum,
- csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data));
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, b->data);
+ csum_bad = bch2_crc_cmp(b->data->csum, csum);
if (csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
@@ -1008,7 +1051,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, i,
bset_bad_csum,
- "invalid checksum");
+ "%s",
+ (printbuf_reset(&buf),
+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), b->data->csum, csum),
+ buf.buf));
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
@@ -1037,8 +1083,8 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"unknown checksum type %llu", BSET_CSUM_TYPE(i));
nonce = btree_nonce(i, b->written << 9);
- csum_bad = bch2_crc_cmp(bne->csum,
- csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne));
+ struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne);
+ csum_bad = bch2_crc_cmp(bne->csum, csum);
if (csum_bad)
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
@@ -1046,7 +1092,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
-BCH_ERR_btree_node_read_err_want_retry,
c, ca, b, i,
bset_bad_csum,
- "invalid checksum");
+ "%s",
+ (printbuf_reset(&buf),
+ bch2_csum_err_msg(&buf, BSET_CSUM_TYPE(i), bne->csum, csum),
+ buf.buf));
ret = bset_encrypt(c, i, b->written << 9);
if (bch2_fs_fatal_err_on(ret, c,
@@ -1111,7 +1160,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
ptr_written, b->written);
} else {
for (bne = write_block(b);
- bset_byte_offset(b, bne) < btree_bytes(c);
+ bset_byte_offset(b, bne) < btree_buf_bytes(b);
bne = (void *) bne + block_bytes(c))
btree_err_on(bne->keys.seq == b->data->keys.seq &&
!bch2_journal_seq_is_blacklisted(c,
@@ -1123,7 +1172,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
"found bset signature after last bset");
}
- sorted = btree_bounce_alloc(c, btree_bytes(c), &used_mempool);
+ sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool);
sorted->keys.u64s = 0;
set_btree_bset(b, b->set, &b->data->keys);
@@ -1139,7 +1188,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
BUG_ON(b->nr.live_u64s != u64s);
- btree_bounce_free(c, btree_bytes(c), used_mempool, sorted);
+ btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted);
if (updated_range)
bch2_btree_node_drop_keys_outside_node(b);
@@ -1202,6 +1251,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca,
out:
mempool_free(iter, &c->fill_iter);
printbuf_exit(&buf);
+ bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time);
return retry_read;
fsck_err:
if (ret == -BCH_ERR_btree_node_read_err_want_retry ||
@@ -1234,7 +1284,7 @@ static void btree_node_read_work(struct work_struct *work)
rb->have_ioref = bch2_dev_get_ioref(ca, READ);
bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META);
bio->bi_iter.bi_sector = rb->pick.ptr.offset;
- bio->bi_iter.bi_size = btree_bytes(c);
+ bio->bi_iter.bi_size = btree_buf_bytes(b);
if (rb->have_ioref) {
bio_set_dev(bio, ca->disk_sb.bdev);
@@ -1462,7 +1512,7 @@ fsck_err:
}
if (best >= 0) {
- memcpy(b->data, ra->buf[best], btree_bytes(c));
+ memcpy(b->data, ra->buf[best], btree_buf_bytes(b));
ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error);
} else {
ret = -1;
@@ -1528,7 +1578,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
for (i = 0; i < ra->nr; i++) {
ra->buf[i] = mempool_alloc(&c->btree_bounce_pool, GFP_NOFS);
ra->bio[i] = bio_alloc_bioset(NULL,
- buf_pages(ra->buf[i], btree_bytes(c)),
+ buf_pages(ra->buf[i], btree_buf_bytes(b)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree_bio);
@@ -1548,7 +1598,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
rb->pick = pick;
rb->bio.bi_iter.bi_sector = pick.ptr.offset;
rb->bio.bi_end_io = btree_node_read_all_replicas_endio;
- bch2_bio_map(&rb->bio, ra->buf[i], btree_bytes(c));
+ bch2_bio_map(&rb->bio, ra->buf[i], btree_buf_bytes(b));
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@@ -1575,16 +1625,17 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool
return 0;
}
-void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
+void bch2_btree_node_read(struct btree_trans *trans, struct btree *b,
bool sync)
{
+ struct bch_fs *c = trans->c;
struct extent_ptr_decoded pick;
struct btree_read_bio *rb;
struct bch_dev *ca;
struct bio *bio;
int ret;
- trace_and_count(c, btree_node_read, c, b);
+ trace_and_count(c, btree_node_read, trans, b);
if (bch2_verify_all_btree_replicas &&
!btree_node_read_all_replicas(c, b, sync))
@@ -1614,7 +1665,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
ca = bch_dev_bkey_exists(c, pick.ptr.dev);
bio = bio_alloc_bioset(NULL,
- buf_pages(b->data, btree_bytes(c)),
+ buf_pages(b->data, btree_buf_bytes(b)),
REQ_OP_READ|REQ_SYNC|REQ_META,
GFP_NOFS,
&c->btree_bio);
@@ -1628,7 +1679,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
INIT_WORK(&rb->work, btree_node_read_work);
bio->bi_iter.bi_sector = pick.ptr.offset;
bio->bi_end_io = btree_node_read_endio;
- bch2_bio_map(bio, b->data, btree_bytes(c));
+ bch2_bio_map(bio, b->data, btree_buf_bytes(b));
if (rb->have_ioref) {
this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree],
@@ -1637,7 +1688,7 @@ void bch2_btree_node_read(struct bch_fs *c, struct btree *b,
if (sync) {
submit_bio_wait(bio);
-
+ bch2_latency_acct(ca, rb->start_time, READ);
btree_node_read_work(&rb->work);
} else {
submit_bio(bio);
@@ -1663,12 +1714,12 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
b = bch2_btree_node_mem_alloc(trans, level != 0);
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
BUG_ON(IS_ERR(b));
@@ -1677,7 +1728,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id,
set_btree_node_read_in_flight(b);
- bch2_btree_node_read(c, b, true);
+ bch2_btree_node_read(trans, b, true);
if (btree_node_read_error(b)) {
bch2_btree_node_hash_remove(&c->btree_cache, b);
@@ -1789,8 +1840,10 @@ static void btree_node_write_work(struct work_struct *work)
bch2_bkey_drop_ptrs(bkey_i_to_s(&wbio->key), ptr,
bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev));
- if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key)))
+ if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) {
+ ret = -BCH_ERR_btree_write_all_failed;
goto err;
+ }
if (wbio->wbio.first_btree_write) {
if (wbio->wbio.failed.nr) {
@@ -1800,9 +1853,9 @@ static void btree_node_write_work(struct work_struct *work)
ret = bch2_trans_do(c, NULL, NULL, 0,
bch2_btree_node_update_key_get_iter(trans, b, &wbio->key,
BCH_WATERMARK_reclaim|
- BTREE_INSERT_JOURNAL_RECLAIM|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOCHECK_RW,
+ BCH_TRANS_COMMIT_journal_reclaim|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw,
!wbio->wbio.failed.nr));
if (ret)
goto err;
@@ -1885,7 +1938,6 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b,
static void btree_write_submit(struct work_struct *work)
{
struct btree_write_bio *wbio = container_of(work, struct btree_write_bio, work);
- struct bch_extent_ptr *ptr;
BKEY_PADDED_ONSTACK(k, BKEY_BTREE_PTR_VAL_U64s_MAX) tmp;
bkey_copy(&tmp.k, &wbio->key);
@@ -2022,8 +2074,8 @@ do_write:
i->u64s = 0;
sort_iter_add(&sort_iter.iter,
- unwritten_whiteouts_start(c, b),
- unwritten_whiteouts_end(c, b));
+ unwritten_whiteouts_start(b),
+ unwritten_whiteouts_end(b));
SET_BSET_SEPARATE_WHITEOUTS(i, false);
b->whiteout_u64s = 0;
@@ -2199,7 +2251,7 @@ bool bch2_btree_post_write_cleanup(struct bch_fs *c, struct btree *b)
bne = want_new_bset(c, b);
if (bne)
- bch2_bset_init_next(c, b, bne);
+ bch2_bset_init_next(b, bne);
bch2_btree_build_aux_trees(b);
diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h
index e0d7fa5b1dfb..e251cb6b965f 100644
--- a/fs/bcachefs/btree_io.h
+++ b/fs/bcachefs/btree_io.h
@@ -130,7 +130,7 @@ void bch2_btree_init_next(struct btree_trans *, struct btree *);
int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *,
struct btree *, bool, bool *);
-void bch2_btree_node_read(struct bch_fs *, struct btree *, bool);
+void bch2_btree_node_read(struct btree_trans *, struct btree *, bool);
int bch2_btree_root_read(struct bch_fs *, enum btree_id,
const struct bkey_i *, unsigned);
diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c
index da594e006769..3ef338df82f5 100644
--- a/fs/bcachefs/btree_iter.c
+++ b/fs/bcachefs/btree_iter.c
@@ -13,6 +13,7 @@
#include "error.h"
#include "extents.h"
#include "journal.h"
+#include "journal_io.h"
#include "replicas.h"
#include "snapshot.h"
#include "trace.h"
@@ -21,8 +22,8 @@
#include <linux/prefetch.h>
static inline void btree_path_list_remove(struct btree_trans *, struct btree_path *);
-static inline void btree_path_list_add(struct btree_trans *, struct btree_path *,
- struct btree_path *);
+static inline void btree_path_list_add(struct btree_trans *,
+ btree_path_idx_t, btree_path_idx_t);
static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
{
@@ -33,7 +34,8 @@ static inline unsigned long btree_iter_ip_allocated(struct btree_iter *iter)
#endif
}
-static struct btree_path *btree_path_alloc(struct btree_trans *, struct btree_path *);
+static btree_path_idx_t btree_path_alloc(struct btree_trans *, btree_path_idx_t);
+static void bch2_trans_srcu_lock(struct btree_trans *);
static inline int __btree_path_cmp(const struct btree_path *l,
enum btree_id r_btree_id,
@@ -239,8 +241,9 @@ static void bch2_btree_path_verify(struct btree_trans *trans,
void bch2_trans_verify_paths(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned iter;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, iter)
bch2_btree_path_verify(trans, path);
}
@@ -250,7 +253,7 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
BUG_ON(iter->btree_id >= BTREE_ID_NR);
- BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != iter->path->cached);
+ BUG_ON(!!(iter->flags & BTREE_ITER_CACHED) != btree_iter_path(trans, iter)->cached);
BUG_ON((iter->flags & BTREE_ITER_IS_EXTENTS) &&
(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
@@ -260,8 +263,8 @@ static void bch2_btree_iter_verify(struct btree_iter *iter)
!btree_type_has_snapshot_field(iter->btree_id));
if (iter->update_path)
- bch2_btree_path_verify(trans, iter->update_path);
- bch2_btree_path_verify(trans, iter->path);
+ bch2_btree_path_verify(trans, &trans->paths[iter->update_path]);
+ bch2_btree_path_verify(trans, btree_iter_path(trans, iter));
}
static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter)
@@ -330,12 +333,12 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id,
struct bpos pos, bool key_cache)
{
struct btree_path *path;
- unsigned idx;
+ struct trans_for_each_path_inorder_iter iter;
struct printbuf buf = PRINTBUF;
btree_trans_sort_paths(trans);
- trans_for_each_path_inorder(trans, path, idx) {
+ trans_for_each_path_inorder(trans, path, iter) {
int cmp = cmp_int(path->btree_id, id) ?:
cmp_int(path->cached, key_cache);
@@ -415,8 +418,9 @@ void bch2_btree_path_fix_key_modified(struct btree_trans *trans,
struct bkey_packed *where)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path_with_node(trans, b, path) {
+ trans_for_each_path_with_node(trans, b, path, i) {
__bch2_btree_path_fix_key_modified(path, b, where);
bch2_btree_path_verify_level(trans, path, b->c.level);
}
@@ -523,6 +527,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans,
{
struct bset_tree *t = bch2_bkey_to_bset_inlined(b, where);
struct btree_path *linked;
+ unsigned i;
if (node_iter != &path->l[b->c.level].iter) {
__bch2_btree_node_iter_fix(path, b, node_iter, t,
@@ -532,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans,
bch2_btree_node_iter_verify(node_iter, b);
}
- trans_for_each_path_with_node(trans, b, linked) {
+ trans_for_each_path_with_node(trans, b, linked, i) {
__bch2_btree_node_iter_fix(linked, b,
&linked->l[b->c.level].iter, t,
where, clobber_u64s, new_u64s);
@@ -647,7 +652,6 @@ void bch2_btree_path_level_init(struct btree_trans *trans,
static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, struct btree *b)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
trans_for_each_update(trans, i)
if (!i->cached &&
@@ -655,7 +659,7 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str
i->btree_id == b->c.btree_id &&
bpos_cmp(i->k->k.p, b->data->min_key) >= 0 &&
bpos_cmp(i->k->k.p, b->data->max_key) <= 0) {
- i->old_v = bch2_btree_path_peek_slot(i->path, &i->old_k).v;
+ i->old_v = bch2_btree_path_peek_slot(trans->paths + i->path, &i->old_k).v;
if (unlikely(trans->journal_replay_not_finished)) {
struct bkey_i *j_k =
@@ -674,14 +678,22 @@ static void bch2_trans_revalidate_updates_in_node(struct btree_trans *trans, str
* A btree node is being replaced - update the iterator to point to the new
* node:
*/
-void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
+void bch2_trans_node_add(struct btree_trans *trans,
+ struct btree_path *path,
+ struct btree *b)
{
- struct btree_path *path;
+ struct btree_path *prev;
+
+ BUG_ON(!btree_path_pos_in_node(path, b));
+
+ while ((prev = prev_btree_path(trans, path)) &&
+ btree_path_pos_in_node(prev, b))
+ path = prev;
- trans_for_each_path(trans, path)
- if (path->uptodate == BTREE_ITER_UPTODATE &&
- !path->cached &&
- btree_path_pos_in_node(path, b)) {
+ for (;
+ path && btree_path_pos_in_node(path, b);
+ path = next_btree_path(trans, path))
+ if (path->uptodate == BTREE_ITER_UPTODATE && !path->cached) {
enum btree_node_locked_type t =
btree_lock_want(path, b->c.level);
@@ -704,8 +716,9 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree *b)
void bch2_trans_node_reinit_iter(struct btree_trans *trans, struct btree *b)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path_with_node(trans, b, path)
+ trans_for_each_path_with_node(trans, b, path, i)
__btree_path_level_init(path, b->c.level);
bch2_trans_revalidate_updates_in_node(trans, b);
@@ -781,7 +794,7 @@ static int btree_path_prefetch(struct btree_trans *trans, struct btree_path *pat
struct btree_node_iter node_iter = l->iter;
struct bkey_packed *k;
struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+ unsigned nr = test_bit(BCH_FS_started, &c->flags)
? (path->level > 1 ? 0 : 2)
: (path->level > 1 ? 1 : 16);
bool was_locked = btree_node_locked(path, path->level);
@@ -816,7 +829,7 @@ static int btree_path_prefetch_j(struct btree_trans *trans, struct btree_path *p
struct bch_fs *c = trans->c;
struct bkey_s_c k;
struct bkey_buf tmp;
- unsigned nr = test_bit(BCH_FS_STARTED, &c->flags)
+ unsigned nr = test_bit(BCH_FS_started, &c->flags)
? (path->level > 1 ? 0 : 2)
: (path->level > 1 ? 1 : 16);
bool was_locked = btree_node_locked(path, path->level);
@@ -884,7 +897,8 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans,
bch2_bkey_buf_reassemble(out, c, k);
- if (flags & BTREE_ITER_PREFETCH)
+ if ((flags & BTREE_ITER_PREFETCH) &&
+ c->opts.btree_node_prefetch)
ret = btree_path_prefetch_j(trans, path, &jiter);
bch2_btree_and_journal_iter_exit(&jiter);
@@ -916,7 +930,8 @@ static __always_inline int btree_path_down(struct btree_trans *trans,
bch2_bkey_buf_unpack(&tmp, c, l->b,
bch2_btree_node_iter_peek(&l->iter, l->b));
- if (flags & BTREE_ITER_PREFETCH) {
+ if ((flags & BTREE_ITER_PREFETCH) &&
+ c->opts.btree_node_prefetch) {
ret = btree_path_prefetch(trans, path);
if (ret)
goto err;
@@ -953,7 +968,8 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans)
struct bch_fs *c = trans->c;
struct btree_path *path;
unsigned long trace_ip = _RET_IP_;
- int i, ret = 0;
+ unsigned i;
+ int ret = 0;
if (trans->in_traverse_all)
return -BCH_ERR_transaction_restart_in_traverse_all;
@@ -963,7 +979,7 @@ retry_all:
trans->restarted = 0;
trans->last_restarted_ip = 0;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
path->should_be_locked = false;
btree_trans_sort_paths(trans);
@@ -977,7 +993,7 @@ retry_all:
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
}
@@ -985,16 +1001,16 @@ retry_all:
/* Now, redo traversals in correct order: */
i = 0;
while (i < trans->nr_sorted) {
- path = trans->paths + trans->sorted[i];
+ btree_path_idx_t idx = trans->sorted[i];
/*
* Traversing a path can cause another path to be added at about
* the same position:
*/
- if (path->uptodate) {
- __btree_path_get(path, false);
- ret = bch2_btree_path_traverse_one(trans, path, 0, _THIS_IP_);
- __btree_path_put(path, false);
+ if (trans->paths[idx].uptodate) {
+ __btree_path_get(&trans->paths[idx], false);
+ ret = bch2_btree_path_traverse_one(trans, idx, 0, _THIS_IP_);
+ __btree_path_put(&trans->paths[idx], false);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart) ||
bch2_err_matches(ret, ENOMEM))
@@ -1013,7 +1029,7 @@ retry_all:
* then failed to relock a path - that's fine.
*/
err:
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
trans->in_traverse_all = false;
@@ -1099,10 +1115,11 @@ static inline unsigned btree_path_up_until_good_node(struct btree_trans *trans,
* stashed in the iterator and returned from bch2_trans_exit().
*/
int bch2_btree_path_traverse_one(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path_idx,
unsigned flags,
unsigned long trace_ip)
{
+ struct btree_path *path = &trans->paths[path_idx];
unsigned depth_want = path->level;
int ret = -((int) trans->restarted);
@@ -1126,6 +1143,8 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans,
goto out;
}
+ path = &trans->paths[path_idx];
+
if (unlikely(path->level >= BTREE_MAX_DEPTH))
goto out;
@@ -1188,39 +1207,38 @@ static inline void btree_path_copy(struct btree_trans *trans, struct btree_path
}
}
-static struct btree_path *btree_path_clone(struct btree_trans *trans, struct btree_path *src,
- bool intent)
+static btree_path_idx_t btree_path_clone(struct btree_trans *trans, btree_path_idx_t src,
+ bool intent)
{
- struct btree_path *new = btree_path_alloc(trans, src);
-
- btree_path_copy(trans, new, src);
- __btree_path_get(new, intent);
+ btree_path_idx_t new = btree_path_alloc(trans, src);
+ btree_path_copy(trans, trans->paths + new, trans->paths + src);
+ __btree_path_get(trans->paths + new, intent);
return new;
}
__flatten
-struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *trans,
- struct btree_path *path, bool intent,
- unsigned long ip)
+btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *trans,
+ btree_path_idx_t path, bool intent, unsigned long ip)
{
- __btree_path_put(path, intent);
+ __btree_path_put(trans->paths + path, intent);
path = btree_path_clone(trans, path, intent);
- path->preserve = false;
+ trans->paths[path].preserve = false;
return path;
}
-struct btree_path * __must_check
+btree_path_idx_t __must_check
__bch2_btree_path_set_pos(struct btree_trans *trans,
- struct btree_path *path, struct bpos new_pos,
- bool intent, unsigned long ip, int cmp)
+ btree_path_idx_t path_idx, struct bpos new_pos,
+ bool intent, unsigned long ip)
{
- unsigned level = path->level;
+ int cmp = bpos_cmp(new_pos, trans->paths[path_idx].pos);
bch2_trans_verify_not_in_restart(trans);
- EBUG_ON(!path->ref);
+ EBUG_ON(!trans->paths[path_idx].ref);
- path = bch2_btree_path_make_mut(trans, path, intent, ip);
+ path_idx = bch2_btree_path_make_mut(trans, path_idx, intent, ip);
+ struct btree_path *path = trans->paths + path_idx;
path->pos = new_pos;
trans->paths_sorted = false;
@@ -1231,7 +1249,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
goto out;
}
- level = btree_path_up_until_good_node(trans, path, cmp);
+ unsigned level = btree_path_up_until_good_node(trans, path, cmp);
if (btree_path_node(path, level)) {
struct btree_path_level *l = &path->l[level];
@@ -1261,7 +1279,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans,
}
out:
bch2_btree_path_verify(trans, path);
- return path;
+ return path_idx;
}
/* Btree path: main interface: */
@@ -1296,19 +1314,16 @@ static struct btree_path *have_node_at_pos(struct btree_trans *trans, struct btr
return NULL;
}
-static inline void __bch2_path_free(struct btree_trans *trans, struct btree_path *path)
+static inline void __bch2_path_free(struct btree_trans *trans, btree_path_idx_t path)
{
- __bch2_btree_path_unlock(trans, path);
- btree_path_list_remove(trans, path);
- trans->paths_allocated &= ~(1ULL << path->idx);
+ __bch2_btree_path_unlock(trans, trans->paths + path);
+ btree_path_list_remove(trans, trans->paths + path);
+ __clear_bit(path, trans->paths_allocated);
}
-void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool intent)
+void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent)
{
- struct btree_path *dup;
-
- EBUG_ON(trans->paths + path->idx != path);
- EBUG_ON(!path->ref);
+ struct btree_path *path = trans->paths + path_idx, *dup;
if (!__btree_path_put(path, intent))
return;
@@ -1322,7 +1337,7 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte
if (path->should_be_locked &&
!trans->restarted &&
- (!dup || !bch2_btree_path_relock_norestart(trans, dup, _THIS_IP_)))
+ (!dup || !bch2_btree_path_relock_norestart(trans, dup)))
return;
if (dup) {
@@ -1330,16 +1345,13 @@ void bch2_path_put(struct btree_trans *trans, struct btree_path *path, bool inte
dup->should_be_locked |= path->should_be_locked;
}
- __bch2_path_free(trans, path);
+ __bch2_path_free(trans, path_idx);
}
-static void bch2_path_put_nokeep(struct btree_trans *trans, struct btree_path *path,
+static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path,
bool intent)
{
- EBUG_ON(trans->paths + path->idx != path);
- EBUG_ON(!path->ref);
-
- if (!__btree_path_put(path, intent))
+ if (!__btree_path_put(trans->paths + path, intent))
return;
__bch2_path_free(trans, path);
@@ -1362,9 +1374,6 @@ void __noreturn bch2_trans_in_restart_error(struct btree_trans *trans)
noinline __cold
void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
{
- struct btree_insert_entry *i;
- struct btree_write_buffered_key *wb;
-
prt_printf(buf, "transaction updates for %s journal seq %llu",
trans->fn, trans->journal_res.seq);
prt_newline(buf);
@@ -1388,16 +1397,10 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans)
prt_newline(buf);
}
- trans_for_each_wb_update(trans, wb) {
- prt_printf(buf, "update: btree=%s wb=1 %pS",
- bch2_btree_id_str(wb->btree),
- (void *) i->ip_allocated);
- prt_newline(buf);
-
- prt_printf(buf, " new ");
- bch2_bkey_val_to_text(buf, trans->c, bkey_i_to_s_c(&wb->k));
- prt_newline(buf);
- }
+ for (struct jset_entry *e = trans->journal_entries;
+ e != btree_trans_journal_entries_top(trans);
+ e = vstruct_next(e))
+ bch2_journal_entry_to_text(buf, trans->c, e);
printbuf_indent_sub(buf, 2);
}
@@ -1412,11 +1415,12 @@ void bch2_dump_trans_updates(struct btree_trans *trans)
printbuf_exit(&buf);
}
-noinline __cold
-void bch2_btree_path_to_text(struct printbuf *out, struct btree_path *path)
+static void bch2_btree_path_to_text(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx)
{
+ struct btree_path *path = trans->paths + path_idx;
+
prt_printf(out, "path: idx %2u ref %u:%u %c %c btree=%s l=%u pos ",
- path->idx, path->ref, path->intent_ref,
+ path_idx, path->ref, path->intent_ref,
path->preserve ? 'P' : ' ',
path->should_be_locked ? 'S' : ' ',
bch2_btree_id_str(path->btree_id),
@@ -1434,14 +1438,13 @@ static noinline __cold
void __bch2_trans_paths_to_text(struct printbuf *out, struct btree_trans *trans,
bool nosort)
{
- struct btree_path *path;
- unsigned idx;
+ struct trans_for_each_path_inorder_iter iter;
if (!nosort)
btree_trans_sort_paths(trans);
- trans_for_each_path_inorder(trans, path, idx)
- bch2_btree_path_to_text(out, path);
+ trans_for_each_path_idx_inorder(trans, iter)
+ bch2_btree_path_to_text(out, trans, iter.path_idx);
}
noinline __cold
@@ -1473,17 +1476,14 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
{
struct btree_transaction_stats *s = btree_trans_stats(trans);
struct printbuf buf = PRINTBUF;
-
- if (!s)
- return;
+ size_t nr = bitmap_weight(trans->paths_allocated, trans->nr_paths);
bch2_trans_paths_to_text(&buf, trans);
if (!buf.allocation_failure) {
mutex_lock(&s->lock);
- if (s->nr_max_paths < hweight64(trans->paths_allocated)) {
- s->nr_max_paths = trans->nr_max_paths =
- hweight64(trans->paths_allocated);
+ if (nr > s->nr_max_paths) {
+ s->nr_max_paths = nr;
swap(s->max_paths_text, buf.buf);
}
mutex_unlock(&s->lock);
@@ -1491,64 +1491,121 @@ static void bch2_trans_update_max_paths(struct btree_trans *trans)
printbuf_exit(&buf);
- trans->nr_max_paths = hweight64(trans->paths_allocated);
+ trans->nr_paths_max = nr;
+}
+
+noinline __cold
+int __bch2_btree_trans_too_many_iters(struct btree_trans *trans)
+{
+ if (trace_trans_restart_too_many_iters_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_trans_paths_to_text(&buf, trans);
+ trace_trans_restart_too_many_iters(trans, _THIS_IP_, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ count_event(trans->c, trans_restart_too_many_iters);
+
+ return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
}
static noinline void btree_path_overflow(struct btree_trans *trans)
{
bch2_dump_trans_paths_updates(trans);
- panic("trans path overflow\n");
+ bch_err(trans->c, "trans path overflow");
}
-static inline struct btree_path *btree_path_alloc(struct btree_trans *trans,
- struct btree_path *pos)
+static noinline void btree_paths_realloc(struct btree_trans *trans)
{
- struct btree_path *path;
- unsigned idx;
+ unsigned nr = trans->nr_paths * 2;
+
+ void *p = kzalloc(BITS_TO_LONGS(nr) * sizeof(unsigned long) +
+ sizeof(struct btree_trans_paths) +
+ nr * sizeof(struct btree_path) +
+ nr * sizeof(btree_path_idx_t) + 8 +
+ nr * sizeof(struct btree_insert_entry), GFP_KERNEL|__GFP_NOFAIL);
+
+ unsigned long *paths_allocated = p;
+ memcpy(paths_allocated, trans->paths_allocated, BITS_TO_LONGS(trans->nr_paths) * sizeof(unsigned long));
+ p += BITS_TO_LONGS(nr) * sizeof(unsigned long);
+
+ p += sizeof(struct btree_trans_paths);
+ struct btree_path *paths = p;
+ *trans_paths_nr(paths) = nr;
+ memcpy(paths, trans->paths, trans->nr_paths * sizeof(struct btree_path));
+ p += nr * sizeof(struct btree_path);
+
+ btree_path_idx_t *sorted = p;
+ memcpy(sorted, trans->sorted, trans->nr_sorted * sizeof(btree_path_idx_t));
+ p += nr * sizeof(btree_path_idx_t) + 8;
+
+ struct btree_insert_entry *updates = p;
+ memcpy(updates, trans->updates, trans->nr_paths * sizeof(struct btree_insert_entry));
+
+ unsigned long *old = trans->paths_allocated;
- if (unlikely(trans->paths_allocated ==
- ~((~0ULL << 1) << (BTREE_ITER_MAX - 1))))
- btree_path_overflow(trans);
+ rcu_assign_pointer(trans->paths_allocated, paths_allocated);
+ rcu_assign_pointer(trans->paths, paths);
+ rcu_assign_pointer(trans->sorted, sorted);
+ rcu_assign_pointer(trans->updates, updates);
- idx = __ffs64(~trans->paths_allocated);
+ trans->nr_paths = nr;
+
+ if (old != trans->_paths_allocated)
+ kfree_rcu_mightsleep(old);
+}
+
+static inline btree_path_idx_t btree_path_alloc(struct btree_trans *trans,
+ btree_path_idx_t pos)
+{
+ btree_path_idx_t idx = find_first_zero_bit(trans->paths_allocated, trans->nr_paths);
+
+ if (unlikely(idx == trans->nr_paths)) {
+ if (trans->nr_paths == BTREE_ITER_MAX) {
+ btree_path_overflow(trans);
+ return 0;
+ }
+
+ btree_paths_realloc(trans);
+ }
/*
* Do this before marking the new path as allocated, since it won't be
* initialized yet:
*/
- if (unlikely(idx > trans->nr_max_paths))
+ if (unlikely(idx > trans->nr_paths_max))
bch2_trans_update_max_paths(trans);
- trans->paths_allocated |= 1ULL << idx;
+ __set_bit(idx, trans->paths_allocated);
- path = &trans->paths[idx];
- path->idx = idx;
+ struct btree_path *path = &trans->paths[idx];
path->ref = 0;
path->intent_ref = 0;
path->nodes_locked = 0;
- path->alloc_seq++;
- btree_path_list_add(trans, pos, path);
+ btree_path_list_add(trans, pos, idx);
trans->paths_sorted = false;
- return path;
+ return idx;
}
-struct btree_path *bch2_path_get(struct btree_trans *trans,
- enum btree_id btree_id, struct bpos pos,
- unsigned locks_want, unsigned level,
- unsigned flags, unsigned long ip)
+btree_path_idx_t bch2_path_get(struct btree_trans *trans,
+ enum btree_id btree_id, struct bpos pos,
+ unsigned locks_want, unsigned level,
+ unsigned flags, unsigned long ip)
{
- struct btree_path *path, *path_pos = NULL;
+ struct btree_path *path;
bool cached = flags & BTREE_ITER_CACHED;
bool intent = flags & BTREE_ITER_INTENT;
- int i;
+ struct trans_for_each_path_inorder_iter iter;
+ btree_path_idx_t path_pos = 0, path_idx;
bch2_trans_verify_not_in_restart(trans);
bch2_trans_verify_locks(trans);
btree_trans_sort_paths(trans);
- trans_for_each_path_inorder(trans, path, i) {
+ trans_for_each_path_inorder(trans, path, iter) {
if (__btree_path_cmp(path,
btree_id,
cached,
@@ -1556,18 +1613,19 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
level) > 0)
break;
- path_pos = path;
+ path_pos = iter.path_idx;
}
if (path_pos &&
- path_pos->cached == cached &&
- path_pos->btree_id == btree_id &&
- path_pos->level == level) {
- __btree_path_get(path_pos, intent);
- path = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ trans->paths[path_pos].cached == cached &&
+ trans->paths[path_pos].btree_id == btree_id &&
+ trans->paths[path_pos].level == level) {
+ __btree_path_get(trans->paths + path_pos, intent);
+ path_idx = bch2_btree_path_set_pos(trans, path_pos, pos, intent, ip);
+ path = trans->paths + path_idx;
} else {
- path = btree_path_alloc(trans, path_pos);
- path_pos = NULL;
+ path_idx = btree_path_alloc(trans, path_pos);
+ path = trans->paths + path_idx;
__btree_path_get(path, intent);
path->pos = pos;
@@ -1578,7 +1636,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
path->level = level;
path->locks_want = locks_want;
path->nodes_locked = 0;
- for (i = 0; i < ARRAY_SIZE(path->l); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(path->l); i++)
path->l[i].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
#ifdef TRACK_PATH_ALLOCATED
path->ip_allocated = ip;
@@ -1604,7 +1662,7 @@ struct btree_path *bch2_path_get(struct btree_trans *trans,
if (locks_want > path->locks_want)
bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL);
- return path;
+ return path_idx;
}
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *path, struct bkey *u)
@@ -1659,9 +1717,10 @@ __bch2_btree_iter_traverse(struct btree_iter *iter)
int __must_check
bch2_btree_iter_traverse(struct btree_iter *iter)
{
+ struct btree_trans *trans = iter->trans;
int ret;
- iter->path = bch2_btree_path_set_pos(iter->trans, iter->path,
+ iter->path = bch2_btree_path_set_pos(trans, iter->path,
btree_iter_search_key(iter),
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
@@ -1670,7 +1729,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter)
if (ret)
return ret;
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(trans->paths + iter->path);
return 0;
}
@@ -1682,14 +1741,15 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
struct btree *b = NULL;
int ret;
- EBUG_ON(iter->path->cached);
+ EBUG_ON(trans->paths[iter->path].cached);
bch2_btree_iter_verify(iter);
ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
goto err;
- b = btree_path_node(iter->path, iter->path->level);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ b = btree_path_node(path, path->level);
if (!b)
goto out;
@@ -1701,7 +1761,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter)
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(btree_iter_path(trans, iter));
out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -1726,14 +1786,15 @@ struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter)
struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
- struct btree_path *path = iter->path;
struct btree *b = NULL;
int ret;
+ EBUG_ON(trans->paths[iter->path].cached);
bch2_trans_verify_not_in_restart(trans);
- EBUG_ON(iter->path->cached);
bch2_btree_iter_verify(iter);
+ struct btree_path *path = btree_iter_path(trans, iter);
+
/* already at end? */
if (!btree_path_node(path, path->level))
return NULL;
@@ -1763,17 +1824,19 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
* Haven't gotten to the end of the parent node: go back down to
* the next child node
*/
- path = iter->path =
- bch2_btree_path_set_pos(trans, path, bpos_successor(iter->pos),
- iter->flags & BTREE_ITER_INTENT,
- btree_iter_ip_allocated(iter));
+ iter->path = bch2_btree_path_set_pos(trans, iter->path,
+ bpos_successor(iter->pos),
+ iter->flags & BTREE_ITER_INTENT,
+ btree_iter_ip_allocated(iter));
+ path = btree_iter_path(trans, iter);
btree_path_set_level_down(trans, path, iter->min_depth);
- ret = bch2_btree_path_traverse(trans, path, iter->flags);
+ ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
if (ret)
goto err;
+ path = btree_iter_path(trans, iter);
b = path->l[path->level].b;
}
@@ -1783,8 +1846,8 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter)
iter->path = bch2_btree_path_set_pos(trans, iter->path, b->key.k.p,
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(iter->path);
- BUG_ON(iter->path->uptodate);
+ btree_path_set_should_be_locked(btree_iter_path(trans, iter));
+ EBUG_ON(btree_iter_path(trans, iter)->uptodate);
out:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -1799,23 +1862,15 @@ err:
inline bool bch2_btree_iter_advance(struct btree_iter *iter)
{
- if (likely(!(iter->flags & BTREE_ITER_ALL_LEVELS))) {
- struct bpos pos = iter->k.p;
- bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
- ? bpos_eq(pos, SPOS_MAX)
- : bkey_eq(pos, SPOS_MAX));
-
- if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
- pos = bkey_successor(iter, pos);
- bch2_btree_iter_set_pos(iter, pos);
- return ret;
- } else {
- if (!btree_path_node(iter->path, iter->path->level))
- return true;
+ struct bpos pos = iter->k.p;
+ bool ret = !(iter->flags & BTREE_ITER_ALL_SNAPSHOTS
+ ? bpos_eq(pos, SPOS_MAX)
+ : bkey_eq(pos, SPOS_MAX));
- iter->advanced = true;
- return false;
- }
+ if (ret && !(iter->flags & BTREE_ITER_IS_EXTENTS))
+ pos = bkey_successor(iter, pos);
+ bch2_btree_iter_set_pos(iter, pos);
+ return ret;
}
inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
@@ -1832,58 +1887,70 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter)
}
static noinline
-struct bkey_i *__bch2_btree_trans_peek_updates(struct btree_iter *iter)
+void bch2_btree_trans_peek_prev_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k)
{
- struct btree_insert_entry *i;
- struct bkey_i *ret = NULL;
+ struct bpos end = path_l(btree_iter_path(trans, iter))->b->data->min_key;
- trans_for_each_update(iter->trans, i) {
- if (i->btree_id < iter->btree_id)
- continue;
- if (i->btree_id > iter->btree_id)
- break;
- if (bpos_lt(i->k->k.p, iter->path->pos))
- continue;
- if (i->key_cache_already_flushed)
- continue;
- if (!ret || bpos_lt(i->k->k.p, ret->k.p))
- ret = i->k;
- }
+ trans_for_each_update(trans, i)
+ if (!i->key_cache_already_flushed &&
+ i->btree_id == iter->btree_id &&
+ bpos_le(i->k->k.p, iter->pos) &&
+ bpos_ge(i->k->k.p, k->k ? k->k->p : end)) {
+ iter->k = i->k->k;
+ *k = bkey_i_to_s_c(i->k);
+ }
+}
- return ret;
+static noinline
+void bch2_btree_trans_peek_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct bpos end = path_l(path)->b->key.k.p;
+
+ trans_for_each_update(trans, i)
+ if (!i->key_cache_already_flushed &&
+ i->btree_id == iter->btree_id &&
+ bpos_ge(i->k->k.p, path->pos) &&
+ bpos_le(i->k->k.p, k->k ? k->k->p : end)) {
+ iter->k = i->k->k;
+ *k = bkey_i_to_s_c(i->k);
+ }
}
-static inline struct bkey_i *btree_trans_peek_updates(struct btree_iter *iter)
+static noinline
+void bch2_btree_trans_peek_slot_updates(struct btree_trans *trans, struct btree_iter *iter,
+ struct bkey_s_c *k)
{
- return iter->flags & BTREE_ITER_WITH_UPDATES
- ? __bch2_btree_trans_peek_updates(iter)
- : NULL;
+ trans_for_each_update(trans, i)
+ if (!i->key_cache_already_flushed &&
+ i->btree_id == iter->btree_id &&
+ bpos_eq(i->k->k.p, iter->pos)) {
+ iter->k = i->k->k;
+ *k = bkey_i_to_s_c(i->k);
+ }
}
static struct bkey_i *bch2_btree_journal_peek(struct btree_trans *trans,
struct btree_iter *iter,
struct bpos end_pos)
{
- struct bkey_i *k;
-
- if (bpos_lt(iter->path->pos, iter->journal_pos))
- iter->journal_idx = 0;
+ struct btree_path *path = btree_iter_path(trans, iter);
- k = bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
- iter->path->level,
- iter->path->pos,
- end_pos,
- &iter->journal_idx);
-
- iter->journal_pos = k ? k->k.p : end_pos;
- return k;
+ return bch2_journal_keys_peek_upto(trans->c, iter->btree_id,
+ path->level,
+ path->pos,
+ end_pos,
+ &iter->journal_idx);
}
static noinline
struct bkey_s_c btree_trans_peek_slot_journal(struct btree_trans *trans,
struct btree_iter *iter)
{
- struct bkey_i *k = bch2_btree_journal_peek(trans, iter, iter->path->pos);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ struct bkey_i *k = bch2_btree_journal_peek(trans, iter, path->pos);
if (k) {
iter->k = k->k;
@@ -1898,9 +1965,10 @@ struct bkey_s_c btree_trans_peek_journal(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k)
{
+ struct btree_path *path = btree_iter_path(trans, iter);
struct bkey_i *next_journal =
bch2_btree_journal_peek(trans, iter,
- k.k ? k.k->p : path_l(iter->path)->b->key.k.p);
+ k.k ? k.k->p : path_l(path)->b->key.k.p);
if (next_journal) {
iter->k = next_journal->k;
@@ -1943,13 +2011,13 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
iter->flags|BTREE_ITER_CACHED) ?:
- bch2_btree_path_relock(trans, iter->path, _THIS_IP_);
+ bch2_btree_path_relock(trans, btree_iter_path(trans, iter), _THIS_IP_);
if (unlikely(ret))
return bkey_s_c_err(ret);
- btree_path_set_should_be_locked(iter->key_cache_path);
+ btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
- k = bch2_btree_path_peek_slot(iter->key_cache_path, &u);
+ k = bch2_btree_path_peek_slot(trans->paths + iter->key_cache_path, &u);
if (k.k && !bkey_err(k)) {
iter->k = u;
k.k = &iter->k;
@@ -1960,11 +2028,10 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos
static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key)
{
struct btree_trans *trans = iter->trans;
- struct bkey_i *next_update;
struct bkey_s_c k, k2;
int ret;
- EBUG_ON(iter->path->cached);
+ EBUG_ON(btree_iter_path(trans, iter)->cached);
bch2_btree_iter_verify(iter);
while (1) {
@@ -1982,7 +2049,8 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
goto out;
}
- l = path_l(iter->path);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ l = path_l(path);
if (unlikely(!l->b)) {
/* No btree nodes at requested level: */
@@ -1991,7 +2059,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
goto out;
}
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(path);
k = btree_path_level_peek_all(trans->c, l, &iter->k);
@@ -2009,14 +2077,9 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp
if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL))
k = btree_trans_peek_journal(trans, iter, k);
- next_update = btree_trans_peek_updates(iter);
-
- if (next_update &&
- bpos_le(next_update->k.p,
- k.k ? k.k->p : l->b->key.k.p)) {
- iter->k = next_update->k;
- k = bkey_i_to_s_c(next_update);
- }
+ if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ trans->nr_updates))
+ bch2_btree_trans_peek_updates(trans, iter, &k);
if (k.k && bkey_deleted(k.k)) {
/*
@@ -2066,13 +2129,12 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
struct bpos iter_pos;
int ret;
- EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
EBUG_ON((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) && bkey_eq(end, POS_MAX));
if (iter->update_path) {
bch2_path_put_nokeep(trans, iter->update_path,
iter->flags & BTREE_ITER_INTENT);
- iter->update_path = NULL;
+ iter->update_path = 0;
}
bch2_btree_iter_verify_entry_exit(iter);
@@ -2094,14 +2156,16 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* isn't monotonically increasing before FILTER_SNAPSHOTS, and
* that's what we check against in extents mode:
*/
- if (k.k->p.inode > end.inode)
+ if (unlikely(!(iter->flags & BTREE_ITER_IS_EXTENTS)
+ ? bkey_gt(k.k->p, end)
+ : k.k->p.inode > end.inode))
goto end;
if (iter->update_path &&
- !bkey_eq(iter->update_path->pos, k.k->p)) {
+ !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) {
bch2_path_put_nokeep(trans, iter->update_path,
iter->flags & BTREE_ITER_INTENT);
- iter->update_path = NULL;
+ iter->update_path = 0;
}
if ((iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) &&
@@ -2121,7 +2185,7 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
* advance, same as on exit for iter->path, but only up
* to snapshot
*/
- __btree_path_get(iter->path, iter->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans->paths + iter->path, iter->flags & BTREE_ITER_INTENT);
iter->update_path = iter->path;
iter->update_path = bch2_btree_path_set_pos(trans,
@@ -2177,14 +2241,14 @@ struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *iter, struct bpos e
iter->flags & BTREE_ITER_INTENT,
btree_iter_ip_allocated(iter));
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(btree_iter_path(trans, iter));
out_no_locked:
if (iter->update_path) {
- ret = bch2_btree_path_relock(trans, iter->update_path, _THIS_IP_);
+ ret = bch2_btree_path_relock(trans, trans->paths + iter->update_path, _THIS_IP_);
if (unlikely(ret))
k = bkey_s_c_err(ret);
else
- btree_path_set_should_be_locked(iter->update_path);
+ btree_path_set_should_be_locked(trans->paths + iter->update_path);
}
if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
@@ -2206,103 +2270,6 @@ end:
}
/**
- * bch2_btree_iter_peek_all_levels() - returns the first key greater than or
- * equal to iterator's current position, returning keys from every level of the
- * btree. For keys at different levels of the btree that compare equal, the key
- * from the lower level (leaf) is returned first.
- * @iter: iterator to peek from
- *
- * Returns: key if found, or an error extractable with bkey_err().
- */
-struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *iter)
-{
- struct btree_trans *trans = iter->trans;
- struct bkey_s_c k;
- int ret;
-
- EBUG_ON(iter->path->cached);
- bch2_btree_iter_verify(iter);
- BUG_ON(iter->path->level < iter->min_depth);
- BUG_ON(!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS));
- EBUG_ON(!(iter->flags & BTREE_ITER_ALL_LEVELS));
-
- while (1) {
- iter->path = bch2_btree_path_set_pos(trans, iter->path, iter->pos,
- iter->flags & BTREE_ITER_INTENT,
- btree_iter_ip_allocated(iter));
-
- ret = bch2_btree_path_traverse(trans, iter->path, iter->flags);
- if (unlikely(ret)) {
- /* ensure that iter->k is consistent with iter->pos: */
- bch2_btree_iter_set_pos(iter, iter->pos);
- k = bkey_s_c_err(ret);
- goto out_no_locked;
- }
-
- /* Already at end? */
- if (!btree_path_node(iter->path, iter->path->level)) {
- k = bkey_s_c_null;
- goto out_no_locked;
- }
-
- k = btree_path_level_peek_all(trans->c,
- &iter->path->l[iter->path->level], &iter->k);
-
- /* Check if we should go up to the parent node: */
- if (!k.k ||
- (iter->advanced &&
- bpos_eq(path_l(iter->path)->b->key.k.p, iter->pos))) {
- iter->pos = path_l(iter->path)->b->key.k.p;
- btree_path_set_level_up(trans, iter->path);
- iter->advanced = false;
- continue;
- }
-
- /*
- * Check if we should go back down to a leaf:
- * If we're not in a leaf node, we only return the current key
- * if it exactly matches iter->pos - otherwise we first have to
- * go back to the leaf:
- */
- if (iter->path->level != iter->min_depth &&
- (iter->advanced ||
- !k.k ||
- !bpos_eq(iter->pos, k.k->p))) {
- btree_path_set_level_down(trans, iter->path, iter->min_depth);
- iter->pos = bpos_successor(iter->pos);
- iter->advanced = false;
- continue;
- }
-
- /* Check if we should go to the next key: */
- if (iter->path->level == iter->min_depth &&
- iter->advanced &&
- k.k &&
- bpos_eq(iter->pos, k.k->p)) {
- iter->pos = bpos_successor(iter->pos);
- iter->advanced = false;
- continue;
- }
-
- if (iter->advanced &&
- iter->path->level == iter->min_depth &&
- !bpos_eq(k.k->p, iter->pos))
- iter->advanced = false;
-
- BUG_ON(iter->advanced);
- BUG_ON(!k.k);
- break;
- }
-
- iter->pos = k.k->p;
- btree_path_set_should_be_locked(iter->path);
-out_no_locked:
- bch2_btree_iter_verify(iter);
-
- return k;
-}
-
-/**
* bch2_btree_iter_next() - returns first key greater than iterator's current
* position
* @iter: iterator to peek from
@@ -2328,14 +2295,14 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
{
struct btree_trans *trans = iter->trans;
struct bpos search_key = iter->pos;
- struct btree_path *saved_path = NULL;
struct bkey_s_c k;
struct bkey saved_k;
const struct bch_val *saved_v;
+ btree_path_idx_t saved_path = 0;
int ret;
- EBUG_ON(iter->path->cached || iter->path->level);
- EBUG_ON(iter->flags & BTREE_ITER_WITH_UPDATES);
+ EBUG_ON(btree_iter_path(trans, iter)->cached ||
+ btree_iter_path(trans, iter)->level);
if (iter->flags & BTREE_ITER_WITH_JOURNAL)
return bkey_s_c_err(-EIO);
@@ -2359,14 +2326,18 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
goto out_no_locked;
}
- k = btree_path_level_peek(trans, iter->path,
- &iter->path->l[0], &iter->k);
+ struct btree_path *path = btree_iter_path(trans, iter);
+
+ k = btree_path_level_peek(trans, path, &path->l[0], &iter->k);
if (!k.k ||
((iter->flags & BTREE_ITER_IS_EXTENTS)
? bpos_ge(bkey_start_pos(k.k), search_key)
: bpos_gt(k.k->p, search_key)))
- k = btree_path_level_prev(trans, iter->path,
- &iter->path->l[0], &iter->k);
+ k = btree_path_level_prev(trans, path, &path->l[0], &iter->k);
+
+ if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ trans->nr_updates))
+ bch2_btree_trans_peek_prev_updates(trans, iter, &k);
if (likely(k.k)) {
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS) {
@@ -2382,13 +2353,13 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
bch2_path_put_nokeep(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
iter->path = saved_path;
- saved_path = NULL;
+ saved_path = 0;
iter->k = saved_k;
k.v = saved_v;
goto got_key;
}
- if (bch2_snapshot_is_ancestor(iter->trans->c,
+ if (bch2_snapshot_is_ancestor(trans->c,
iter->snapshot,
k.k->p.snapshot)) {
if (saved_path)
@@ -2396,6 +2367,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter)
iter->flags & BTREE_ITER_INTENT);
saved_path = btree_path_clone(trans, iter->path,
iter->flags & BTREE_ITER_INTENT);
+ path = btree_iter_path(trans, iter);
saved_k = *k.k;
saved_v = k.v;
}
@@ -2412,10 +2384,11 @@ got_key:
continue;
}
+ btree_path_set_should_be_locked(path);
break;
- } else if (likely(!bpos_eq(iter->path->l[0].b->data->min_key, POS_MIN))) {
+ } else if (likely(!bpos_eq(path->l[0].b->data->min_key, POS_MIN))) {
/* Advance to previous leaf node: */
- search_key = bpos_predecessor(iter->path->l[0].b->data->min_key);
+ search_key = bpos_predecessor(path->l[0].b->data->min_key);
} else {
/* Start of btree: */
bch2_btree_iter_set_pos(iter, POS_MIN);
@@ -2432,8 +2405,6 @@ got_key:
if (iter->flags & BTREE_ITER_FILTER_SNAPSHOTS)
iter->pos.snapshot = iter->snapshot;
-
- btree_path_set_should_be_locked(iter->path);
out_no_locked:
if (saved_path)
bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_INTENT);
@@ -2468,8 +2439,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
bch2_btree_iter_verify(iter);
bch2_btree_iter_verify_entry_exit(iter);
- EBUG_ON(iter->flags & BTREE_ITER_ALL_LEVELS);
- EBUG_ON(iter->path->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
+ EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_WITH_KEY_CACHE));
/* extents can't span inode numbers: */
if ((iter->flags & BTREE_ITER_IS_EXTENTS) &&
@@ -2493,13 +2463,13 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if ((iter->flags & BTREE_ITER_CACHED) ||
!(iter->flags & (BTREE_ITER_IS_EXTENTS|BTREE_ITER_FILTER_SNAPSHOTS))) {
- struct bkey_i *next_update;
+ k = bkey_s_c_null;
- if ((next_update = btree_trans_peek_updates(iter)) &&
- bpos_eq(next_update->k.p, iter->pos)) {
- iter->k = next_update->k;
- k = bkey_i_to_s_c(next_update);
- goto out;
+ if (unlikely((iter->flags & BTREE_ITER_WITH_UPDATES) &&
+ trans->nr_updates)) {
+ bch2_btree_trans_peek_slot_updates(trans, iter, &k);
+ if (k.k)
+ goto out;
}
if (unlikely(iter->flags & BTREE_ITER_WITH_JOURNAL) &&
@@ -2514,7 +2484,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
goto out_no_locked;
}
- k = bch2_btree_path_peek_slot(iter->path, &iter->k);
+ k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k);
if (unlikely(!k.k))
goto out_no_locked;
} else {
@@ -2524,7 +2494,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
if (iter->flags & BTREE_ITER_IS_EXTENTS)
end.offset = U64_MAX;
- EBUG_ON(iter->path->level);
+ EBUG_ON(btree_iter_path(trans, iter)->level);
if (iter->flags & BTREE_ITER_INTENT) {
struct btree_iter iter2;
@@ -2570,7 +2540,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter)
}
}
out:
- btree_path_set_should_be_locked(iter->path);
+ btree_path_set_should_be_locked(btree_iter_path(trans, iter));
out_no_locked:
bch2_btree_iter_verify_entry_exit(iter);
bch2_btree_iter_verify(iter);
@@ -2617,17 +2587,17 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
struct btree_path *path;
unsigned i;
- BUG_ON(trans->nr_sorted != hweight64(trans->paths_allocated));
+ BUG_ON(trans->nr_sorted != bitmap_weight(trans->paths_allocated, trans->nr_paths) - 1);
- trans_for_each_path(trans, path) {
+ trans_for_each_path(trans, path, i) {
BUG_ON(path->sorted_idx >= trans->nr_sorted);
- BUG_ON(trans->sorted[path->sorted_idx] != path->idx);
+ BUG_ON(trans->sorted[path->sorted_idx] != i);
}
for (i = 0; i < trans->nr_sorted; i++) {
unsigned idx = trans->sorted[i];
- EBUG_ON(!(trans->paths_allocated & (1ULL << idx)));
+ BUG_ON(!test_bit(idx, trans->paths_allocated));
BUG_ON(trans->paths[idx].sorted_idx != i);
}
}
@@ -2635,12 +2605,12 @@ static void btree_trans_verify_sorted_refs(struct btree_trans *trans)
static void btree_trans_verify_sorted(struct btree_trans *trans)
{
struct btree_path *path, *prev = NULL;
- unsigned i;
+ struct trans_for_each_path_inorder_iter iter;
if (!bch2_debug_check_iterators)
return;
- trans_for_each_path_inorder(trans, path, i) {
+ trans_for_each_path_inorder(trans, path, iter) {
if (prev && btree_path_cmp(prev, path) > 0) {
__bch2_dump_trans_paths_updates(trans, true);
panic("trans paths out of order!\n");
@@ -2697,42 +2667,40 @@ out:
static inline void btree_path_list_remove(struct btree_trans *trans,
struct btree_path *path)
{
- unsigned i;
-
EBUG_ON(path->sorted_idx >= trans->nr_sorted);
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
trans->nr_sorted--;
memmove_u64s_down_small(trans->sorted + path->sorted_idx,
trans->sorted + path->sorted_idx + 1,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
+ sizeof(u64) / sizeof(btree_path_idx_t)));
#else
array_remove_item(trans->sorted, trans->nr_sorted, path->sorted_idx);
#endif
- for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
trans->paths[trans->sorted[i]].sorted_idx = i;
-
- path->sorted_idx = U8_MAX;
}
static inline void btree_path_list_add(struct btree_trans *trans,
- struct btree_path *pos,
- struct btree_path *path)
+ btree_path_idx_t pos,
+ btree_path_idx_t path_idx)
{
- unsigned i;
+ struct btree_path *path = trans->paths + path_idx;
- path->sorted_idx = pos ? pos->sorted_idx + 1 : trans->nr_sorted;
+ path->sorted_idx = pos ? trans->paths[pos].sorted_idx + 1 : trans->nr_sorted;
#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
memmove_u64s_up_small(trans->sorted + path->sorted_idx + 1,
trans->sorted + path->sorted_idx,
- DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx, 8));
+ DIV_ROUND_UP(trans->nr_sorted - path->sorted_idx,
+ sizeof(u64) / sizeof(btree_path_idx_t)));
trans->nr_sorted++;
- trans->sorted[path->sorted_idx] = path->idx;
+ trans->sorted[path->sorted_idx] = path_idx;
#else
- array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path->idx);
+ array_insert_item(trans->sorted, trans->nr_sorted, path->sorted_idx, path_idx);
#endif
- for (i = path->sorted_idx; i < trans->nr_sorted; i++)
+ for (unsigned i = path->sorted_idx; i < trans->nr_sorted; i++)
trans->paths[trans->sorted[i]].sorted_idx = i;
btree_trans_verify_sorted_refs(trans);
@@ -2749,9 +2717,10 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter)
if (iter->key_cache_path)
bch2_path_put(trans, iter->key_cache_path,
iter->flags & BTREE_ITER_INTENT);
- iter->path = NULL;
- iter->update_path = NULL;
- iter->key_cache_path = NULL;
+ iter->path = 0;
+ iter->update_path = 0;
+ iter->key_cache_path = 0;
+ iter->trans = NULL;
}
void bch2_trans_iter_init_outlined(struct btree_trans *trans,
@@ -2782,41 +2751,46 @@ void bch2_trans_node_iter_init(struct btree_trans *trans,
iter->min_depth = depth;
- BUG_ON(iter->path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
- BUG_ON(iter->path->level != depth);
- BUG_ON(iter->min_depth != depth);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ BUG_ON(path->locks_want < min(locks_want, BTREE_MAX_DEPTH));
+ BUG_ON(path->level != depth);
+ BUG_ON(iter->min_depth != depth);
}
void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src)
{
+ struct btree_trans *trans = src->trans;
+
*dst = *src;
if (src->path)
- __btree_path_get(src->path, src->flags & BTREE_ITER_INTENT);
+ __btree_path_get(trans->paths + src->path, src->flags & BTREE_ITER_INTENT);
if (src->update_path)
- __btree_path_get(src->update_path, src->flags & BTREE_ITER_INTENT);
- dst->key_cache_path = NULL;
+ __btree_path_get(trans->paths + src->update_path, src->flags & BTREE_ITER_INTENT);
+ dst->key_cache_path = 0;
}
void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
{
+ struct bch_fs *c = trans->c;
unsigned new_top = trans->mem_top + size;
- size_t old_bytes = trans->mem_bytes;
- size_t new_bytes = roundup_pow_of_two(new_top);
+ unsigned old_bytes = trans->mem_bytes;
+ unsigned new_bytes = roundup_pow_of_two(new_top);
int ret;
void *new_mem;
void *p;
- trans->mem_max = max(trans->mem_max, new_top);
-
WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX);
+ struct btree_transaction_stats *s = btree_trans_stats(trans);
+ s->max_mem = max(s->max_mem, new_bytes);
+
new_mem = krealloc(trans->mem, new_bytes, GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_mem)) {
bch2_trans_unlock(trans);
new_mem = krealloc(trans->mem, new_bytes, GFP_KERNEL);
if (!new_mem && new_bytes <= BTREE_TRANS_MEM_MAX) {
- new_mem = mempool_alloc(&trans->c->btree_trans_mem_pool, GFP_KERNEL);
+ new_mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
new_bytes = BTREE_TRANS_MEM_MAX;
kfree(trans->mem);
}
@@ -2836,7 +2810,7 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
trans->mem_bytes = new_bytes;
if (old_bytes) {
- trace_and_count(trans->c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
+ trace_and_count(c, trans_restart_mem_realloced, trans, _RET_IP_, new_bytes);
return ERR_PTR(btree_trans_restart(trans, BCH_ERR_transaction_restart_mem_realloced));
}
@@ -2858,8 +2832,9 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans)
if (trans->srcu_held) {
struct bch_fs *c = trans->c;
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->cached && !btree_node_locked(path, 0))
path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_srcu_reset);
@@ -2869,7 +2844,7 @@ void bch2_trans_srcu_unlock(struct btree_trans *trans)
}
}
-void bch2_trans_srcu_lock(struct btree_trans *trans)
+static void bch2_trans_srcu_lock(struct btree_trans *trans)
{
if (!trans->srcu_held) {
trans->srcu_idx = srcu_read_lock(&trans->c->btree_trans_barrier);
@@ -2891,14 +2866,16 @@ void bch2_trans_srcu_lock(struct btree_trans *trans)
u32 bch2_trans_begin(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
u64 now;
bch2_trans_reset_updates(trans);
trans->restart_count++;
trans->mem_top = 0;
+ trans->journal_entries = NULL;
- trans_for_each_path(trans, path) {
+ trans_for_each_path(trans, path, i) {
path->should_be_locked = false;
/*
@@ -2915,15 +2892,21 @@ u32 bch2_trans_begin(struct btree_trans *trans)
* iterators if we do that
*/
if (!path->ref && !path->preserve)
- __bch2_path_free(trans, path);
+ __bch2_path_free(trans, i);
else
path->preserve = false;
}
now = local_clock();
+
+ if (!IS_ENABLED(CONFIG_BCACHEFS_NO_LATENCY_ACCT) &&
+ time_after64(now, trans->last_begin_time + 10))
+ __bch2_time_stats_update(&btree_trans_stats(trans)->duration,
+ trans->last_begin_time, now);
+
if (!trans->restarted &&
(need_resched() ||
- now - trans->last_begin_time > BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS)) {
+ time_after64(now, trans->last_begin_time + BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS))) {
drop_locks_do(trans, (cond_resched(), 0));
now = local_clock();
}
@@ -2942,32 +2925,11 @@ u32 bch2_trans_begin(struct btree_trans *trans)
return trans->restart_count;
}
-static struct btree_trans *bch2_trans_alloc(struct bch_fs *c)
-{
- struct btree_trans *trans;
-
- if (IS_ENABLED(__KERNEL__)) {
- trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
- if (trans)
- return trans;
- }
-
- trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
- /*
- * paths need to be zeroed, bch2_check_for_deadlock looks at
- * paths in other threads
- */
- memset(&trans->paths, 0, sizeof(trans->paths));
- return trans;
-}
-
-const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR];
+const char *bch2_btree_transaction_fns[BCH_TRANSACTIONS_NR] = { "(unknown)" };
unsigned bch2_trans_get_fn_idx(const char *fn)
{
- unsigned i;
-
- for (i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(bch2_btree_transaction_fns); i++)
if (!bch2_btree_transaction_fns[i] ||
bch2_btree_transaction_fns[i] == fn) {
bch2_btree_transaction_fns[i] = fn;
@@ -2975,76 +2937,92 @@ unsigned bch2_trans_get_fn_idx(const char *fn)
}
pr_warn_once("BCH_TRANSACTIONS_NR not big enough!");
- return i;
+ return 0;
}
struct btree_trans *__bch2_trans_get(struct bch_fs *c, unsigned fn_idx)
__acquires(&c->btree_trans_barrier)
{
struct btree_trans *trans;
- struct btree_transaction_stats *s;
-
- trans = bch2_trans_alloc(c);
- memset(trans, 0, sizeof(*trans));
- trans->c = c;
- trans->fn = fn_idx < ARRAY_SIZE(bch2_btree_transaction_fns)
- ? bch2_btree_transaction_fns[fn_idx] : NULL;
- trans->last_begin_time = local_clock();
- trans->fn_idx = fn_idx;
- trans->locking_wait.task = current;
- trans->journal_replay_not_finished =
- unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
- atomic_inc_not_zero(&c->journal_keys.ref);
- closure_init_stack(&trans->ref);
-
- s = btree_trans_stats(trans);
- if (s && s->max_mem) {
- unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
-
- trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
-
- if (!unlikely(trans->mem)) {
- trans->mem = mempool_alloc(&c->btree_trans_mem_pool, GFP_KERNEL);
- trans->mem_bytes = BTREE_TRANS_MEM_MAX;
- } else {
- trans->mem_bytes = expected_mem_bytes;
+ if (IS_ENABLED(__KERNEL__)) {
+ trans = this_cpu_xchg(c->btree_trans_bufs->trans, NULL);
+ if (trans) {
+ memset(trans, 0, offsetof(struct btree_trans, list));
+ goto got_trans;
}
}
- if (s) {
- trans->nr_max_paths = s->nr_max_paths;
- trans->wb_updates_size = s->wb_updates_size;
- }
-
- trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
- trans->srcu_lock_time = jiffies;
- trans->srcu_held = true;
+ trans = mempool_alloc(&c->btree_trans_pool, GFP_NOFS);
+ memset(trans, 0, sizeof(*trans));
+ closure_init_stack(&trans->ref);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
+ seqmutex_lock(&c->btree_trans_lock);
+ if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG)) {
struct btree_trans *pos;
+ pid_t pid = current->pid;
+
+ trans->locking_wait.task = current;
- seqmutex_lock(&c->btree_trans_lock);
list_for_each_entry(pos, &c->btree_trans_list, list) {
+ struct task_struct *pos_task = READ_ONCE(pos->locking_wait.task);
/*
* We'd much prefer to be stricter here and completely
* disallow multiple btree_trans in the same thread -
* but the data move path calls bch2_write when we
* already have a btree_trans initialized.
*/
- BUG_ON(trans->locking_wait.task->pid == pos->locking_wait.task->pid &&
+ BUG_ON(pos_task &&
+ pid == pos_task->pid &&
bch2_trans_locked(pos));
- if (trans->locking_wait.task->pid < pos->locking_wait.task->pid) {
+ if (pos_task && pid < pos_task->pid) {
list_add_tail(&trans->list, &pos->list);
goto list_add_done;
}
}
- list_add_tail(&trans->list, &c->btree_trans_list);
+ }
+ list_add_tail(&trans->list, &c->btree_trans_list);
list_add_done:
- seqmutex_unlock(&c->btree_trans_lock);
+ seqmutex_unlock(&c->btree_trans_lock);
+got_trans:
+ trans->c = c;
+ trans->last_begin_time = local_clock();
+ trans->fn_idx = fn_idx;
+ trans->locking_wait.task = current;
+ trans->journal_replay_not_finished =
+ unlikely(!test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags)) &&
+ atomic_inc_not_zero(&c->journal_keys.ref);
+ trans->nr_paths = ARRAY_SIZE(trans->_paths);
+ trans->paths_allocated = trans->_paths_allocated;
+ trans->sorted = trans->_sorted;
+ trans->paths = trans->_paths;
+ trans->updates = trans->_updates;
+
+ *trans_paths_nr(trans->paths) = BTREE_ITER_INITIAL;
+
+ trans->paths_allocated[0] = 1;
+
+ if (fn_idx < BCH_TRANSACTIONS_NR) {
+ trans->fn = bch2_btree_transaction_fns[fn_idx];
+
+ struct btree_transaction_stats *s = &c->btree_transaction_stats[fn_idx];
+
+ if (s->max_mem) {
+ unsigned expected_mem_bytes = roundup_pow_of_two(s->max_mem);
+
+ trans->mem = kmalloc(expected_mem_bytes, GFP_KERNEL);
+ if (likely(trans->mem))
+ trans->mem_bytes = expected_mem_bytes;
+ }
+
+ trans->nr_paths_max = s->nr_max_paths;
+ trans->journal_entries_size = s->journal_entries_size;
}
+ trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
+ trans->srcu_lock_time = jiffies;
+ trans->srcu_held = true;
return trans;
}
@@ -3053,14 +3031,15 @@ static void check_btree_paths_leaked(struct btree_trans *trans)
#ifdef CONFIG_BCACHEFS_DEBUG
struct bch_fs *c = trans->c;
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->ref)
goto leaked;
return;
leaked:
bch_err(c, "btree paths leaked from %s!", trans->fn);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->ref)
printk(KERN_ERR " btree %s %pS\n",
bch2_btree_id_str(path->btree_id),
@@ -3073,26 +3052,14 @@ leaked:
void bch2_trans_put(struct btree_trans *trans)
__releases(&c->btree_trans_barrier)
{
- struct btree_insert_entry *i;
struct bch_fs *c = trans->c;
- struct btree_transaction_stats *s = btree_trans_stats(trans);
bch2_trans_unlock(trans);
- if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG_TRANSACTIONS)) {
- seqmutex_lock(&c->btree_trans_lock);
- list_del(&trans->list);
- seqmutex_unlock(&c->btree_trans_lock);
- }
-
- closure_sync(&trans->ref);
-
- if (s)
- s->max_mem = max(s->max_mem, trans->mem_max);
-
trans_for_each_update(trans, i)
- __btree_path_put(i->path, true);
- trans->nr_updates = 0;
+ __btree_path_put(trans->paths + i->path, true);
+ trans->nr_updates = 0;
+ trans->locking_wait.task = NULL;
check_btree_paths_leaked(trans);
@@ -3101,8 +3068,6 @@ void bch2_trans_put(struct btree_trans *trans)
srcu_read_unlock(&c->btree_trans_barrier, trans->srcu_idx);
}
- kfree(trans->extra_journal_entries.data);
-
if (trans->fs_usage_deltas) {
if (trans->fs_usage_deltas->size + sizeof(trans->fs_usage_deltas) ==
REPLICAS_DELTA_LIST_MAX)
@@ -3115,6 +3080,13 @@ void bch2_trans_put(struct btree_trans *trans)
if (unlikely(trans->journal_replay_not_finished))
bch2_journal_keys_put(c);
+ unsigned long *paths_allocated = trans->paths_allocated;
+ trans->paths_allocated = NULL;
+ trans->paths = NULL;
+
+ if (paths_allocated != trans->_paths_allocated)
+ kfree_rcu_mightsleep(paths_allocated);
+
if (trans->mem_bytes == BTREE_TRANS_MEM_MAX)
mempool_free(trans->mem, &c->btree_trans_mem_pool);
else
@@ -3123,8 +3095,16 @@ void bch2_trans_put(struct btree_trans *trans)
/* Userspace doesn't have a real percpu implementation: */
if (IS_ENABLED(__KERNEL__))
trans = this_cpu_xchg(c->btree_trans_bufs->trans, trans);
- if (trans)
+
+ if (trans) {
+ closure_sync(&trans->ref);
+
+ seqmutex_lock(&c->btree_trans_lock);
+ list_del(&trans->list);
+ seqmutex_unlock(&c->btree_trans_lock);
+
mempool_free(trans, &c->btree_trans_pool);
+ }
}
static void __maybe_unused
@@ -3152,24 +3132,38 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out,
void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
{
- struct btree_path *path;
struct btree_bkey_cached_common *b;
static char lock_types[] = { 'r', 'i', 'w' };
+ struct task_struct *task = READ_ONCE(trans->locking_wait.task);
unsigned l, idx;
+ /* before rcu_read_lock(): */
+ bch2_printbuf_make_room(out, 4096);
+
if (!out->nr_tabstops) {
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 32);
}
- prt_printf(out, "%i %s\n", trans->locking_wait.task->pid, trans->fn);
+ prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn);
- trans_for_each_path_safe(trans, path, idx) {
+ /* trans->paths is rcu protected vs. freeing */
+ rcu_read_lock();
+ out->atomic++;
+
+ struct btree_path *paths = rcu_dereference(trans->paths);
+ if (!paths)
+ goto out;
+
+ unsigned long *paths_allocated = trans_paths_allocated(paths);
+
+ trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths), idx, 1) {
+ struct btree_path *path = paths + idx;
if (!path->nodes_locked)
continue;
prt_printf(out, " path %u %c l=%u %s:",
- path->idx,
+ idx,
path->cached ? 'c' : 'b',
path->level,
bch2_btree_id_str(path->btree_id));
@@ -3197,6 +3191,9 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans)
bch2_btree_bkey_cached_common_to_text(out, b);
prt_newline(out);
}
+out:
+ --out->atomic;
+ rcu_read_unlock();
}
void bch2_fs_btree_iter_exit(struct bch_fs *c)
@@ -3205,15 +3202,26 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c)
struct btree_trans *trans;
int cpu;
+ if (c->btree_trans_bufs)
+ for_each_possible_cpu(cpu) {
+ struct btree_trans *trans =
+ per_cpu_ptr(c->btree_trans_bufs, cpu)->trans;
+
+ if (trans) {
+ closure_sync(&trans->ref);
+
+ seqmutex_lock(&c->btree_trans_lock);
+ list_del(&trans->list);
+ seqmutex_unlock(&c->btree_trans_lock);
+ }
+ kfree(trans);
+ }
+ free_percpu(c->btree_trans_bufs);
+
trans = list_first_entry_or_null(&c->btree_trans_list, struct btree_trans, list);
if (trans)
panic("%s leaked btree_trans\n", trans->fn);
- if (c->btree_trans_bufs)
- for_each_possible_cpu(cpu)
- kfree(per_cpu_ptr(c->btree_trans_bufs, cpu)->trans);
- free_percpu(c->btree_trans_bufs);
-
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
@@ -3234,6 +3242,7 @@ void bch2_fs_btree_iter_init_early(struct bch_fs *c)
for (s = c->btree_transaction_stats;
s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats);
s++) {
+ bch2_time_stats_init(&s->duration);
bch2_time_stats_init(&s->lock_hold_times);
mutex_init(&s->lock);
}
diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h
index eaffced4c132..24772538e4cc 100644
--- a/fs/bcachefs/btree_iter.h
+++ b/fs/bcachefs/btree_iter.h
@@ -63,60 +63,57 @@ static inline void btree_trans_sort_paths(struct btree_trans *trans)
__bch2_btree_trans_sort_paths(trans);
}
-static inline struct btree_path *
-__trans_next_path(struct btree_trans *trans, unsigned idx)
+static inline unsigned long *trans_paths_nr(struct btree_path *paths)
{
- u64 l;
-
- if (idx == BTREE_ITER_MAX)
- return NULL;
-
- l = trans->paths_allocated >> idx;
- if (!l)
- return NULL;
-
- idx += __ffs64(l);
- EBUG_ON(idx >= BTREE_ITER_MAX);
- EBUG_ON(trans->paths[idx].idx != idx);
- return &trans->paths[idx];
+ return &container_of(paths, struct btree_trans_paths, paths[0])->nr_paths;
}
-#define trans_for_each_path_from(_trans, _path, _start) \
- for (_path = __trans_next_path((_trans), _start); \
- (_path); \
- _path = __trans_next_path((_trans), (_path)->idx + 1))
-
-#define trans_for_each_path(_trans, _path) \
- trans_for_each_path_from(_trans, _path, 0)
-
-static inline struct btree_path *
-__trans_next_path_safe(struct btree_trans *trans, unsigned *idx)
+static inline unsigned long *trans_paths_allocated(struct btree_path *paths)
{
- u64 l;
+ unsigned long *v = trans_paths_nr(paths);
+ return v - BITS_TO_LONGS(*v);
+}
- if (*idx == BTREE_ITER_MAX)
- return NULL;
+#define trans_for_each_path_idx_from(_paths_allocated, _nr, _idx, _start)\
+ for (_idx = _start; \
+ (_idx = find_next_bit(_paths_allocated, _nr, _idx)) < _nr; \
+ _idx++)
- l = trans->paths_allocated >> *idx;
- if (!l)
- return NULL;
+static inline struct btree_path *
+__trans_next_path(struct btree_trans *trans, unsigned *idx)
+{
+ unsigned long *w = trans->paths_allocated + *idx / BITS_PER_LONG;
+ /*
+ * Open coded find_next_bit(), because
+ * - this is fast path, we can't afford the function call
+ * - and we know that nr_paths is a multiple of BITS_PER_LONG,
+ */
+ while (*idx < trans->nr_paths) {
+ unsigned long v = *w >> (*idx & (BITS_PER_LONG - 1));
+ if (v) {
+ *idx += __ffs(v);
+ return trans->paths + *idx;
+ }
+
+ *idx += BITS_PER_LONG;
+ *idx &= ~(BITS_PER_LONG - 1);
+ w++;
+ }
- *idx += __ffs64(l);
- EBUG_ON(*idx >= BTREE_ITER_MAX);
- return &trans->paths[*idx];
+ return NULL;
}
/*
* This version is intended to be safe for use on a btree_trans that is owned by
* another thread, for bch2_btree_trans_to_text();
*/
-#define trans_for_each_path_safe_from(_trans, _path, _idx, _start) \
+#define trans_for_each_path_from(_trans, _path, _idx, _start) \
for (_idx = _start; \
- (_path = __trans_next_path_safe((_trans), &_idx)); \
+ (_path = __trans_next_path((_trans), &_idx)); \
_idx++)
-#define trans_for_each_path_safe(_trans, _path, _idx) \
- trans_for_each_path_safe_from(_trans, _path, _idx, 0)
+#define trans_for_each_path(_trans, _path, _idx) \
+ trans_for_each_path_from(_trans, _path, _idx, 1)
static inline struct btree_path *next_btree_path(struct btree_trans *trans, struct btree_path *path)
{
@@ -138,10 +135,23 @@ static inline struct btree_path *prev_btree_path(struct btree_trans *trans, stru
: NULL;
}
-#define trans_for_each_path_inorder(_trans, _path, _i) \
- for (_i = 0; \
- ((_path) = (_trans)->paths + trans->sorted[_i]), (_i) < (_trans)->nr_sorted;\
- _i++)
+#define trans_for_each_path_idx_inorder(_trans, _iter) \
+ for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
+ (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
+ _iter.sorted_idx < (_trans)->nr_sorted); \
+ _iter.sorted_idx++)
+
+struct trans_for_each_path_inorder_iter {
+ btree_path_idx_t sorted_idx;
+ btree_path_idx_t path_idx;
+};
+
+#define trans_for_each_path_inorder(_trans, _path, _iter) \
+ for (_iter = (struct trans_for_each_path_inorder_iter) { 0 }; \
+ (_iter.path_idx = trans->sorted[_iter.sorted_idx], \
+ _path = (_trans)->paths + _iter.path_idx, \
+ _iter.sorted_idx < (_trans)->nr_sorted); \
+ _iter.sorted_idx++)
#define trans_for_each_path_inorder_reverse(_trans, _path, _i) \
for (_i = trans->nr_sorted - 1; \
@@ -157,67 +167,65 @@ static inline bool __path_has_node(const struct btree_path *path,
static inline struct btree_path *
__trans_next_path_with_node(struct btree_trans *trans, struct btree *b,
- unsigned idx)
+ unsigned *idx)
{
- struct btree_path *path = __trans_next_path(trans, idx);
+ struct btree_path *path;
- while (path && !__path_has_node(path, b))
- path = __trans_next_path(trans, path->idx + 1);
+ while ((path = __trans_next_path(trans, idx)) &&
+ !__path_has_node(path, b))
+ (*idx)++;
return path;
}
-#define trans_for_each_path_with_node(_trans, _b, _path) \
- for (_path = __trans_next_path_with_node((_trans), (_b), 0); \
- (_path); \
- _path = __trans_next_path_with_node((_trans), (_b), \
- (_path)->idx + 1))
+#define trans_for_each_path_with_node(_trans, _b, _path, _iter) \
+ for (_iter = 1; \
+ (_path = __trans_next_path_with_node((_trans), (_b), &_iter));\
+ _iter++)
-struct btree_path *__bch2_btree_path_make_mut(struct btree_trans *, struct btree_path *,
- bool, unsigned long);
+btree_path_idx_t __bch2_btree_path_make_mut(struct btree_trans *, btree_path_idx_t,
+ bool, unsigned long);
-static inline struct btree_path * __must_check
+static inline btree_path_idx_t __must_check
bch2_btree_path_make_mut(struct btree_trans *trans,
- struct btree_path *path, bool intent,
+ btree_path_idx_t path, bool intent,
unsigned long ip)
{
- if (path->ref > 1 || path->preserve)
+ if (trans->paths[path].ref > 1 ||
+ trans->paths[path].preserve)
path = __bch2_btree_path_make_mut(trans, path, intent, ip);
- path->should_be_locked = false;
+ trans->paths[path].should_be_locked = false;
return path;
}
-struct btree_path * __must_check
-__bch2_btree_path_set_pos(struct btree_trans *, struct btree_path *,
- struct bpos, bool, unsigned long, int);
+btree_path_idx_t __must_check
+__bch2_btree_path_set_pos(struct btree_trans *, btree_path_idx_t,
+ struct bpos, bool, unsigned long);
-static inline struct btree_path * __must_check
+static inline btree_path_idx_t __must_check
bch2_btree_path_set_pos(struct btree_trans *trans,
- struct btree_path *path, struct bpos new_pos,
- bool intent, unsigned long ip)
+ btree_path_idx_t path, struct bpos new_pos,
+ bool intent, unsigned long ip)
{
- int cmp = bpos_cmp(new_pos, path->pos);
-
- return cmp
- ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip, cmp)
+ return !bpos_eq(new_pos, trans->paths[path].pos)
+ ? __bch2_btree_path_set_pos(trans, path, new_pos, intent, ip)
: path;
}
-int __must_check bch2_btree_path_traverse_one(struct btree_trans *, struct btree_path *,
+int __must_check bch2_btree_path_traverse_one(struct btree_trans *,
+ btree_path_idx_t,
unsigned, unsigned long);
static inline int __must_check bch2_btree_path_traverse(struct btree_trans *trans,
- struct btree_path *path, unsigned flags)
+ btree_path_idx_t path, unsigned flags)
{
- if (path->uptodate < BTREE_ITER_NEED_RELOCK)
+ if (trans->paths[path].uptodate < BTREE_ITER_NEED_RELOCK)
return 0;
return bch2_btree_path_traverse_one(trans, path, flags, _RET_IP_);
}
-int __must_check bch2_btree_path_traverse(struct btree_trans *,
- struct btree_path *, unsigned);
-struct btree_path *bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
+btree_path_idx_t bch2_path_get(struct btree_trans *, enum btree_id, struct bpos,
unsigned, unsigned, unsigned, unsigned long);
struct bkey_s_c bch2_btree_path_peek_slot(struct btree_path *, struct bkey *);
@@ -269,7 +277,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, struct btree_path *,
int bch2_btree_path_relock_intent(struct btree_trans *, struct btree_path *);
-void bch2_path_put(struct btree_trans *, struct btree_path *, bool);
+void bch2_path_put(struct btree_trans *, btree_path_idx_t, bool);
int bch2_trans_relock(struct btree_trans *);
int bch2_trans_relock_notrace(struct btree_trans *);
@@ -335,7 +343,7 @@ static inline void bch2_btree_path_downgrade(struct btree_trans *trans,
void bch2_trans_downgrade(struct btree_trans *);
-void bch2_trans_node_add(struct btree_trans *trans, struct btree *);
+void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct btree *);
void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *);
int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter);
@@ -348,8 +356,6 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *);
struct bkey_s_c bch2_btree_iter_peek_upto(struct btree_iter *, struct bpos);
struct bkey_s_c bch2_btree_iter_next(struct btree_iter *);
-struct bkey_s_c bch2_btree_iter_peek_all_levels(struct btree_iter *);
-
static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter)
{
return bch2_btree_iter_peek_upto(iter, SPOS_MAX);
@@ -376,10 +382,12 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo
static inline void bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos)
{
+ struct btree_trans *trans = iter->trans;
+
if (unlikely(iter->update_path))
- bch2_path_put(iter->trans, iter->update_path,
+ bch2_path_put(trans, iter->update_path,
iter->flags & BTREE_ITER_INTENT);
- iter->update_path = NULL;
+ iter->update_path = 0;
if (!(iter->flags & BTREE_ITER_ALL_SNAPSHOTS))
new_pos.snapshot = iter->snapshot;
@@ -408,9 +416,6 @@ static inline unsigned __bch2_btree_iter_flags(struct btree_trans *trans,
unsigned btree_id,
unsigned flags)
{
- if (flags & BTREE_ITER_ALL_LEVELS)
- flags |= BTREE_ITER_ALL_SNAPSHOTS|__BTREE_ITER_ALL_SNAPSHOTS;
-
if (!(flags & (BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_NOT_EXTENTS)) &&
btree_id_is_extents(btree_id))
flags |= BTREE_ITER_IS_EXTENTS;
@@ -450,14 +455,16 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans,
unsigned flags,
unsigned long ip)
{
- memset(iter, 0, sizeof(*iter));
- iter->trans = trans;
- iter->btree_id = btree_id;
- iter->flags = flags;
- iter->snapshot = pos.snapshot;
- iter->pos = pos;
- iter->k.p = pos;
-
+ iter->trans = trans;
+ iter->update_path = 0;
+ iter->key_cache_path = 0;
+ iter->btree_id = btree_id;
+ iter->min_depth = 0;
+ iter->flags = flags;
+ iter->snapshot = pos.snapshot;
+ iter->pos = pos;
+ iter->k = POS_KEY(pos);
+ iter->journal_idx = 0;
#ifdef CONFIG_BCACHEFS_DEBUG
iter->ip_allocated = ip;
#endif
@@ -489,8 +496,10 @@ void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *);
static inline void set_btree_iter_dontneed(struct btree_iter *iter)
{
- if (!iter->trans->restarted)
- iter->path->preserve = false;
+ struct btree_trans *trans = iter->trans;
+
+ if (!trans->restarted)
+ btree_iter_path(trans, iter)->preserve = false;
}
void *__bch2_trans_kmalloc(struct btree_trans *, size_t);
@@ -512,7 +521,7 @@ static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size)
static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size)
{
- size = roundup(size, 8);
+ size = round_up(size, 8);
if (likely(trans->mem_top + size <= trans->mem_bytes)) {
void *p = trans->mem + trans->mem_top;
@@ -581,7 +590,6 @@ static inline int __bch2_bkey_get_val_typed(struct btree_trans *trans,
KEY_TYPE_##_type, sizeof(*_val), _val)
void bch2_trans_srcu_unlock(struct btree_trans *);
-void bch2_trans_srcu_lock(struct btree_trans *);
u32 bch2_trans_begin(struct btree_trans *);
@@ -606,8 +614,6 @@ u32 bch2_trans_begin(struct btree_trans *);
static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter,
unsigned flags)
{
- BUG_ON(flags & BTREE_ITER_ALL_LEVELS);
-
return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek_prev(iter);
}
@@ -615,8 +621,7 @@ static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *
static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter,
unsigned flags)
{
- return flags & BTREE_ITER_ALL_LEVELS ? bch2_btree_iter_peek_all_levels(iter) :
- flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
+ return flags & BTREE_ITER_SLOTS ? bch2_btree_iter_peek_slot(iter) :
bch2_btree_iter_peek(iter);
}
@@ -633,61 +638,34 @@ static inline struct bkey_s_c bch2_btree_iter_peek_upto_type(struct btree_iter *
return bch2_btree_iter_peek_slot(iter);
}
+int __bch2_btree_trans_too_many_iters(struct btree_trans *);
+
static inline int btree_trans_too_many_iters(struct btree_trans *trans)
{
- if (hweight64(trans->paths_allocated) > BTREE_ITER_MAX - 8) {
- trace_and_count(trans->c, trans_restart_too_many_iters, trans, _THIS_IP_);
- return btree_trans_restart(trans, BCH_ERR_transaction_restart_too_many_iters);
- }
+ if (bitmap_weight(trans->paths_allocated, trans->nr_paths) > BTREE_ITER_INITIAL - 8)
+ return __bch2_btree_trans_too_many_iters(trans);
return 0;
}
-struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
-
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
- struct btree_iter *iter, unsigned flags)
-{
- struct bkey_s_c k;
-
- while (btree_trans_too_many_iters(trans) ||
- (k = bch2_btree_iter_peek_type(iter, flags),
- bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(trans);
-
- return k;
-}
-
-static inline struct bkey_s_c
-__bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
- struct btree_iter *iter,
- struct bpos end,
- unsigned flags)
-{
- struct bkey_s_c k;
-
- while (btree_trans_too_many_iters(trans) ||
- (k = bch2_btree_iter_peek_upto_type(iter, end, flags),
- bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
- bch2_trans_begin(trans);
-
- return k;
-}
-
+/*
+ * goto instead of loop, so that when used inside for_each_btree_key2()
+ * break/continue work correctly
+ */
#define lockrestart_do(_trans, _do) \
({ \
+ __label__ transaction_restart; \
u32 _restart_count; \
int _ret2; \
+transaction_restart: \
+ _restart_count = bch2_trans_begin(_trans); \
+ _ret2 = (_do); \
\
- do { \
- _restart_count = bch2_trans_begin(_trans); \
- _ret2 = (_do); \
- } while (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)); \
+ if (bch2_err_matches(_ret2, BCH_ERR_transaction_restart)) \
+ goto transaction_restart; \
\
if (!_ret2) \
bch2_trans_verify_not_restarted(_trans, _restart_count);\
- \
_ret2; \
})
@@ -716,91 +694,56 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
_ret2 ?: trans_was_restarted(_trans, _restart_count); \
})
-#define for_each_btree_key2(_trans, _iter, _btree_id, \
- _start, _flags, _k, _do) \
+#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
+ _start, _end, _flags, _k, _do) \
({ \
+ struct btree_iter _iter; \
+ struct bkey_s_c _k; \
int _ret3 = 0; \
\
bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
\
- while (1) { \
- u32 _restart_count = bch2_trans_begin(_trans); \
- \
- _ret3 = 0; \
- (_k) = bch2_btree_iter_peek_type(&(_iter), (_flags)); \
- if (!(_k).k) \
- break; \
+ do { \
+ _ret3 = lockrestart_do(_trans, ({ \
+ (_k) = bch2_btree_iter_peek_upto_type(&(_iter), \
+ _end, (_flags)); \
+ if (!(_k).k) \
+ break; \
\
- _ret3 = bkey_err(_k) ?: (_do); \
- if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
- continue; \
- if (_ret3) \
- break; \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- if (!bch2_btree_iter_advance(&(_iter))) \
- break; \
- } \
+ bkey_err(_k) ?: (_do); \
+ })); \
+ } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
})
-#define for_each_btree_key2_upto(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _do) \
-({ \
- int _ret3 = 0; \
- \
- bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- \
- while (1) { \
- u32 _restart_count = bch2_trans_begin(_trans); \
- \
- _ret3 = 0; \
- (_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, (_flags));\
- if (!(_k).k) \
- break; \
- \
- _ret3 = bkey_err(_k) ?: (_do); \
- if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
- continue; \
- if (_ret3) \
- break; \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- if (!bch2_btree_iter_advance(&(_iter))) \
- break; \
- } \
- \
- bch2_trans_iter_exit((_trans), &(_iter)); \
- _ret3; \
-})
+#define for_each_btree_key(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _do) \
+ for_each_btree_key_upto(_trans, _iter, _btree_id, _start, \
+ SPOS_MAX, _flags, _k, _do)
#define for_each_btree_key_reverse(_trans, _iter, _btree_id, \
_start, _flags, _k, _do) \
({ \
+ struct btree_iter _iter; \
+ struct bkey_s_c _k; \
int _ret3 = 0; \
\
bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
\
- while (1) { \
- u32 _restart_count = bch2_trans_begin(_trans); \
- (_k) = bch2_btree_iter_peek_prev_type(&(_iter), (_flags));\
- if (!(_k).k) { \
- _ret3 = 0; \
- break; \
- } \
+ do { \
+ _ret3 = lockrestart_do(_trans, ({ \
+ (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \
+ (_flags)); \
+ if (!(_k).k) \
+ break; \
\
- _ret3 = bkey_err(_k) ?: (_do); \
- if (bch2_err_matches(_ret3, BCH_ERR_transaction_restart))\
- continue; \
- if (_ret3) \
- break; \
- bch2_trans_verify_not_restarted(_trans, _restart_count);\
- if (!bch2_btree_iter_rewind(&(_iter))) \
- break; \
- } \
+ bkey_err(_k) ?: (_do); \
+ })); \
+ } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \
\
bch2_trans_iter_exit((_trans), &(_iter)); \
_ret3; \
@@ -810,7 +753,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
_start, _iter_flags, _k, \
_disk_res, _journal_seq, _commit_flags,\
_do) \
- for_each_btree_key2(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
+ for_each_btree_key(_trans, _iter, _btree_id, _start, _iter_flags, _k,\
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
@@ -826,32 +769,31 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
_start, _end, _iter_flags, _k, \
_disk_res, _journal_seq, _commit_flags,\
_do) \
- for_each_btree_key2_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
+ for_each_btree_key_upto(_trans, _iter, _btree_id, _start, _end, _iter_flags, _k,\
(_do) ?: bch2_trans_commit(_trans, (_disk_res),\
(_journal_seq), (_commit_flags)))
-#define for_each_btree_key(_trans, _iter, _btree_id, \
- _start, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
+struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *);
-#define for_each_btree_key_upto(_trans, _iter, _btree_id, \
- _start, _end, _flags, _k, _ret) \
- for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
- (_start), (_flags)); \
- (_k) = __bch2_btree_iter_peek_upto_and_restart((_trans), \
- &(_iter), _end, _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
+static inline struct bkey_s_c
+__bch2_btree_iter_peek_and_restart(struct btree_trans *trans,
+ struct btree_iter *iter, unsigned flags)
+{
+ struct bkey_s_c k;
-#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
+ while (btree_trans_too_many_iters(trans) ||
+ (k = bch2_btree_iter_peek_type(iter, flags),
+ bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart)))
+ bch2_trans_begin(trans);
+
+ return k;
+}
+
+#define for_each_btree_key_old(_trans, _iter, _btree_id, \
_start, _flags, _k, _ret) \
for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \
(_start), (_flags)); \
- (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
+ (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
@@ -863,24 +805,25 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
-#define for_each_btree_key_continue(_trans, _iter, _flags, _k, _ret) \
- for (; \
- (_k) = __bch2_btree_iter_peek_and_restart((_trans), &(_iter), _flags),\
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
-
-#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
- for (; \
- (_k) = bch2_btree_iter_peek_type(&(_iter), _flags), \
- !((_ret) = bkey_err(_k)) && (_k).k; \
- bch2_btree_iter_advance(&(_iter)))
-
#define for_each_btree_key_upto_continue_norestart(_iter, _end, _flags, _k, _ret)\
for (; \
(_k) = bch2_btree_iter_peek_upto_type(&(_iter), _end, _flags), \
!((_ret) = bkey_err(_k)) && (_k).k; \
bch2_btree_iter_advance(&(_iter)))
+#define for_each_btree_key_norestart(_trans, _iter, _btree_id, \
+ _start, _flags, _k, _ret) \
+ for_each_btree_key_upto_norestart(_trans, _iter, _btree_id, _start,\
+ SPOS_MAX, _flags, _k, _ret)
+
+#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \
+ for_each_btree_key_upto_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret)
+
+/*
+ * This should not be used in a fastpath, without first trying _do in
+ * nonblocking mode - it will cause excessive transaction restarts and
+ * potentially livelocking:
+ */
#define drop_locks_do(_trans, _do) \
({ \
bch2_trans_unlock(_trans); \
@@ -912,10 +855,7 @@ __bch2_btree_iter_peek_upto_and_restart(struct btree_trans *trans,
_p; \
})
-/* new multiple iterator interface: */
-
void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *);
-void bch2_btree_path_to_text(struct printbuf *, struct btree_path *);
void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *);
void bch2_dump_trans_updates(struct btree_trans *);
void bch2_dump_trans_paths_updates(struct btree_trans *);
diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c
index ec52f50d249d..719a94a84950 100644
--- a/fs/bcachefs/btree_journal_iter.c
+++ b/fs/bcachefs/btree_journal_iter.c
@@ -73,6 +73,7 @@ static size_t bch2_journal_key_search(struct journal_keys *keys,
return idx_to_pos(keys, __bch2_journal_key_search(keys, id, level, pos));
}
+/* Returns first non-overwritten key >= search key: */
struct bkey_i *bch2_journal_keys_peek_upto(struct bch_fs *c, enum btree_id btree_id,
unsigned level, struct bpos pos,
struct bpos end_pos, size_t *idx)
@@ -86,12 +87,26 @@ search:
if (!*idx)
*idx = __bch2_journal_key_search(keys, btree_id, level, pos);
+ while (*idx &&
+ __journal_key_cmp(btree_id, level, end_pos, idx_to_key(keys, *idx - 1)) <= 0) {
+ --(*idx);
+ iters++;
+ if (iters == 10) {
+ *idx = 0;
+ goto search;
+ }
+ }
+
while ((k = *idx < keys->nr ? idx_to_key(keys, *idx) : NULL)) {
if (__journal_key_cmp(btree_id, level, end_pos, k) < 0)
return NULL;
- if (__journal_key_cmp(btree_id, level, pos, k) <= 0 &&
- !k->overwritten)
+ if (k->overwritten) {
+ (*idx)++;
+ continue;
+ }
+
+ if (__journal_key_cmp(btree_id, level, pos, k) <= 0)
return k->k;
(*idx)++;
@@ -162,7 +177,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id,
struct journal_keys *keys = &c->journal_keys;
size_t idx = bch2_journal_key_search(keys, id, level, k->k.p);
- BUG_ON(test_bit(BCH_FS_RW, &c->flags));
+ BUG_ON(test_bit(BCH_FS_rw, &c->flags));
if (idx < keys->size &&
journal_key_cmp(&n, &keys->d[idx]) == 0) {
@@ -452,9 +467,7 @@ static void __journal_keys_sort(struct journal_keys *keys)
src = dst = keys->d;
while (src < keys->d + keys->nr) {
while (src + 1 < keys->d + keys->nr &&
- src[0].btree_id == src[1].btree_id &&
- src[0].level == src[1].level &&
- bpos_eq(src[0].k->k.p, src[1].k->k.p))
+ !journal_key_cmp(src, src + 1))
src++;
*dst++ = *src++;
diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c
index 1b7a5668df7c..74e52fd28abe 100644
--- a/fs/bcachefs/btree_key_cache.c
+++ b/fs/bcachefs/btree_key_cache.c
@@ -630,7 +630,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
if (ret)
goto out;
- ck = (void *) c_iter.path->l[0].b;
+ ck = (void *) btree_iter_path(trans, &c_iter)->l[0].b;
if (!ck)
goto out;
@@ -645,22 +645,29 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
if (journal_seq && ck->journal.seq != journal_seq)
goto out;
+ trans->journal_res.seq = ck->journal.seq;
+
/*
- * Since journal reclaim depends on us making progress here, and the
- * allocator/copygc depend on journal reclaim making progress, we need
- * to be using alloc reserves:
+ * If we're at the end of the journal, we really want to free up space
+ * in the journal right away - we don't want to pin that old journal
+ * sequence number with a new btree node write, we want to re-journal
+ * the update
*/
+ if (ck->journal.seq == journal_last_seq(j))
+ commit_flags |= BCH_WATERMARK_reclaim;
+
+ if (ck->journal.seq != journal_last_seq(j) ||
+ j->watermark == BCH_WATERMARK_stripe)
+ commit_flags |= BCH_TRANS_COMMIT_no_journal_res;
+
ret = bch2_btree_iter_traverse(&b_iter) ?:
bch2_trans_update(trans, &b_iter, ck->k,
BTREE_UPDATE_KEY_CACHE_RECLAIM|
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE|
BTREE_TRIGGER_NORUN) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- (ck->journal.seq == journal_last_seq(j)
- ? BCH_WATERMARK_reclaim
- : 0)|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
commit_flags);
bch2_fs_fatal_err_on(ret &&
@@ -673,7 +680,8 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
bch2_journal_pin_drop(j, &ck->journal);
- BUG_ON(!btree_node_locked(c_iter.path, 0));
+ struct btree_path *path = btree_iter_path(trans, &c_iter);
+ BUG_ON(!btree_node_locked(path, 0));
if (!evict) {
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
@@ -682,19 +690,20 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans,
}
} else {
struct btree_path *path2;
+ unsigned i;
evict:
- trans_for_each_path(trans, path2)
- if (path2 != c_iter.path)
+ trans_for_each_path(trans, path2, i)
+ if (path2 != path)
__bch2_btree_path_unlock(trans, path2);
- bch2_btree_node_lock_write_nofail(trans, c_iter.path, &ck->c);
+ bch2_btree_node_lock_write_nofail(trans, path, &ck->c);
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
clear_bit(BKEY_CACHED_DIRTY, &ck->flags);
atomic_long_dec(&c->btree_key_cache.nr_dirty);
}
- mark_btree_node_locked_noreset(c_iter.path, 0, BTREE_NODE_UNLOCKED);
+ mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED);
bkey_cached_evict(&c->btree_key_cache, ck);
bkey_cached_free_fast(&c->btree_key_cache, ck);
}
@@ -732,9 +741,9 @@ int bch2_btree_key_cache_journal_flush(struct journal *j,
}
six_unlock_read(&ck->c.lock);
- ret = commit_do(trans, NULL, NULL, 0,
+ ret = lockrestart_do(trans,
btree_key_cache_flush_pos(trans, key, seq,
- BTREE_INSERT_JOURNAL_RECLAIM, false));
+ BCH_TRANS_COMMIT_journal_reclaim, false));
unlock:
srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
@@ -742,28 +751,12 @@ unlock:
return ret;
}
-/*
- * Flush and evict a key from the key cache:
- */
-int bch2_btree_key_cache_flush(struct btree_trans *trans,
- enum btree_id id, struct bpos pos)
-{
- struct bch_fs *c = trans->c;
- struct bkey_cached_key key = { id, pos };
-
- /* Fastpath - assume it won't be found: */
- if (!bch2_btree_key_cache_find(c, id, pos))
- return 0;
-
- return btree_key_cache_flush_pos(trans, key, 0, 0, true);
-}
-
bool bch2_btree_insert_key_cached(struct btree_trans *trans,
unsigned flags,
struct btree_insert_entry *insert_entry)
{
struct bch_fs *c = trans->c;
- struct bkey_cached *ck = (void *) insert_entry->path->l[0].b;
+ struct bkey_cached *ck = (void *) (trans->paths + insert_entry->path)->l[0].b;
struct bkey_i *insert = insert_entry->k;
bool kick_reclaim = false;
@@ -773,7 +766,7 @@ bool bch2_btree_insert_key_cached(struct btree_trans *trans,
ck->valid = true;
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
- EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+ EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
set_bit(BKEY_CACHED_DIRTY, &ck->flags);
atomic_long_inc(&c->btree_key_cache.nr_dirty);
@@ -1000,7 +993,7 @@ void bch2_fs_btree_key_cache_exit(struct btree_key_cache *bc)
if (atomic_long_read(&bc->nr_dirty) &&
!bch2_journal_error(&c->journal) &&
- test_bit(BCH_FS_WAS_RW, &c->flags))
+ test_bit(BCH_FS_was_rw, &c->flags))
panic("btree key cache shutdown error: nr_dirty nonzero (%li)\n",
atomic_long_read(&bc->nr_dirty));
diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h
index be3acde2caa0..e6b2cd0dd2c1 100644
--- a/fs/bcachefs/btree_key_cache.h
+++ b/fs/bcachefs/btree_key_cache.h
@@ -31,8 +31,6 @@ int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *,
bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned,
struct btree_insert_entry *);
-int bch2_btree_key_cache_flush(struct btree_trans *,
- enum btree_id, struct bpos);
void bch2_btree_key_cache_drop(struct btree_trans *,
struct btree_path *);
diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c
index 3d48834d091f..684397442338 100644
--- a/fs/bcachefs/btree_locking.c
+++ b/fs/bcachefs/btree_locking.c
@@ -32,13 +32,14 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *trans,
{
struct btree_path *path;
struct six_lock_count ret;
+ unsigned i;
memset(&ret, 0, sizeof(ret));
if (IS_ERR_OR_NULL(b))
return ret;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path != skip && &path->l[level].b->c == b) {
int t = btree_node_locked_type(path, level);
@@ -85,8 +86,14 @@ static noinline void print_cycle(struct printbuf *out, struct lock_graph *g)
prt_printf(out, "Found lock cycle (%u entries):", g->nr);
prt_newline(out);
- for (i = g->g; i < g->g + g->nr; i++)
+ for (i = g->g; i < g->g + g->nr; i++) {
+ struct task_struct *task = READ_ONCE(i->trans->locking_wait.task);
+ if (!task)
+ continue;
+
bch2_btree_trans_to_text(out, i->trans);
+ bch2_prt_task_backtrace(out, task, i == g->g ? 5 : 1, GFP_NOWAIT);
+ }
}
static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
@@ -94,9 +101,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g)
struct trans_waiting_for_lock *i;
for (i = g->g; i != g->g + g->nr; i++) {
+ struct task_struct *task = i->trans->locking_wait.task;
if (i != g->g)
prt_str(out, "<- ");
- prt_printf(out, "%u ", i->trans->locking_wait.task->pid);
+ prt_printf(out, "%u ", task ?task->pid : 0);
}
prt_newline(out);
}
@@ -142,10 +150,27 @@ static bool lock_graph_remove_non_waiters(struct lock_graph *g)
return false;
}
+static void trace_would_deadlock(struct lock_graph *g, struct btree_trans *trans)
+{
+ struct bch_fs *c = trans->c;
+
+ count_event(c, trans_restart_would_deadlock);
+
+ if (trace_trans_restart_would_deadlock_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ buf.atomic++;
+ print_cycle(&buf, g);
+
+ trace_trans_restart_would_deadlock(trans, buf.buf);
+ printbuf_exit(&buf);
+ }
+}
+
static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i)
{
if (i == g->g) {
- trace_and_count(i->trans->c, trans_restart_would_deadlock, i->trans, _RET_IP_);
+ trace_would_deadlock(g, i->trans);
return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock);
} else {
i->trans->lock_must_abort = true;
@@ -202,7 +227,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle)
prt_printf(&buf, "backtrace:");
prt_newline(&buf);
printbuf_indent_add(&buf, 2);
- bch2_prt_task_backtrace(&buf, trans->locking_wait.task);
+ bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT);
printbuf_indent_sub(&buf, 2);
prt_newline(&buf);
}
@@ -262,27 +287,40 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle)
struct lock_graph g;
struct trans_waiting_for_lock *top;
struct btree_bkey_cached_common *b;
- struct btree_path *path;
- unsigned path_idx;
- int ret;
+ btree_path_idx_t path_idx;
+ int ret = 0;
+
+ g.nr = 0;
if (trans->lock_must_abort) {
if (cycle)
return -1;
- trace_and_count(trans->c, trans_restart_would_deadlock, trans, _RET_IP_);
+ trace_would_deadlock(&g, trans);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_would_deadlock);
}
- g.nr = 0;
lock_graph_down(&g, trans);
+
+ /* trans->paths is rcu protected vs. freeing */
+ rcu_read_lock();
+ if (cycle)
+ cycle->atomic++;
next:
if (!g.nr)
- return 0;
+ goto out;
top = &g.g[g.nr - 1];
- trans_for_each_path_safe_from(top->trans, path, path_idx, top->path_idx) {
+ struct btree_path *paths = rcu_dereference(top->trans->paths);
+ if (!paths)
+ goto up;
+
+ unsigned long *paths_allocated = trans_paths_allocated(paths);
+
+ trans_for_each_path_idx_from(paths_allocated, *trans_paths_nr(paths),
+ path_idx, top->path_idx) {
+ struct btree_path *path = paths + path_idx;
if (!path->nodes_locked)
continue;
@@ -348,18 +386,23 @@ next:
ret = lock_graph_descend(&g, trans, cycle);
if (ret)
- return ret;
+ goto out;
goto next;
}
raw_spin_unlock(&b->lock.wait_lock);
}
}
-
+up:
if (g.nr > 1 && cycle)
print_chain(cycle, &g);
lock_graph_up(&g);
goto next;
+out:
+ if (cycle)
+ --cycle->atomic;
+ rcu_read_unlock();
+ return ret;
}
int bch2_six_check_for_deadlock(struct six_lock *lock, void *p)
@@ -398,7 +441,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
struct btree_bkey_cached_common *b)
{
struct btree_path *linked;
- unsigned i;
+ unsigned i, iter;
int ret;
/*
@@ -412,7 +455,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans,
* already taken are no longer needed:
*/
- trans_for_each_path(trans, linked) {
+ trans_for_each_path(trans, linked, iter) {
if (!linked->nodes_locked)
continue;
@@ -588,8 +631,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans,
}
__flatten
-bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
- struct btree_path *path, unsigned long trace_ip)
+bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path)
{
struct get_locks_fail f;
@@ -599,7 +641,7 @@ bool bch2_btree_path_relock_norestart(struct btree_trans *trans,
int __bch2_btree_path_relock(struct btree_trans *trans,
struct btree_path *path, unsigned long trace_ip)
{
- if (!bch2_btree_path_relock_norestart(trans, path, trace_ip)) {
+ if (!bch2_btree_path_relock_norestart(trans, path)) {
trace_and_count(trans->c, trans_restart_relock_path, trans, trace_ip, path);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path);
}
@@ -624,8 +666,6 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
unsigned new_locks_want,
struct get_locks_fail *f)
{
- struct btree_path *linked;
-
if (bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f))
return true;
@@ -648,8 +688,11 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
* before interior nodes - now that's handled by
* bch2_btree_path_traverse_all().
*/
- if (!path->cached && !trans->in_traverse_all)
- trans_for_each_path(trans, linked)
+ if (!path->cached && !trans->in_traverse_all) {
+ struct btree_path *linked;
+ unsigned i;
+
+ trans_for_each_path(trans, linked, i)
if (linked != path &&
linked->cached == path->cached &&
linked->btree_id == path->btree_id &&
@@ -657,6 +700,7 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans,
linked->locks_want = new_locks_want;
btree_path_get_locks(trans, linked, true, NULL);
}
+ }
return false;
}
@@ -665,7 +709,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
struct btree_path *path,
unsigned new_locks_want)
{
- unsigned l;
+ unsigned l, old_locks_want = path->locks_want;
if (trans->restarted)
return;
@@ -689,8 +733,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
bch2_btree_path_verify_locks(path);
- path->downgrade_seq++;
- trace_path_downgrade(trans, _RET_IP_, path);
+ trace_path_downgrade(trans, _RET_IP_, path, old_locks_want);
}
/* Btree transaction locking: */
@@ -698,40 +741,70 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans,
void bch2_trans_downgrade(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
if (trans->restarted)
return;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
bch2_btree_path_downgrade(trans, path);
}
int bch2_trans_relock(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
if (unlikely(trans->restarted))
return -((int) trans->restarted);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i) {
+ struct get_locks_fail f;
+
if (path->should_be_locked &&
- !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
- trace_and_count(trans->c, trans_restart_relock, trans, _RET_IP_, path);
+ !btree_path_get_locks(trans, path, false, &f)) {
+ if (trace_trans_restart_relock_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bpos_to_text(&buf, path->pos);
+ prt_printf(&buf, " l=%u seq=%u node seq=",
+ f.l, path->l[f.l].lock_seq);
+ if (IS_ERR_OR_NULL(f.b)) {
+ prt_str(&buf, bch2_err_str(PTR_ERR(f.b)));
+ } else {
+ prt_printf(&buf, "%u", f.b->c.lock.seq);
+
+ struct six_lock_count c =
+ bch2_btree_node_lock_counts(trans, NULL, &f.b->c, f.l);
+ prt_printf(&buf, " self locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+
+ c = six_lock_counts(&f.b->c.lock);
+ prt_printf(&buf, " total locked %u.%u.%u", c.n[0], c.n[1], c.n[2]);
+ }
+
+ trace_trans_restart_relock(trans, _RET_IP_, buf.buf);
+ printbuf_exit(&buf);
+ }
+
+ count_event(trans->c, trans_restart_relock);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
+ }
+
return 0;
}
int bch2_trans_relock_notrace(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
if (unlikely(trans->restarted))
return -((int) trans->restarted);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->should_be_locked &&
- !bch2_btree_path_relock_norestart(trans, path, _RET_IP_)) {
+ !bch2_btree_path_relock_norestart(trans, path)) {
return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock);
}
return 0;
@@ -740,16 +813,18 @@ int bch2_trans_relock_notrace(struct btree_trans *trans)
void bch2_trans_unlock_noassert(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
__bch2_btree_path_unlock(trans, path);
}
void bch2_trans_unlock(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
__bch2_btree_path_unlock(trans, path);
}
@@ -762,8 +837,9 @@ void bch2_trans_unlock_long(struct btree_trans *trans)
bool bch2_trans_locked(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->nodes_locked)
return true;
return false;
@@ -809,8 +885,9 @@ void bch2_btree_path_verify_locks(struct btree_path *path)
void bch2_trans_verify_locks(struct btree_trans *trans)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
bch2_btree_path_verify_locks(path);
}
diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h
index 11b0a2c8cd69..4bd72c855da1 100644
--- a/fs/bcachefs/btree_locking.h
+++ b/fs/bcachefs/btree_locking.h
@@ -122,12 +122,9 @@ static void btree_trans_lock_hold_time_update(struct btree_trans *trans,
struct btree_path *path, unsigned level)
{
#ifdef CONFIG_BCACHEFS_LOCK_TIME_STATS
- struct btree_transaction_stats *s = btree_trans_stats(trans);
-
- if (s)
- __bch2_time_stats_update(&s->lock_hold_times,
- path->l[level].lock_taken_time,
- local_clock());
+ __bch2_time_stats_update(&btree_trans_stats(trans)->lock_hold_times,
+ path->l[level].lock_taken_time,
+ local_clock());
#endif
}
@@ -175,6 +172,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
struct btree *b)
{
struct btree_path *linked;
+ unsigned i;
EBUG_ON(path->l[b->c.level].b != b);
EBUG_ON(path->l[b->c.level].lock_seq != six_lock_seq(&b->c.lock));
@@ -182,7 +180,7 @@ bch2_btree_node_unlock_write_inlined(struct btree_trans *trans, struct btree_pat
mark_btree_node_locked_noreset(path, b->c.level, BTREE_NODE_INTENT_LOCKED);
- trans_for_each_path_with_node(trans, b, linked)
+ trans_for_each_path_with_node(trans, b, linked, i)
linked->l[b->c.level].lock_seq++;
six_unlock_write(&b->c.lock);
@@ -242,8 +240,9 @@ static inline bool btree_node_lock_increment(struct btree_trans *trans,
enum btree_node_locked_type want)
{
struct btree_path *path;
+ unsigned i;
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (&path->l[level].b->c == b &&
btree_node_locked_type(path, level) >= want) {
six_lock_increment(&b->lock, (enum six_lock_type) want);
@@ -263,7 +262,6 @@ static inline int btree_node_lock(struct btree_trans *trans,
int ret = 0;
EBUG_ON(level >= BTREE_MAX_DEPTH);
- EBUG_ON(!(trans->paths_allocated & (1ULL << path->idx)));
if (likely(six_trylock_type(&b->lock, type)) ||
btree_node_lock_increment(trans, b, level, (enum btree_node_locked_type) type) ||
@@ -314,8 +312,7 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *,
/* relock: */
-bool bch2_btree_path_relock_norestart(struct btree_trans *,
- struct btree_path *, unsigned long);
+bool bch2_btree_path_relock_norestart(struct btree_trans *, struct btree_path *);
int __bch2_btree_path_relock(struct btree_trans *,
struct btree_path *, unsigned long);
@@ -355,12 +352,6 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans,
/* upgrade */
-
-struct get_locks_fail {
- unsigned l;
- struct btree *b;
-};
-
bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *,
struct btree_path *, unsigned,
struct get_locks_fail *);
diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c
index 12907beda98c..30d69a6d133e 100644
--- a/fs/bcachefs/btree_trans_commit.c
+++ b/fs/bcachefs/btree_trans_commit.c
@@ -12,6 +12,7 @@
#include "errcode.h"
#include "error.h"
#include "journal.h"
+#include "journal_io.h"
#include "journal_reclaim.h"
#include "replicas.h"
#include "snapshot.h"
@@ -23,7 +24,7 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert
#ifdef CONFIG_BCACHEFS_DEBUG
struct bch_fs *c = trans->c;
struct bkey u;
- struct bkey_s_c k = bch2_btree_path_peek_slot_exact(i->path, &u);
+ struct bkey_s_c k = bch2_btree_path_peek_slot_exact(trans->paths + i->path, &u);
if (unlikely(trans->journal_replay_not_finished)) {
struct bkey_i *j_k =
@@ -41,23 +42,23 @@ static void verify_update_old_key(struct btree_trans *trans, struct btree_insert
#endif
}
-static inline struct btree_path_level *insert_l(struct btree_insert_entry *i)
+static inline struct btree_path_level *insert_l(struct btree_trans *trans, struct btree_insert_entry *i)
{
- return i->path->l + i->level;
+ return (trans->paths + i->path)->l + i->level;
}
static inline bool same_leaf_as_prev(struct btree_trans *trans,
struct btree_insert_entry *i)
{
return i != trans->updates &&
- insert_l(&i[0])->b == insert_l(&i[-1])->b;
+ insert_l(trans, &i[0])->b == insert_l(trans, &i[-1])->b;
}
static inline bool same_leaf_as_next(struct btree_trans *trans,
struct btree_insert_entry *i)
{
return i + 1 < trans->updates + trans->nr_updates &&
- insert_l(&i[0])->b == insert_l(&i[1])->b;
+ insert_l(trans, &i[0])->b == insert_l(trans, &i[1])->b;
}
inline void bch2_btree_node_prep_for_write(struct btree_trans *trans,
@@ -84,7 +85,7 @@ static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btre
if (same_leaf_as_prev(trans, i))
continue;
- bch2_btree_node_unlock_write(trans, i->path, insert_l(i)->b);
+ bch2_btree_node_unlock_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
}
trace_and_count(trans->c, trans_restart_would_deadlock_write, trans);
@@ -93,19 +94,17 @@ static noinline int trans_lock_write_fail(struct btree_trans *trans, struct btre
static inline int bch2_trans_lock_write(struct btree_trans *trans)
{
- struct btree_insert_entry *i;
-
EBUG_ON(trans->write_locked);
trans_for_each_update(trans, i) {
if (same_leaf_as_prev(trans, i))
continue;
- if (bch2_btree_node_lock_write(trans, i->path, &insert_l(i)->b->c))
+ if (bch2_btree_node_lock_write(trans, trans->paths + i->path, &insert_l(trans, i)->b->c))
return trans_lock_write_fail(trans, i);
if (!i->cached)
- bch2_btree_node_prep_for_write(trans, i->path, insert_l(i)->b);
+ bch2_btree_node_prep_for_write(trans, trans->paths + i->path, insert_l(trans, i)->b);
}
trans->write_locked = true;
@@ -115,12 +114,10 @@ static inline int bch2_trans_lock_write(struct btree_trans *trans)
static inline void bch2_trans_unlock_write(struct btree_trans *trans)
{
if (likely(trans->write_locked)) {
- struct btree_insert_entry *i;
-
trans_for_each_update(trans, i)
if (!same_leaf_as_prev(trans, i))
- bch2_btree_node_unlock_write_inlined(trans, i->path,
- insert_l(i)->b);
+ bch2_btree_node_unlock_write_inlined(trans,
+ trans->paths + i->path, insert_l(trans, i)->b);
trans->write_locked = false;
}
}
@@ -142,8 +139,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
EBUG_ON(bkey_deleted(&insert->k) && bkey_val_u64s(&insert->k));
EBUG_ON(bpos_lt(insert->k.p, b->data->min_key));
EBUG_ON(bpos_gt(insert->k.p, b->data->max_key));
- EBUG_ON(insert->k.u64s >
- bch_btree_keys_u64s_remaining(trans->c, b));
+ EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b));
EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos));
k = bch2_btree_node_iter_peek_all(node_iter, b);
@@ -163,7 +159,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans,
k->type = KEY_TYPE_deleted;
if (k->needs_whiteout)
- push_whiteout(trans->c, b, insert->k.p);
+ push_whiteout(b, insert->k.p);
k->needs_whiteout = false;
if (k >= btree_bset_last(b)->start) {
@@ -287,7 +283,7 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
bch2_btree_add_journal_pin(c, b, journal_seq);
if (unlikely(!btree_node_dirty(b))) {
- EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+ EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
set_btree_node_dirty_acct(c, b);
}
@@ -311,10 +307,12 @@ inline void bch2_btree_insert_key_leaf(struct btree_trans *trans,
static inline void btree_insert_entry_checks(struct btree_trans *trans,
struct btree_insert_entry *i)
{
- BUG_ON(!bpos_eq(i->k->k.p, i->path->pos));
- BUG_ON(i->cached != i->path->cached);
- BUG_ON(i->level != i->path->level);
- BUG_ON(i->btree_id != i->path->btree_id);
+ struct btree_path *path = trans->paths + i->path;
+
+ BUG_ON(!bpos_eq(i->k->k.p, path->pos));
+ BUG_ON(i->cached != path->cached);
+ BUG_ON(i->level != path->level);
+ BUG_ON(i->btree_id != path->btree_id);
EBUG_ON(!i->level &&
btree_type_has_snapshots(i->btree_id) &&
!(i->flags & BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) &&
@@ -349,9 +347,7 @@ static noinline void journal_transaction_name(struct btree_trans *trans)
static inline int btree_key_can_insert(struct btree_trans *trans,
struct btree *b, unsigned u64s)
{
- struct bch_fs *c = trans->c;
-
- if (!bch2_btree_node_insert_fits(c, b, u64s))
+ if (!bch2_btree_node_insert_fits(b, u64s))
return -BCH_ERR_btree_insert_btree_node_full;
return 0;
@@ -361,8 +357,6 @@ noinline static int
btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
struct btree_path *path, unsigned new_u64s)
{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
struct bkey_cached *ck = (void *) path->l[0].b;
struct bkey_i *new_k;
int ret;
@@ -372,7 +366,7 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags,
new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL);
if (!new_k) {
- bch_err(c, "error allocating memory for key cache key, btree %s u64s %u",
+ bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u",
bch2_btree_id_str(path->btree_id), new_u64s);
return -BCH_ERR_ENOMEM_btree_key_cache_insert;
}
@@ -401,7 +395,6 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
{
struct bch_fs *c = trans->c;
struct bkey_cached *ck = (void *) path->l[0].b;
- struct btree_insert_entry *i;
unsigned new_u64s;
struct bkey_i *new_k;
@@ -409,7 +402,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) &&
bch2_btree_key_cache_must_wait(c) &&
- !(flags & BTREE_INSERT_JOURNAL_RECLAIM))
+ !(flags & BCH_TRANS_COMMIT_journal_reclaim))
return -BCH_ERR_btree_insert_need_journal_reclaim;
/*
@@ -422,7 +415,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags
return 0;
new_u64s = roundup_pow_of_two(u64s);
- new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT);
+ new_k = krealloc(ck->k, new_u64s * sizeof(u64), GFP_NOWAIT|__GFP_NOWARN);
if (unlikely(!new_k))
return btree_key_can_insert_cached_slowpath(trans, flags, path, new_u64s);
@@ -452,25 +445,15 @@ static int run_one_mem_trigger(struct btree_trans *trans,
if (unlikely(flags & BTREE_TRIGGER_NORUN))
return 0;
- if (!btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)))
- return 0;
-
- if (old_ops->atomic_trigger == new_ops->atomic_trigger) {
- ret = bch2_mark_key(trans, i->btree_id, i->level,
- old, bkey_i_to_s_c(new),
+ if (old_ops->trigger == new_ops->trigger) {
+ ret = bch2_key_trigger(trans, i->btree_id, i->level,
+ old, bkey_i_to_s(new),
BTREE_TRIGGER_INSERT|BTREE_TRIGGER_OVERWRITE|flags);
} else {
- struct bkey _deleted = KEY(0, 0, 0);
- struct bkey_s_c deleted = (struct bkey_s_c) { &_deleted, NULL };
-
- _deleted.p = i->path->pos;
-
- ret = bch2_mark_key(trans, i->btree_id, i->level,
- deleted, bkey_i_to_s_c(new),
- BTREE_TRIGGER_INSERT|flags) ?:
- bch2_mark_key(trans, i->btree_id, i->level,
- old, deleted,
- BTREE_TRIGGER_OVERWRITE|flags);
+ ret = bch2_key_trigger_new(trans, i->btree_id, i->level,
+ bkey_i_to_s(new), flags) ?:
+ bch2_key_trigger_old(trans, i->btree_id, i->level,
+ old, flags);
}
return ret;
@@ -488,6 +471,7 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
struct bkey_s_c old = { &old_k, i->old_v };
const struct bkey_ops *old_ops = bch2_bkey_type_ops(old.k->type);
const struct bkey_ops *new_ops = bch2_bkey_type_ops(i->k->k.type);
+ unsigned flags = i->flags|BTREE_TRIGGER_TRANSACTIONAL;
verify_update_old_key(trans, i);
@@ -497,19 +481,18 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_
if (!i->insert_trigger_run &&
!i->overwrite_trigger_run &&
- old_ops->trans_trigger == new_ops->trans_trigger) {
+ old_ops->trigger == new_ops->trigger) {
i->overwrite_trigger_run = true;
i->insert_trigger_run = true;
- return bch2_trans_mark_key(trans, i->btree_id, i->level, old, i->k,
- BTREE_TRIGGER_INSERT|
- BTREE_TRIGGER_OVERWRITE|
- i->flags) ?: 1;
+ return bch2_key_trigger(trans, i->btree_id, i->level, old, bkey_i_to_s(i->k),
+ BTREE_TRIGGER_INSERT|
+ BTREE_TRIGGER_OVERWRITE|flags) ?: 1;
} else if (overwrite && !i->overwrite_trigger_run) {
i->overwrite_trigger_run = true;
- return bch2_trans_mark_old(trans, i->btree_id, i->level, old, i->flags) ?: 1;
+ return bch2_key_trigger_old(trans, i->btree_id, i->level, old, flags) ?: 1;
} else if (!overwrite && !i->insert_trigger_run) {
i->insert_trigger_run = true;
- return bch2_trans_mark_new(trans, i->btree_id, i->level, i->k, i->flags) ?: 1;
+ return bch2_key_trigger_new(trans, i->btree_id, i->level, bkey_i_to_s(i->k), flags) ?: 1;
} else {
return 0;
}
@@ -551,7 +534,7 @@ static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id,
static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
{
- struct btree_insert_entry *i = NULL, *btree_id_start = trans->updates;
+ struct btree_insert_entry *btree_id_start = trans->updates;
unsigned btree_id = 0;
int ret = 0;
@@ -597,10 +580,6 @@ static int bch2_trans_commit_run_triggers(struct btree_trans *trans)
static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
{
- struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- int ret = 0;
-
trans_for_each_update(trans, i) {
/*
* XXX: synchronization of cached update triggers with gc
@@ -608,14 +587,15 @@ static noinline int bch2_trans_commit_run_gc_triggers(struct btree_trans *trans)
*/
BUG_ON(i->cached || i->level);
- if (gc_visited(c, gc_pos_btree_node(insert_l(i)->b))) {
- ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
+ if (btree_node_type_needs_gc(__btree_node_type(i->level, i->btree_id)) &&
+ gc_visited(trans->c, gc_pos_btree_node(insert_l(trans, i)->b))) {
+ int ret = run_one_mem_trigger(trans, i, i->flags|BTREE_TRIGGER_GC);
if (ret)
- break;
+ return ret;
}
}
- return ret;
+ return 0;
}
static inline int
@@ -624,8 +604,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
- struct btree_write_buffered_key *wb;
struct btree_trans_commit_hook *h;
unsigned u64s = 0;
int ret;
@@ -650,23 +628,21 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
u64s += i->k->k.u64s;
ret = !i->cached
- ? btree_key_can_insert(trans, insert_l(i)->b, u64s)
- : btree_key_can_insert_cached(trans, flags, i->path, u64s);
+ ? btree_key_can_insert(trans, insert_l(trans, i)->b, u64s)
+ : btree_key_can_insert_cached(trans, flags, trans->paths + i->path, u64s);
if (ret) {
*stopped_at = i;
return ret;
}
- }
- if (trans->nr_wb_updates &&
- trans->nr_wb_updates + c->btree_write_buffer.state.nr > c->btree_write_buffer.size)
- return -BCH_ERR_btree_insert_need_flush_buffer;
+ i->k->k.needs_whiteout = false;
+ }
/*
* Don't get journal reservation until after we know insert will
* succeed:
*/
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
ret = bch2_trans_journal_res_get(trans,
(flags & BCH_WATERMARK_MASK)|
JOURNAL_RES_GET_NONBLOCK);
@@ -675,8 +651,6 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
if (unlikely(trans->journal_transaction_names))
journal_transaction_name(trans);
- } else {
- trans->journal_res.seq = c->journal.replay_journal_seq;
}
/*
@@ -685,7 +659,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
*/
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- !(flags & BTREE_INSERT_JOURNAL_REPLAY)) {
+ !(flags & BCH_TRANS_COMMIT_no_journal_res)) {
if (bch2_journal_seq_verify)
trans_for_each_update(trans, i)
i->k->k.version.lo = trans->journal_res.seq;
@@ -698,13 +672,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bch2_trans_fs_usage_apply(trans, trans->fs_usage_deltas))
return -BCH_ERR_btree_insert_need_mark_replicas;
- if (trans->nr_wb_updates) {
- EBUG_ON(flags & BTREE_INSERT_JOURNAL_REPLAY);
-
- ret = bch2_btree_insert_keys_write_buffer(trans);
- if (ret)
- goto revert_fs_usage;
- }
+ /* XXX: we only want to run this if deltas are nonzero */
+ bch2_trans_account_disk_usage_change(trans);
h = trans->hooks;
while (h) {
@@ -715,8 +684,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
}
trans_for_each_update(trans, i)
- if (BTREE_NODE_TYPE_HAS_MEM_TRIGGERS & (1U << i->bkey_type)) {
- ret = run_one_mem_trigger(trans, i, i->flags);
+ if (BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS & (1U << i->bkey_type)) {
+ ret = run_one_mem_trigger(trans, i, BTREE_TRIGGER_ATOMIC|i->flags);
if (ret)
goto fatal_err;
}
@@ -727,16 +696,7 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
goto fatal_err;
}
- if (unlikely(trans->extra_journal_entries.nr)) {
- memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
- trans->extra_journal_entries.data,
- trans->extra_journal_entries.nr);
-
- trans->journal_res.offset += trans->extra_journal_entries.nr;
- trans->journal_res.u64s -= trans->extra_journal_entries.nr;
- }
-
- if (likely(!(flags & BTREE_INSERT_JOURNAL_REPLAY))) {
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res))) {
struct journal *j = &c->journal;
struct jset_entry *entry;
@@ -765,33 +725,27 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags,
bkey_copy((struct bkey_i *) entry->start, i->k);
}
- trans_for_each_wb_update(trans, wb) {
- entry = bch2_journal_add_entry(j, &trans->journal_res,
- BCH_JSET_ENTRY_btree_keys,
- wb->btree, 0,
- wb->k.k.u64s);
- bkey_copy((struct bkey_i *) entry->start, &wb->k);
- }
+ memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res),
+ trans->journal_entries,
+ trans->journal_entries_u64s);
+
+ trans->journal_res.offset += trans->journal_entries_u64s;
+ trans->journal_res.u64s -= trans->journal_entries_u64s;
if (trans->journal_seq)
*trans->journal_seq = trans->journal_res.seq;
}
trans_for_each_update(trans, i) {
- i->k->k.needs_whiteout = false;
+ struct btree_path *path = trans->paths + i->path;
if (!i->cached) {
- u64 seq = trans->journal_res.seq;
-
- if (i->flags & BTREE_UPDATE_PREJOURNAL)
- seq = i->seq;
-
- bch2_btree_insert_key_leaf(trans, i->path, i->k, seq);
+ bch2_btree_insert_key_leaf(trans, path, i->k, trans->journal_res.seq);
} else if (!i->key_cache_already_flushed)
bch2_btree_insert_key_cached(trans, flags, i);
else {
- bch2_btree_key_cache_drop(trans, i->path);
- btree_path_set_dirty(i->path, BTREE_ITER_NEED_TRAVERSE);
+ bch2_btree_key_cache_drop(trans, path);
+ btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE);
}
}
@@ -806,14 +760,8 @@ revert_fs_usage:
static noinline void bch2_drop_overwrites_from_journal(struct btree_trans *trans)
{
- struct btree_insert_entry *i;
- struct btree_write_buffered_key *wb;
-
trans_for_each_update(trans, i)
bch2_journal_key_overwritten(trans->c, i->btree_id, i->level, i->k->k.p);
-
- trans_for_each_wb_update(trans, wb)
- bch2_journal_key_overwritten(trans->c, wb->btree, 0, wb->k.k.p);
}
static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
@@ -841,6 +789,33 @@ static noinline int bch2_trans_commit_bkey_invalid(struct btree_trans *trans,
return -EINVAL;
}
+static noinline int bch2_trans_commit_journal_entry_invalid(struct btree_trans *trans,
+ struct jset_entry *i)
+{
+ struct bch_fs *c = trans->c;
+ struct printbuf buf = PRINTBUF;
+
+ prt_printf(&buf, "invalid bkey on insert from %s", trans->fn);
+ prt_newline(&buf);
+ printbuf_indent_add(&buf, 2);
+
+ bch2_journal_entry_to_text(&buf, c, i);
+ prt_newline(&buf);
+
+ bch2_print_string_as_lines(KERN_ERR, buf.buf);
+
+ bch2_inconsistent_error(c);
+ bch2_dump_trans_updates(trans);
+
+ return -EINVAL;
+}
+
+static int bch2_trans_commit_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
/*
* Get journal reservation, take write locks, and attempt to do btree update(s):
*/
@@ -849,7 +824,6 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
unsigned long trace_ip)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
int ret = 0, u64s_delta = 0;
trans_for_each_update(trans, i) {
@@ -884,13 +858,15 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans, unsigned flags
if (!ret && trans->journal_pin)
bch2_journal_pin_add(&c->journal, trans->journal_res.seq,
- trans->journal_pin, NULL);
+ trans->journal_pin,
+ bch2_trans_commit_journal_pin_flush);
/*
* Drop journal reservation after dropping write locks, since dropping
* the journal reservation may kick off a journal write:
*/
- bch2_journal_res_put(&c->journal, &trans->journal_res);
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+ bch2_journal_res_put(&c->journal, &trans->journal_res);
return ret;
}
@@ -916,7 +892,8 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
case -BCH_ERR_btree_insert_btree_node_full:
ret = bch2_btree_split_leaf(trans, i->path, flags);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- trace_and_count(c, trans_restart_btree_node_split, trans, trace_ip, i->path);
+ trace_and_count(c, trans_restart_btree_node_split, trans,
+ trace_ip, trans->paths + i->path);
break;
case -BCH_ERR_btree_insert_need_mark_replicas:
ret = drop_locks_do(trans,
@@ -927,7 +904,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
* XXX: this should probably be a separate BTREE_INSERT_NONBLOCK
* flag
*/
- if ((flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) &&
(flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
break;
@@ -950,30 +927,6 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
ret = bch2_trans_relock(trans);
break;
- case -BCH_ERR_btree_insert_need_flush_buffer: {
- struct btree_write_buffer *wb = &c->btree_write_buffer;
-
- ret = 0;
-
- if (wb->state.nr > wb->size * 3 / 4) {
- bch2_trans_unlock(trans);
- mutex_lock(&wb->flush_lock);
-
- if (wb->state.nr > wb->size * 3 / 4) {
- bch2_trans_begin(trans);
- ret = __bch2_btree_write_buffer_flush(trans,
- flags|BTREE_INSERT_NOCHECK_RW, true);
- if (!ret) {
- trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
- }
- } else {
- mutex_unlock(&wb->flush_lock);
- ret = bch2_trans_relock(trans);
- }
- }
- break;
- }
default:
BUG_ON(ret >= 0);
break;
@@ -982,8 +935,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags,
BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted);
bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) &&
- !(flags & BTREE_INSERT_NOWAIT) &&
- (flags & BTREE_INSERT_NOFAIL), c,
+ (flags & BCH_TRANS_COMMIT_no_enospc), c,
"%s: incorrectly got %s\n", __func__, bch2_err_str(ret));
return ret;
@@ -995,8 +947,8 @@ bch2_trans_commit_get_rw_cold(struct btree_trans *trans, unsigned flags)
struct bch_fs *c = trans->c;
int ret;
- if (likely(!(flags & BTREE_INSERT_LAZY_RW)) ||
- test_bit(BCH_FS_STARTED, &c->flags))
+ if (likely(!(flags & BCH_TRANS_COMMIT_lazy_rw)) ||
+ test_bit(BCH_FS_started, &c->flags))
return -BCH_ERR_erofs_trans_commit;
ret = drop_locks_do(trans, bch2_fs_read_write_early(c));
@@ -1016,7 +968,6 @@ static noinline int
do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i;
int ret = 0;
trans_for_each_update(trans, i) {
@@ -1030,18 +981,15 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans)
int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
{
+ struct btree_insert_entry *errored_at = NULL;
struct bch_fs *c = trans->c;
- struct btree_insert_entry *i = NULL;
- struct btree_write_buffered_key *wb;
int ret = 0;
if (!trans->nr_updates &&
- !trans->nr_wb_updates &&
- !trans->extra_journal_entries.nr)
+ !trans->journal_entries_u64s)
goto out_reset;
- if (flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&c->gc_lock);
+ memset(&trans->fs_usage_delta, 0, sizeof(trans->fs_usage_delta));
ret = bch2_trans_commit_run_triggers(trans);
if (ret)
@@ -1051,7 +999,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
struct printbuf buf = PRINTBUF;
enum bkey_invalid_flags invalid_flags = 0;
- if (!(flags & BTREE_INSERT_JOURNAL_REPLAY))
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
if (unlikely(bch2_bkey_invalid(c, bkey_i_to_s_c(i->k),
@@ -1064,47 +1012,52 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
return ret;
}
- if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ for (struct jset_entry *i = trans->journal_entries;
+ i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+ i = vstruct_next(i)) {
+ enum bkey_invalid_flags invalid_flags = 0;
+
+ if (!(flags & BCH_TRANS_COMMIT_no_journal_res))
+ invalid_flags |= BKEY_INVALID_WRITE|BKEY_INVALID_COMMIT;
+
+ if (unlikely(bch2_journal_entry_validate(c, NULL, i,
+ bcachefs_metadata_version_current,
+ CPU_BIG_ENDIAN, invalid_flags)))
+ ret = bch2_trans_commit_journal_entry_invalid(trans, i);
+
+ if (ret)
+ return ret;
+ }
+
+ if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
ret = do_bch2_trans_commit_to_journal_replay(trans);
goto out_reset;
}
- if (!(flags & BTREE_INSERT_NOCHECK_RW) &&
+ if (!(flags & BCH_TRANS_COMMIT_no_check_rw) &&
unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) {
ret = bch2_trans_commit_get_rw_cold(trans, flags);
if (ret)
goto out_reset;
}
- if (c->btree_write_buffer.state.nr > c->btree_write_buffer.size / 2 &&
- mutex_trylock(&c->btree_write_buffer.flush_lock)) {
- bch2_trans_begin(trans);
- bch2_trans_unlock(trans);
-
- ret = __bch2_btree_write_buffer_flush(trans,
- flags|BTREE_INSERT_NOCHECK_RW, true);
- if (!ret) {
- trace_and_count(c, trans_restart_write_buffer_flush, trans, _THIS_IP_);
- ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_write_buffer_flush);
- }
- goto out;
- }
-
- EBUG_ON(test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags));
+ EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags));
- trans->journal_u64s = trans->extra_journal_entries.nr;
+ trans->journal_u64s = trans->journal_entries_u64s;
trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names);
if (trans->journal_transaction_names)
trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s);
trans_for_each_update(trans, i) {
- EBUG_ON(!i->path->should_be_locked);
+ struct btree_path *path = trans->paths + i->path;
+
+ EBUG_ON(!path->should_be_locked);
- ret = bch2_btree_path_upgrade(trans, i->path, i->level + 1);
+ ret = bch2_btree_path_upgrade(trans, path, i->level + 1);
if (unlikely(ret))
goto out;
- EBUG_ON(!btree_node_intent_locked(i->path, i->level));
+ EBUG_ON(!btree_node_intent_locked(path, i->level));
if (i->key_cache_already_flushed)
continue;
@@ -1120,22 +1073,21 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags)
trans->journal_u64s += jset_u64s(i->old_k.u64s);
}
- trans_for_each_wb_update(trans, wb)
- trans->journal_u64s += jset_u64s(wb->k.k.u64s);
-
- if (trans->extra_journal_res) {
+ if (trans->extra_disk_res) {
ret = bch2_disk_reservation_add(c, trans->disk_res,
- trans->extra_journal_res,
- (flags & BTREE_INSERT_NOFAIL)
+ trans->extra_disk_res,
+ (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0);
if (ret)
goto err;
}
retry:
+ errored_at = NULL;
bch2_trans_verify_not_in_restart(trans);
- memset(&trans->journal_res, 0, sizeof(trans->journal_res));
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_journal_res)))
+ memset(&trans->journal_res, 0, sizeof(trans->journal_res));
- ret = do_bch2_trans_commit(trans, flags, &i, _RET_IP_);
+ ret = do_bch2_trans_commit(trans, flags, &errored_at, _RET_IP_);
/* make sure we didn't drop or screw up locks: */
bch2_trans_verify_locks(trans);
@@ -1145,7 +1097,7 @@ retry:
trace_and_count(c, transaction_commit, trans, _RET_IP_);
out:
- if (likely(!(flags & BTREE_INSERT_NOCHECK_RW)))
+ if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw)))
bch2_write_ref_put(c, BCH_WRITE_REF_trans);
out_reset:
if (!ret)
@@ -1154,9 +1106,21 @@ out_reset:
return ret;
err:
- ret = bch2_trans_commit_error(trans, flags, i, ret, _RET_IP_);
+ ret = bch2_trans_commit_error(trans, flags, errored_at, ret, _RET_IP_);
if (ret)
goto out;
+ /*
+ * We might have done another transaction commit in the error path -
+ * i.e. btree write buffer flush - which will have made use of
+ * trans->journal_res, but with BCH_TRANS_COMMIT_no_journal_res that is
+ * how the journal sequence number to pin is passed in - so we must
+ * restart:
+ */
+ if (flags & BCH_TRANS_COMMIT_no_journal_res) {
+ ret = -BCH_ERR_transaction_restart_nested;
+ goto out;
+ }
+
goto retry;
}
diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h
index 60453ba86c4b..4a5a64499eb7 100644
--- a/fs/bcachefs/btree_types.h
+++ b/fs/bcachefs/btree_types.h
@@ -185,33 +185,32 @@ struct btree_node_iter {
* Iterate over all possible positions, synthesizing deleted keys for holes:
*/
static const __maybe_unused u16 BTREE_ITER_SLOTS = 1 << 0;
-static const __maybe_unused u16 BTREE_ITER_ALL_LEVELS = 1 << 1;
/*
* Indicates that intent locks should be taken on leaf nodes, because we expect
* to be doing updates:
*/
-static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 2;
+static const __maybe_unused u16 BTREE_ITER_INTENT = 1 << 1;
/*
* Causes the btree iterator code to prefetch additional btree nodes from disk:
*/
-static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 3;
+static const __maybe_unused u16 BTREE_ITER_PREFETCH = 1 << 2;
/*
* Used in bch2_btree_iter_traverse(), to indicate whether we're searching for
* @pos or the first key strictly greater than @pos
*/
-static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 4;
-static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 5;
-static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 6;
-static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 7;
-static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 8;
-static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 9;
-static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
-static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 11;
-static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 12;
-static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 13;
-static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 14;
-static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 15;
-#define __BTREE_ITER_FLAGS_END 16
+static const __maybe_unused u16 BTREE_ITER_IS_EXTENTS = 1 << 3;
+static const __maybe_unused u16 BTREE_ITER_NOT_EXTENTS = 1 << 4;
+static const __maybe_unused u16 BTREE_ITER_CACHED = 1 << 5;
+static const __maybe_unused u16 BTREE_ITER_WITH_KEY_CACHE = 1 << 6;
+static const __maybe_unused u16 BTREE_ITER_WITH_UPDATES = 1 << 7;
+static const __maybe_unused u16 BTREE_ITER_WITH_JOURNAL = 1 << 8;
+static const __maybe_unused u16 __BTREE_ITER_ALL_SNAPSHOTS = 1 << 9;
+static const __maybe_unused u16 BTREE_ITER_ALL_SNAPSHOTS = 1 << 10;
+static const __maybe_unused u16 BTREE_ITER_FILTER_SNAPSHOTS = 1 << 11;
+static const __maybe_unused u16 BTREE_ITER_NOPRESERVE = 1 << 12;
+static const __maybe_unused u16 BTREE_ITER_CACHED_NOFILL = 1 << 13;
+static const __maybe_unused u16 BTREE_ITER_KEY_CACHE_FILL = 1 << 14;
+#define __BTREE_ITER_FLAGS_END 15
enum btree_path_uptodate {
BTREE_ITER_UPTODATE = 0,
@@ -223,13 +222,12 @@ enum btree_path_uptodate {
#define TRACK_PATH_ALLOCATED
#endif
+typedef u16 btree_path_idx_t;
+
struct btree_path {
- u8 idx;
- u8 sorted_idx;
+ btree_path_idx_t sorted_idx;
u8 ref;
u8 intent_ref;
- u32 alloc_seq;
- u32 downgrade_seq;
/* btree_iter_copy starts here: */
struct bpos pos;
@@ -283,13 +281,12 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path)
*/
struct btree_iter {
struct btree_trans *trans;
- struct btree_path *path;
- struct btree_path *update_path;
- struct btree_path *key_cache_path;
+ btree_path_idx_t path;
+ btree_path_idx_t update_path;
+ btree_path_idx_t key_cache_path;
enum btree_id btree_id:8;
- unsigned min_depth:3;
- unsigned advanced:1;
+ u8 min_depth;
/* btree_iter_copy starts here: */
u16 flags;
@@ -306,7 +303,6 @@ struct btree_iter {
/* BTREE_ITER_WITH_JOURNAL: */
size_t journal_idx;
- struct bpos journal_pos;
#ifdef TRACK_PATH_ALLOCATED
unsigned long ip_allocated;
#endif
@@ -354,16 +350,16 @@ struct btree_insert_entry {
* to the size of the key being overwritten in the btree:
*/
u8 old_btree_u64s;
+ btree_path_idx_t path;
struct bkey_i *k;
- struct btree_path *path;
- u64 seq;
/* key being overwritten: */
struct bkey old_k;
const struct bch_val *old_v;
unsigned long ip_allocated;
};
-#define BTREE_ITER_MAX 64
+#define BTREE_ITER_INITIAL 64
+#define BTREE_ITER_MAX (1U << 10)
struct btree_trans_commit_hook;
typedef int (btree_trans_commit_hook_fn)(struct btree_trans *, struct btree_trans_commit_hook *);
@@ -377,25 +373,30 @@ struct btree_trans_commit_hook {
#define BTREE_TRANS_MAX_LOCK_HOLD_TIME_NS 10000
+struct btree_trans_paths {
+ unsigned long nr_paths;
+ struct btree_path paths[];
+};
+
struct btree_trans {
struct bch_fs *c;
- const char *fn;
- struct closure ref;
- struct list_head list;
- u64 last_begin_time;
- u8 lock_may_not_fail;
- u8 lock_must_abort;
- struct btree_bkey_cached_common *locking;
- struct six_lock_waiter locking_wait;
+ unsigned long *paths_allocated;
+ struct btree_path *paths;
+ btree_path_idx_t *sorted;
+ struct btree_insert_entry *updates;
- int srcu_idx;
+ void *mem;
+ unsigned mem_top;
+ unsigned mem_bytes;
+ btree_path_idx_t nr_sorted;
+ btree_path_idx_t nr_paths;
+ btree_path_idx_t nr_paths_max;
u8 fn_idx;
- u8 nr_sorted;
u8 nr_updates;
- u8 nr_wb_updates;
- u8 wb_updates_size;
+ u8 lock_must_abort;
+ bool lock_may_not_fail:1;
bool srcu_held:1;
bool used_mempool:1;
bool in_traverse_all:1;
@@ -407,41 +408,59 @@ struct btree_trans {
bool write_locked:1;
enum bch_errcode restarted:16;
u32 restart_count;
+
+ u64 last_begin_time;
unsigned long last_begin_ip;
unsigned long last_restarted_ip;
unsigned long srcu_lock_time;
- /*
- * For when bch2_trans_update notices we'll be splitting a compressed
- * extent:
- */
- unsigned extra_journal_res;
- unsigned nr_max_paths;
-
- u64 paths_allocated;
-
- unsigned mem_top;
- unsigned mem_max;
- unsigned mem_bytes;
- void *mem;
-
- u8 sorted[BTREE_ITER_MAX + 8];
- struct btree_path paths[BTREE_ITER_MAX];
- struct btree_insert_entry updates[BTREE_ITER_MAX];
- struct btree_write_buffered_key *wb_updates;
+ const char *fn;
+ struct btree_bkey_cached_common *locking;
+ struct six_lock_waiter locking_wait;
+ int srcu_idx;
/* update path: */
+ u16 journal_entries_u64s;
+ u16 journal_entries_size;
+ struct jset_entry *journal_entries;
+
struct btree_trans_commit_hook *hooks;
- darray_u64 extra_journal_entries;
struct journal_entry_pin *journal_pin;
struct journal_res journal_res;
u64 *journal_seq;
struct disk_reservation *disk_res;
+
+ struct bch_fs_usage_base fs_usage_delta;
+
unsigned journal_u64s;
+ unsigned extra_disk_res; /* XXX kill */
struct replicas_delta_list *fs_usage_deltas;
+
+ /* Entries before this are zeroed out on every bch2_trans_get() call */
+
+ struct list_head list;
+ struct closure ref;
+
+ unsigned long _paths_allocated[BITS_TO_LONGS(BTREE_ITER_INITIAL)];
+ struct btree_trans_paths trans_paths;
+ struct btree_path _paths[BTREE_ITER_INITIAL];
+ btree_path_idx_t _sorted[BTREE_ITER_INITIAL + 4];
+ struct btree_insert_entry _updates[BTREE_ITER_INITIAL];
};
+static inline struct btree_path *btree_iter_path(struct btree_trans *trans, struct btree_iter *iter)
+{
+ return trans->paths + iter->path;
+}
+
+static inline struct btree_path *btree_iter_key_cache_path(struct btree_trans *trans, struct btree_iter *iter)
+{
+ return iter->key_cache_path
+ ? trans->paths + iter->key_cache_path
+ : NULL;
+}
+
#define BCH_BTREE_WRITE_TYPES() \
x(initial, 0) \
x(init_next_bset, 1) \
@@ -637,7 +656,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
BIT_ULL(BKEY_TYPE_reflink)| \
BIT_ULL(BKEY_TYPE_btree))
-#define BTREE_NODE_TYPE_HAS_MEM_TRIGGERS \
+#define BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS \
(BIT_ULL(BKEY_TYPE_alloc)| \
BIT_ULL(BKEY_TYPE_inodes)| \
BIT_ULL(BKEY_TYPE_stripes)| \
@@ -645,7 +664,7 @@ const char *bch2_btree_node_type_str(enum btree_node_type);
#define BTREE_NODE_TYPE_HAS_TRIGGERS \
(BTREE_NODE_TYPE_HAS_TRANS_TRIGGERS| \
- BTREE_NODE_TYPE_HAS_MEM_TRIGGERS)
+ BTREE_NODE_TYPE_HAS_ATOMIC_TRIGGERS)
static inline bool btree_node_type_needs_gc(enum btree_node_type type)
{
@@ -722,4 +741,9 @@ enum btree_node_sibling {
btree_next_sib,
};
+struct get_locks_fail {
+ unsigned l;
+ struct btree *b;
+};
+
#endif /* _BCACHEFS_BTREE_TYPES_H */
diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c
index 2fd3c8cc6f51..c3ff365acce9 100644
--- a/fs/bcachefs/btree_update.c
+++ b/fs/bcachefs/btree_update.c
@@ -24,7 +24,7 @@ static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l,
}
static int __must_check
-bch2_trans_update_by_path(struct btree_trans *, struct btree_path *,
+bch2_trans_update_by_path(struct btree_trans *, btree_path_idx_t,
struct bkey_i *, enum btree_update_flags,
unsigned long ip);
@@ -200,7 +200,7 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *trans,
*/
if (nr_splits > 1 &&
(compressed_sectors = bch2_bkey_sectors_compressed(old)))
- trans->extra_journal_res += compressed_sectors * (nr_splits - 1);
+ trans->extra_disk_res += compressed_sectors * (nr_splits - 1);
if (front_split) {
update = bch2_bkey_make_mut_noupdate(trans, old);
@@ -339,21 +339,22 @@ err:
}
static noinline int flush_new_cached_update(struct btree_trans *trans,
- struct btree_path *path,
struct btree_insert_entry *i,
enum btree_update_flags flags,
unsigned long ip)
{
- struct btree_path *btree_path;
struct bkey k;
int ret;
- btree_path = bch2_path_get(trans, path->btree_id, path->pos, 1, 0,
- BTREE_ITER_INTENT, _THIS_IP_);
- ret = bch2_btree_path_traverse(trans, btree_path, 0);
+ btree_path_idx_t path_idx =
+ bch2_path_get(trans, i->btree_id, i->old_k.p, 1, 0,
+ BTREE_ITER_INTENT, _THIS_IP_);
+ ret = bch2_btree_path_traverse(trans, path_idx, 0);
if (ret)
goto out;
+ struct btree_path *btree_path = trans->paths + path_idx;
+
/*
* The old key in the insert entry might actually refer to an existing
* key in the btree that has been deleted from cache and not yet
@@ -368,43 +369,34 @@ static noinline int flush_new_cached_update(struct btree_trans *trans,
i->flags |= BTREE_TRIGGER_NORUN;
btree_path_set_should_be_locked(btree_path);
- ret = bch2_trans_update_by_path(trans, btree_path, i->k, flags, ip);
+ ret = bch2_trans_update_by_path(trans, path_idx, i->k, flags, ip);
out:
- bch2_path_put(trans, btree_path, true);
+ bch2_path_put(trans, path_idx, true);
return ret;
}
static int __must_check
-bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
+bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx,
struct bkey_i *k, enum btree_update_flags flags,
unsigned long ip)
{
struct bch_fs *c = trans->c;
struct btree_insert_entry *i, n;
- u64 seq = 0;
int cmp;
+ struct btree_path *path = trans->paths + path_idx;
EBUG_ON(!path->should_be_locked);
- EBUG_ON(trans->nr_updates >= BTREE_ITER_MAX);
+ EBUG_ON(trans->nr_updates >= trans->nr_paths);
EBUG_ON(!bpos_eq(k->k.p, path->pos));
- /*
- * The transaction journal res hasn't been allocated at this point.
- * That occurs at commit time. Reuse the seq field to pass in the seq
- * of a prejournaled key.
- */
- if (flags & BTREE_UPDATE_PREJOURNAL)
- seq = trans->journal_res.seq;
-
n = (struct btree_insert_entry) {
.flags = flags,
.bkey_type = __btree_node_type(path->level, path->btree_id),
.btree_id = path->btree_id,
.level = path->level,
.cached = path->cached,
- .path = path,
+ .path = path_idx,
.k = k,
- .seq = seq,
.ip_allocated = ip,
};
@@ -418,7 +410,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
* Pending updates are kept sorted: first, find position of new update,
* then delete/trim any updates the new update overwrites:
*/
- trans_for_each_update(trans, i) {
+ for (i = trans->updates; i < trans->updates + trans->nr_updates; i++) {
cmp = btree_insert_entry_cmp(&n, i);
if (cmp <= 0)
break;
@@ -432,7 +424,6 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
i->cached = n.cached;
i->k = n.k;
i->path = n.path;
- i->seq = n.seq;
i->ip_allocated = n.ip_allocated;
} else {
array_insert_item(trans->updates, trans->nr_updates,
@@ -452,7 +443,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
}
}
- __btree_path_get(i->path, true);
+ __btree_path_get(trans->paths + i->path, true);
/*
* If a key is present in the key cache, it must also exist in the
@@ -462,7 +453,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, struct btree_path *path,
* work:
*/
if (path->cached && bkey_deleted(&i->old_k))
- return flush_new_cached_update(trans, path, i, flags, ip);
+ return flush_new_cached_update(trans, i, flags, ip);
return 0;
}
@@ -471,9 +462,11 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
struct btree_iter *iter,
struct btree_path *path)
{
- if (!iter->key_cache_path ||
- !iter->key_cache_path->should_be_locked ||
- !bpos_eq(iter->key_cache_path->pos, iter->pos)) {
+ struct btree_path *key_cache_path = btree_iter_key_cache_path(trans, iter);
+
+ if (!key_cache_path ||
+ !key_cache_path->should_be_locked ||
+ !bpos_eq(key_cache_path->pos, iter->pos)) {
struct bkey_cached *ck;
int ret;
@@ -488,19 +481,18 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
iter->flags & BTREE_ITER_INTENT,
_THIS_IP_);
- ret = bch2_btree_path_traverse(trans, iter->key_cache_path,
- BTREE_ITER_CACHED);
+ ret = bch2_btree_path_traverse(trans, iter->key_cache_path, BTREE_ITER_CACHED);
if (unlikely(ret))
return ret;
- ck = (void *) iter->key_cache_path->l[0].b;
+ ck = (void *) trans->paths[iter->key_cache_path].l[0].b;
if (test_bit(BKEY_CACHED_DIRTY, &ck->flags)) {
trace_and_count(trans->c, trans_restart_key_cache_raced, trans, _RET_IP_);
return btree_trans_restart(trans, BCH_ERR_transaction_restart_key_cache_raced);
}
- btree_path_set_should_be_locked(iter->key_cache_path);
+ btree_path_set_should_be_locked(trans->paths + iter->key_cache_path);
}
return 0;
@@ -509,7 +501,7 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans,
int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter,
struct bkey_i *k, enum btree_update_flags flags)
{
- struct btree_path *path = iter->update_path ?: iter->path;
+ btree_path_idx_t path_idx = iter->update_path ?: iter->path;
int ret;
if (iter->flags & BTREE_ITER_IS_EXTENTS)
@@ -529,6 +521,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
/*
* Ensure that updates to cached btrees go to the key cache:
*/
+ struct btree_path *path = trans->paths + path_idx;
if (!(flags & BTREE_UPDATE_KEY_CACHE_RECLAIM) &&
!path->cached &&
!path->level &&
@@ -537,27 +530,15 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter
if (ret)
return ret;
- path = iter->key_cache_path;
+ path_idx = iter->key_cache_path;
}
- return bch2_trans_update_by_path(trans, path, k, flags, _RET_IP_);
+ return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_);
}
-/*
- * Add a transaction update for a key that has already been journaled.
- */
-int __must_check bch2_trans_update_seq(struct btree_trans *trans, u64 seq,
- struct btree_iter *iter, struct bkey_i *k,
- enum btree_update_flags flags)
-{
- trans->journal_res.seq = seq;
- return bch2_trans_update(trans, iter, k, flags|BTREE_UPDATE_NOJOURNAL|
- BTREE_UPDATE_PREJOURNAL);
-}
-
-static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_i *k)
+int bch2_btree_insert_clone_trans(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
{
struct bkey_i *n = bch2_trans_kmalloc(trans, bkey_bytes(&k->k));
int ret = PTR_ERR_OR_ZERO(n);
@@ -568,60 +549,30 @@ static noinline int bch2_btree_insert_clone_trans(struct btree_trans *trans,
return bch2_btree_insert_trans(trans, btree, n, 0);
}
-int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
- enum btree_id btree,
- struct bkey_i *k)
+struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
{
- struct btree_write_buffered_key *i;
- int ret;
-
- EBUG_ON(trans->nr_wb_updates > trans->wb_updates_size);
- EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
-
- if (unlikely(trans->journal_replay_not_finished))
- return bch2_btree_insert_clone_trans(trans, btree, k);
-
- trans_for_each_wb_update(trans, i) {
- if (i->btree == btree && bpos_eq(i->k.k.p, k->k.p)) {
- bkey_copy(&i->k, k);
- return 0;
- }
- }
+ unsigned new_top = trans->journal_entries_u64s + u64s;
+ unsigned old_size = trans->journal_entries_size;
- if (!trans->wb_updates ||
- trans->nr_wb_updates == trans->wb_updates_size) {
- struct btree_write_buffered_key *u;
+ if (new_top > trans->journal_entries_size) {
+ trans->journal_entries_size = roundup_pow_of_two(new_top);
- if (trans->nr_wb_updates == trans->wb_updates_size) {
- struct btree_transaction_stats *s = btree_trans_stats(trans);
-
- BUG_ON(trans->wb_updates_size > U8_MAX / 2);
- trans->wb_updates_size = max(1, trans->wb_updates_size * 2);
- if (s)
- s->wb_updates_size = trans->wb_updates_size;
- }
-
- u = bch2_trans_kmalloc_nomemzero(trans,
- trans->wb_updates_size *
- sizeof(struct btree_write_buffered_key));
- ret = PTR_ERR_OR_ZERO(u);
- if (ret)
- return ret;
-
- if (trans->nr_wb_updates)
- memcpy(u, trans->wb_updates, trans->nr_wb_updates *
- sizeof(struct btree_write_buffered_key));
- trans->wb_updates = u;
+ btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size;
}
- trans->wb_updates[trans->nr_wb_updates] = (struct btree_write_buffered_key) {
- .btree = btree,
- };
+ struct jset_entry *n =
+ bch2_trans_kmalloc_nomemzero(trans,
+ trans->journal_entries_size * sizeof(u64));
+ if (IS_ERR(n))
+ return ERR_CAST(n);
- bkey_copy(&trans->wb_updates[trans->nr_wb_updates].k, k);
- trans->nr_wb_updates++;
+ if (trans->journal_entries)
+ memcpy(n, trans->journal_entries, old_size * sizeof(u64));
+ trans->journal_entries = n;
- return 0;
+ struct jset_entry *e = btree_trans_journal_entries_top(trans);
+ trans->journal_entries_u64s = new_top;
+ return e;
}
int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter,
@@ -733,20 +684,6 @@ int bch2_btree_delete_at(struct btree_trans *trans,
return bch2_btree_delete_extent_at(trans, iter, 0, update_flags);
}
-int bch2_btree_delete_at_buffered(struct btree_trans *trans,
- enum btree_id btree, struct bpos pos)
-{
- struct bkey_i *k;
-
- k = bch2_trans_kmalloc(trans, sizeof(*k));
- if (IS_ERR(k))
- return PTR_ERR(k);
-
- bkey_init(&k->k);
- k->k.p = pos;
- return bch2_trans_update_buffered(trans, btree, k);
-}
-
int bch2_btree_delete(struct btree_trans *trans,
enum btree_id btree, struct bpos pos,
unsigned update_flags)
@@ -809,7 +746,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id,
ret = bch2_trans_update(trans, &iter, &delete, update_flags) ?:
bch2_trans_commit(trans, &disk_res, journal_seq,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(trans->c, &disk_res);
err:
/*
@@ -851,56 +788,26 @@ int bch2_btree_delete_range(struct bch_fs *c, enum btree_id id,
int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree,
struct bpos pos, bool set)
{
- struct bkey_i *k;
- int ret = 0;
+ struct bkey_i k;
- k = bch2_trans_kmalloc_nomemzero(trans, sizeof(*k));
- ret = PTR_ERR_OR_ZERO(k);
- if (unlikely(ret))
- return ret;
+ bkey_init(&k.k);
+ k.k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
+ k.k.p = pos;
- bkey_init(&k->k);
- k->k.type = set ? KEY_TYPE_set : KEY_TYPE_deleted;
- k->k.p = pos;
-
- return bch2_trans_update_buffered(trans, btree, k);
+ return bch2_trans_update_buffered(trans, btree, &k);
}
-__printf(2, 0)
-static int __bch2_trans_log_msg(darray_u64 *entries, const char *fmt, va_list args)
+static int __bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf, unsigned u64s)
{
- struct printbuf buf = PRINTBUF;
- struct jset_entry_log *l;
- unsigned u64s;
- int ret;
-
- prt_vprintf(&buf, fmt, args);
- ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
- if (ret)
- goto err;
-
- u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
-
- ret = darray_make_room(entries, jset_u64s(u64s));
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s));
+ int ret = PTR_ERR_OR_ZERO(e);
if (ret)
- goto err;
+ return ret;
- l = (void *) &darray_top(*entries);
- l->entry.u64s = cpu_to_le16(u64s);
- l->entry.btree_id = 0;
- l->entry.level = 1;
- l->entry.type = BCH_JSET_ENTRY_log;
- l->entry.pad[0] = 0;
- l->entry.pad[1] = 0;
- l->entry.pad[2] = 0;
- memcpy(l->d, buf.buf, buf.pos);
- while (buf.pos & 7)
- l->d[buf.pos++] = '\0';
-
- entries->nr += jset_u64s(u64s);
-err:
- printbuf_exit(&buf);
- return ret;
+ struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry);
+ journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s);
+ memcpy(l->d, buf->buf, buf->pos);
+ return 0;
}
__printf(3, 0)
@@ -908,16 +815,32 @@ static int
__bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt,
va_list args)
{
- int ret;
+ struct printbuf buf = PRINTBUF;
+ prt_vprintf(&buf, fmt, args);
+
+ unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64));
+ prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos);
+
+ int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0;
+ if (ret)
+ goto err;
if (!test_bit(JOURNAL_STARTED, &c->journal.flags)) {
- ret = __bch2_trans_log_msg(&c->journal.early_journal_entries, fmt, args);
+ ret = darray_make_room(&c->journal.early_journal_entries, jset_u64s(u64s));
+ if (ret)
+ goto err;
+
+ struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries);
+ journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s);
+ memcpy(l->d, buf.buf, buf.pos);
+ c->journal.early_journal_entries.nr += jset_u64s(u64s);
} else {
ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_LAZY_RW|commit_flags,
- __bch2_trans_log_msg(&trans->extra_journal_entries, fmt, args));
+ BCH_TRANS_COMMIT_lazy_rw|commit_flags,
+ __bch2_trans_log_msg(trans, &buf, u64s));
}
-
+err:
+ printbuf_exit(&buf);
return ret;
}
diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h
index 9816d2286540..b9382b7b288b 100644
--- a/fs/bcachefs/btree_update.h
+++ b/fs/bcachefs/btree_update.h
@@ -21,42 +21,32 @@ void bch2_btree_add_journal_pin(struct bch_fs *, struct btree *, u64);
void bch2_btree_insert_key_leaf(struct btree_trans *, struct btree_path *,
struct bkey_i *, u64);
-enum btree_insert_flags {
+#define BCH_TRANS_COMMIT_FLAGS() \
+ x(no_enospc, "don't check for enospc") \
+ x(no_check_rw, "don't attempt to take a ref on c->writes") \
+ x(lazy_rw, "go read-write if we haven't yet - only for use in recovery") \
+ x(no_journal_res, "don't take a journal reservation, instead " \
+ "pin journal entry referred to by trans->journal_res.seq") \
+ x(journal_reclaim, "operation required for journal reclaim; may return error" \
+ "instead of deadlocking if BCH_WATERMARK_reclaim not specified")\
+
+enum __bch_trans_commit_flags {
/* First bits for bch_watermark: */
- __BTREE_INSERT_NOFAIL = BCH_WATERMARK_BITS,
- __BTREE_INSERT_NOCHECK_RW,
- __BTREE_INSERT_LAZY_RW,
- __BTREE_INSERT_JOURNAL_REPLAY,
- __BTREE_INSERT_JOURNAL_RECLAIM,
- __BTREE_INSERT_NOWAIT,
- __BTREE_INSERT_GC_LOCK_HELD,
- __BCH_HASH_SET_MUST_CREATE,
- __BCH_HASH_SET_MUST_REPLACE,
+ __BCH_TRANS_COMMIT_FLAGS_START = BCH_WATERMARK_BITS,
+#define x(n, ...) __BCH_TRANS_COMMIT_##n,
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
};
-/* Don't check for -ENOSPC: */
-#define BTREE_INSERT_NOFAIL BIT(__BTREE_INSERT_NOFAIL)
-
-#define BTREE_INSERT_NOCHECK_RW BIT(__BTREE_INSERT_NOCHECK_RW)
-#define BTREE_INSERT_LAZY_RW BIT(__BTREE_INSERT_LAZY_RW)
-
-/* Insert is for journal replay - don't get journal reservations: */
-#define BTREE_INSERT_JOURNAL_REPLAY BIT(__BTREE_INSERT_JOURNAL_REPLAY)
-
-/* Insert is being called from journal reclaim path: */
-#define BTREE_INSERT_JOURNAL_RECLAIM BIT(__BTREE_INSERT_JOURNAL_RECLAIM)
-
-/* Don't block on allocation failure (for new btree nodes: */
-#define BTREE_INSERT_NOWAIT BIT(__BTREE_INSERT_NOWAIT)
-#define BTREE_INSERT_GC_LOCK_HELD BIT(__BTREE_INSERT_GC_LOCK_HELD)
-
-#define BCH_HASH_SET_MUST_CREATE BIT(__BCH_HASH_SET_MUST_CREATE)
-#define BCH_HASH_SET_MUST_REPLACE BIT(__BCH_HASH_SET_MUST_REPLACE)
+enum bch_trans_commit_flags {
+#define x(n, ...) BCH_TRANS_COMMIT_##n = BIT(__BCH_TRANS_COMMIT_##n),
+ BCH_TRANS_COMMIT_FLAGS()
+#undef x
+};
int bch2_btree_delete_extent_at(struct btree_trans *, struct btree_iter *,
unsigned, unsigned);
int bch2_btree_delete_at(struct btree_trans *, struct btree_iter *, unsigned);
-int bch2_btree_delete_at_buffered(struct btree_trans *, enum btree_id, struct bpos);
int bch2_btree_delete(struct btree_trans *, enum btree_id, struct bpos, unsigned);
int bch2_btree_insert_nonextent(struct btree_trans *, enum btree_id,
@@ -74,6 +64,12 @@ int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
int bch2_btree_bit_mod(struct btree_trans *, enum btree_id, struct bpos, bool);
+static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans,
+ enum btree_id btree, struct bpos pos)
+{
+ return bch2_btree_bit_mod(trans, btree, pos, false);
+}
+
int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id,
struct bpos, struct bpos);
@@ -105,10 +101,44 @@ int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *,
int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *,
struct bkey_i *, enum btree_update_flags);
-int __must_check bch2_trans_update_seq(struct btree_trans *, u64, struct btree_iter *,
- struct bkey_i *, enum btree_update_flags);
-int __must_check bch2_trans_update_buffered(struct btree_trans *,
- enum btree_id, struct bkey_i *);
+
+struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned);
+
+static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans)
+{
+ return (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s);
+}
+
+static inline struct jset_entry *
+bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s)
+{
+ if (!trans->journal_entries ||
+ trans->journal_entries_u64s + u64s > trans->journal_entries_size)
+ return __bch2_trans_jset_entry_alloc(trans, u64s);
+
+ struct jset_entry *e = btree_trans_journal_entries_top(trans);
+ trans->journal_entries_u64s += u64s;
+ return e;
+}
+
+int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *);
+
+static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans,
+ enum btree_id btree,
+ struct bkey_i *k)
+{
+ if (unlikely(trans->journal_replay_not_finished))
+ return bch2_btree_insert_clone_trans(trans, btree, k);
+
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s));
+ int ret = PTR_ERR_OR_ZERO(e);
+ if (ret)
+ return ret;
+
+ journal_entry_init(e, BCH_JSET_ENTRY_write_buffer_keys, btree, 0, k->k.u64s);
+ bkey_copy(e->start, k);
+ return 0;
+}
void bch2_trans_commit_hook(struct btree_trans *,
struct btree_trans_commit_hook *);
@@ -157,28 +187,19 @@ static inline int bch2_trans_commit(struct btree_trans *trans,
bch2_trans_run(_c, commit_do(trans, _disk_res, _journal_seq, _flags, _do))
#define trans_for_each_update(_trans, _i) \
- for ((_i) = (_trans)->updates; \
+ for (struct btree_insert_entry *_i = (_trans)->updates; \
(_i) < (_trans)->updates + (_trans)->nr_updates; \
(_i)++)
-#define trans_for_each_wb_update(_trans, _i) \
- for ((_i) = (_trans)->wb_updates; \
- (_i) < (_trans)->wb_updates + (_trans)->nr_wb_updates; \
- (_i)++)
-
static inline void bch2_trans_reset_updates(struct btree_trans *trans)
{
- struct btree_insert_entry *i;
-
trans_for_each_update(trans, i)
bch2_path_put(trans, i->path, true);
- trans->extra_journal_res = 0;
trans->nr_updates = 0;
- trans->nr_wb_updates = 0;
- trans->wb_updates = NULL;
+ trans->journal_entries_u64s = 0;
trans->hooks = NULL;
- trans->extra_journal_entries.nr = 0;
+ trans->extra_disk_res = 0;
if (trans->fs_usage_deltas) {
trans->fs_usage_deltas->used = 0;
diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c
index 239fcc3c7c99..4530b14ff2c3 100644
--- a/fs/bcachefs/btree_update_interior.c
+++ b/fs/bcachefs/btree_update_interior.c
@@ -25,24 +25,24 @@
#include <linux/random.h>
static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *,
- struct btree_path *, struct btree *,
+ btree_path_idx_t, struct btree *,
struct keylist *, unsigned);
static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *);
-static struct btree_path *get_unlocked_mut_path(struct btree_trans *trans,
- enum btree_id btree_id,
- unsigned level,
- struct bpos pos)
+static btree_path_idx_t get_unlocked_mut_path(struct btree_trans *trans,
+ enum btree_id btree_id,
+ unsigned level,
+ struct bpos pos)
{
- struct btree_path *path;
-
- path = bch2_path_get(trans, btree_id, pos, level + 1, level,
+ btree_path_idx_t path_idx = bch2_path_get(trans, btree_id, pos, level + 1, level,
BTREE_ITER_NOPRESERVE|
BTREE_ITER_INTENT, _RET_IP_);
- path = bch2_btree_path_make_mut(trans, path, true, _RET_IP_);
+ path_idx = bch2_btree_path_make_mut(trans, path_idx, true, _RET_IP_);
+
+ struct btree_path *path = trans->paths + path_idx;
bch2_btree_path_downgrade(trans, path);
__bch2_btree_path_unlock(trans, path);
- return path;
+ return path_idx;
}
/* Debug code: */
@@ -159,14 +159,16 @@ static bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *b,
{
size_t u64s = btree_node_u64s_with_format(nr, &b->format, new_f);
- return __vstruct_bytes(struct btree_node, u64s) < btree_bytes(c);
+ return __vstruct_bytes(struct btree_node, u64s) < btree_buf_bytes(b);
}
/* Btree node freeing/allocation: */
-static void __btree_node_free(struct bch_fs *c, struct btree *b)
+static void __btree_node_free(struct btree_trans *trans, struct btree *b)
{
- trace_and_count(c, btree_node_free, c, b);
+ struct bch_fs *c = trans->c;
+
+ trace_and_count(c, btree_node_free, trans, b);
BUG_ON(btree_node_write_blocked(b));
BUG_ON(btree_node_dirty(b));
@@ -188,15 +190,15 @@ static void bch2_btree_node_free_inmem(struct btree_trans *trans,
struct btree *b)
{
struct bch_fs *c = trans->c;
- unsigned level = b->c.level;
+ unsigned i, level = b->c.level;
bch2_btree_node_lock_write_nofail(trans, path, &b->c);
bch2_btree_node_hash_remove(&c->btree_cache, b);
- __btree_node_free(c, b);
+ __btree_node_free(trans, b);
six_unlock_write(&b->c.lock);
mark_btree_node_locked_noreset(path, level, BTREE_NODE_INTENT_LOCKED);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->l[level].b == b) {
btree_node_unlock(trans, path, level);
path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
@@ -210,7 +212,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
struct bch_fs *c = as->c;
struct prealloc_nodes *p = &as->prealloc_nodes[b->c.lock.readers != NULL];
struct btree_path *path;
- unsigned level = b->c.level;
+ unsigned i, level = b->c.level;
BUG_ON(!list_empty(&b->write_blocked));
BUG_ON(b->will_make_reachable != (1UL|(unsigned long) as));
@@ -233,7 +235,7 @@ static void bch2_btree_node_free_never_used(struct btree_update *as,
six_unlock_intent(&b->c.lock);
- trans_for_each_path(trans, path)
+ trans_for_each_path(trans, path, i)
if (path->l[level].b == b) {
btree_node_unlock(trans, path, level);
path->l[level].b = ERR_PTR(-BCH_ERR_no_btree_node_init);
@@ -278,7 +280,8 @@ retry:
writepoint_ptr(&c->btree_write_point),
&devs_have,
res->nr_replicas,
- c->opts.metadata_replicas_required,
+ min(res->nr_replicas,
+ c->opts.metadata_replicas_required),
watermark, 0, cl, &wp);
if (unlikely(ret))
return ERR_PTR(ret);
@@ -363,7 +366,7 @@ static struct btree *bch2_btree_node_alloc(struct btree_update *as,
ret = bch2_btree_node_hash_insert(&c->btree_cache, b, level, as->btree_id);
BUG_ON(ret);
- trace_and_count(c, btree_node_alloc, c, b);
+ trace_and_count(c, btree_node_alloc, trans, b);
bch2_increment_clock(c, btree_sectors(c), WRITE);
return b;
}
@@ -453,7 +456,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans *
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_intent);
btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_write);
- __btree_node_free(c, b);
+ __btree_node_free(trans, b);
six_unlock_write(&b->c.lock);
six_unlock_intent(&b->c.lock);
}
@@ -466,7 +469,6 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
unsigned flags,
struct closure *cl)
{
- struct bch_fs *c = as->c;
struct btree *b;
unsigned interior;
int ret = 0;
@@ -476,11 +478,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
/*
* Protects reaping from the btree node cache and using the btree node
* open bucket reserve:
- *
- * BTREE_INSERT_NOWAIT only applies to btree node allocation, not
- * blocking on this lock:
*/
- ret = bch2_btree_cache_cannibalize_lock(c, cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, cl);
if (ret)
return ret;
@@ -488,9 +487,8 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
struct prealloc_nodes *p = as->prealloc_nodes + interior;
while (p->nr < nr_nodes[interior]) {
- b = __bch2_btree_node_alloc(trans, &as->disk_res,
- flags & BTREE_INSERT_NOWAIT ? NULL : cl,
- interior, flags);
+ b = __bch2_btree_node_alloc(trans, &as->disk_res, cl,
+ interior, flags);
if (IS_ERR(b)) {
ret = PTR_ERR(b);
goto err;
@@ -500,7 +498,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans,
}
}
err:
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
return ret;
}
@@ -559,24 +557,20 @@ static void btree_update_add_key(struct btree_update *as,
static int btree_update_nodes_written_trans(struct btree_trans *trans,
struct btree_update *as)
{
- struct bkey_i *k;
- int ret;
-
- ret = darray_make_room(&trans->extra_journal_entries, as->journal_u64s);
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, as->journal_u64s);
+ int ret = PTR_ERR_OR_ZERO(e);
if (ret)
return ret;
- memcpy(&darray_top(trans->extra_journal_entries),
- as->journal_entries,
- as->journal_u64s * sizeof(u64));
- trans->extra_journal_entries.nr += as->journal_u64s;
+ memcpy(e, as->journal_entries, as->journal_u64s * sizeof(u64));
trans->journal_pin = &as->journal;
for_each_keylist_key(&as->old_keys, k) {
unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
- ret = bch2_trans_mark_old(trans, as->btree_id, level, bkey_i_to_s_c(k), 0);
+ ret = bch2_key_trigger_old(trans, as->btree_id, level, bkey_i_to_s_c(k),
+ BTREE_TRIGGER_TRANSACTIONAL);
if (ret)
return ret;
}
@@ -584,7 +578,8 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans,
for_each_keylist_key(&as->new_keys, k) {
unsigned level = bkey_i_to_btree_ptr_v2(k)->v.mem_ptr;
- ret = bch2_trans_mark_new(trans, as->btree_id, level, k, 0);
+ ret = bch2_key_trigger_new(trans, as->btree_id, level, bkey_i_to_s(k),
+ BTREE_TRIGGER_TRANSACTIONAL);
if (ret)
return ret;
}
@@ -645,9 +640,9 @@ static void btree_update_nodes_written(struct btree_update *as)
*/
ret = commit_do(trans, &as->disk_res, &journal_seq,
BCH_WATERMARK_reclaim|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_JOURNAL_RECLAIM,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_journal_reclaim,
btree_update_nodes_written_trans(trans, as));
bch2_trans_unlock(trans);
@@ -655,10 +650,11 @@ static void btree_update_nodes_written(struct btree_update *as)
"%s(): error %s", __func__, bch2_err_str(ret));
err:
if (as->b) {
- struct btree_path *path;
b = as->b;
- path = get_unlocked_mut_path(trans, as->btree_id, b->c.level, b->key.k.p);
+ btree_path_idx_t path_idx = get_unlocked_mut_path(trans,
+ as->btree_id, b->c.level, b->key.k.p);
+ struct btree_path *path = trans->paths + path_idx;
/*
* @b is the node we did the final insert into:
*
@@ -728,7 +724,7 @@ err:
btree_node_write_if_need(c, b, SIX_LOCK_intent);
btree_node_unlock(trans, path, b->c.level);
- bch2_path_put(trans, path, true);
+ bch2_path_put(trans, path_idx, true);
}
bch2_journal_pin_drop(&c->journal, &as->journal);
@@ -815,6 +811,12 @@ static void btree_update_updated_node(struct btree_update *as, struct btree *b)
mutex_unlock(&c->btree_interior_update_lock);
}
+static int bch2_update_reparent_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
static void btree_update_reparent(struct btree_update *as,
struct btree_update *child)
{
@@ -825,7 +827,8 @@ static void btree_update_reparent(struct btree_update *as,
child->b = NULL;
child->mode = BTREE_INTERIOR_UPDATING_AS;
- bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &child->journal,
+ bch2_update_reparent_journal_pin_flush);
}
static void btree_update_updated_root(struct btree_update *as, struct btree *b)
@@ -934,6 +937,12 @@ static void bch2_btree_update_get_open_buckets(struct btree_update *as, struct b
b->ob.v[--b->ob.nr];
}
+static int bch2_btree_update_will_free_node_journal_pin_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
+{
+ return 0;
+}
+
/*
* @b is being split/rewritten: it may have pointers to not-yet-written btree
* nodes and thus outstanding btree_updates - redirect @b's
@@ -985,11 +994,13 @@ static void bch2_btree_interior_update_will_free_node(struct btree_update *as,
* when the new nodes are persistent and reachable on disk:
*/
w = btree_current_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+ bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
w = btree_prev_write(b);
- bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal, NULL);
+ bch2_journal_pin_copy(&c->journal, &as->journal, &w->journal,
+ bch2_btree_update_will_free_node_journal_pin_flush);
bch2_journal_pin_drop(&c->journal, &w->journal);
mutex_unlock(&c->btree_interior_update_lock);
@@ -1039,7 +1050,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
struct bch_fs *c = trans->c;
struct btree_update *as;
u64 start_time = local_clock();
- int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
+ int disk_res_flags = (flags & BCH_TRANS_COMMIT_no_enospc)
? BCH_DISK_RESERVATION_NOFAIL : 0;
unsigned nr_nodes[2] = { 0, 0 };
unsigned update_level = level;
@@ -1057,7 +1068,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
flags &= ~BCH_WATERMARK_MASK;
flags |= watermark;
- if (!(flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ if (!(flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark < c->journal.watermark) {
struct journal_res res = { 0 };
@@ -1087,16 +1098,14 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* Always check for space for two keys, even if we won't have to
* split at prior level - it might have been a merge instead:
*/
- if (bch2_btree_node_insert_fits(c, path->l[update_level].b,
+ if (bch2_btree_node_insert_fits(path->l[update_level].b,
BKEY_BTREE_PTR_U64s_MAX * 2))
break;
split = path->l[update_level].b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c);
}
- if (flags & BTREE_INSERT_GC_LOCK_HELD)
- lockdep_assert_held(&c->gc_lock);
- else if (!down_read_trylock(&c->gc_lock)) {
+ if (!down_read_trylock(&c->gc_lock)) {
ret = drop_locks_do(trans, (down_read(&c->gc_lock), 0));
if (ret) {
up_read(&c->gc_lock);
@@ -1110,7 +1119,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
as->c = c;
as->start_time = start_time;
as->mode = BTREE_INTERIOR_NO_UPDATE;
- as->took_gc_lock = !(flags & BTREE_INSERT_GC_LOCK_HELD);
+ as->took_gc_lock = true;
as->btree_id = path->btree_id;
as->update_level = update_level;
INIT_LIST_HEAD(&as->list);
@@ -1153,7 +1162,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
* flag
*/
if (bch2_err_matches(ret, ENOSPC) &&
- (flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+ (flags & BCH_TRANS_COMMIT_journal_reclaim) &&
watermark != BCH_WATERMARK_reclaim) {
ret = -BCH_ERR_journal_reclaim_would_deadlock;
goto err;
@@ -1183,6 +1192,9 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path,
return as;
err:
bch2_btree_update_free(as, trans);
+ if (!bch2_err_matches(ret, ENOSPC) &&
+ !bch2_err_matches(ret, EROFS))
+ bch_err_fn_ratelimited(c, ret);
return ERR_PTR(ret);
}
@@ -1214,7 +1226,7 @@ static void bch2_btree_set_root(struct btree_update *as,
struct bch_fs *c = as->c;
struct btree *old;
- trace_and_count(c, btree_node_set_root, c, b);
+ trace_and_count(c, btree_node_set_root, trans, b);
old = btree_node_root(c, b);
@@ -1390,7 +1402,7 @@ static void __btree_split_node(struct btree_update *as,
unsigned u64s = nr_keys[i].nr_keys * n[i]->data->format.key_u64s +
nr_keys[i].val_u64s;
- if (__vstruct_bytes(struct btree_node, u64s) > btree_bytes(as->c))
+ if (__vstruct_bytes(struct btree_node, u64s) > btree_buf_bytes(b))
n[i]->data->format = b->format;
btree_node_set_format(n[i], n[i]->data->format);
@@ -1445,10 +1457,12 @@ static void __btree_split_node(struct btree_update *as,
*/
static void btree_split_insert_keys(struct btree_update *as,
struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path_idx,
struct btree *b,
struct keylist *keys)
{
+ struct btree_path *path = trans->paths + path_idx;
+
if (!bch2_keylist_empty(keys) &&
bpos_le(bch2_keylist_front(keys)->k.p, b->data->max_key)) {
struct btree_node_iter node_iter;
@@ -1462,25 +1476,25 @@ static void btree_split_insert_keys(struct btree_update *as,
}
static int btree_split(struct btree_update *as, struct btree_trans *trans,
- struct btree_path *path, struct btree *b,
+ btree_path_idx_t path, struct btree *b,
struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
- struct btree *parent = btree_node_parent(path, b);
+ struct btree *parent = btree_node_parent(trans->paths + path, b);
struct btree *n1, *n2 = NULL, *n3 = NULL;
- struct btree_path *path1 = NULL, *path2 = NULL;
+ btree_path_idx_t path1 = 0, path2 = 0;
u64 start_time = local_clock();
int ret = 0;
BUG_ON(!parent && (b != btree_node_root(c, b)));
- BUG_ON(parent && !btree_node_intent_locked(path, b->c.level + 1));
+ BUG_ON(parent && !btree_node_intent_locked(trans->paths + path, b->c.level + 1));
bch2_btree_interior_update_will_free_node(as, b);
if (b->nr.live_u64s > BTREE_SPLIT_THRESHOLD(c)) {
struct btree *n[2];
- trace_and_count(c, btree_node_split, c, b);
+ trace_and_count(c, btree_node_split, trans, b);
n[0] = n1 = bch2_btree_node_alloc(as, trans, b->c.level);
n[1] = n2 = bch2_btree_node_alloc(as, trans, b->c.level);
@@ -1501,15 +1515,15 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
six_unlock_write(&n2->c.lock);
six_unlock_write(&n1->c.lock);
- path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path1, n1);
+ mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path1, n1);
- path2 = get_unlocked_mut_path(trans, path->btree_id, n2->c.level, n2->key.k.p);
+ path2 = get_unlocked_mut_path(trans, as->btree_id, n2->c.level, n2->key.k.p);
six_lock_increment(&n2->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path2, n2);
+ mark_btree_node_locked(trans, trans->paths + path2, n2->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path2, n2);
/*
* Note that on recursive parent_keys == keys, so we
@@ -1526,11 +1540,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n3);
six_unlock_write(&n3->c.lock);
- path2->locks_want++;
- BUG_ON(btree_node_locked(path2, n3->c.level));
+ trans->paths[path2].locks_want++;
+ BUG_ON(btree_node_locked(trans->paths + path2, n3->c.level));
six_lock_increment(&n3->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path2, n3);
+ mark_btree_node_locked(trans, trans->paths + path2, n3->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path2, n3);
n3->sib_u64s[0] = U16_MAX;
n3->sib_u64s[1] = U16_MAX;
@@ -1538,7 +1552,7 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
btree_split_insert_keys(as, trans, path, n3, &as->parent_keys);
}
} else {
- trace_and_count(c, btree_node_compact, c, b);
+ trace_and_count(c, btree_node_compact, trans, b);
n1 = bch2_btree_node_alloc_replacement(as, trans, b);
@@ -1551,10 +1565,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n1);
six_unlock_write(&n1->c.lock);
- path1 = get_unlocked_mut_path(trans, path->btree_id, n1->c.level, n1->key.k.p);
+ path1 = get_unlocked_mut_path(trans, as->btree_id, n1->c.level, n1->key.k.p);
six_lock_increment(&n1->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, path1, n1);
+ mark_btree_node_locked(trans, trans->paths + path1, n1->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + path1, n1);
if (parent)
bch2_keylist_add(&as->parent_keys, &n1->key);
@@ -1568,10 +1582,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
if (ret)
goto err;
} else if (n3) {
- bch2_btree_set_root(as, trans, path, n3);
+ bch2_btree_set_root(as, trans, trans->paths + path, n3);
} else {
/* Root filled up but didn't need to be split */
- bch2_btree_set_root(as, trans, path, n1);
+ bch2_btree_set_root(as, trans, trans->paths + path, n1);
}
if (n3) {
@@ -1591,13 +1605,13 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
* node after another thread has locked and updated the new node, thus
* seeing stale data:
*/
- bch2_btree_node_free_inmem(trans, path, b);
+ bch2_btree_node_free_inmem(trans, trans->paths + path, b);
if (n3)
- bch2_trans_node_add(trans, n3);
+ bch2_trans_node_add(trans, trans->paths + path, n3);
if (n2)
- bch2_trans_node_add(trans, n2);
- bch2_trans_node_add(trans, n1);
+ bch2_trans_node_add(trans, trans->paths + path2, n2);
+ bch2_trans_node_add(trans, trans->paths + path1, n1);
if (n3)
six_unlock_intent(&n3->c.lock);
@@ -1606,11 +1620,11 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans,
six_unlock_intent(&n1->c.lock);
out:
if (path2) {
- __bch2_btree_path_unlock(trans, path2);
+ __bch2_btree_path_unlock(trans, trans->paths + path2);
bch2_path_put(trans, path2, true);
}
if (path1) {
- __bch2_btree_path_unlock(trans, path1);
+ __bch2_btree_path_unlock(trans, trans->paths + path1);
bch2_path_put(trans, path1, true);
}
@@ -1638,13 +1652,14 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
struct keylist *keys)
{
struct btree_path *linked;
+ unsigned i;
__bch2_btree_insert_keys_interior(as, trans, path, b,
path->l[b->c.level].iter, keys);
btree_update_updated_node(as, b);
- trans_for_each_path_with_node(trans, b, linked)
+ trans_for_each_path_with_node(trans, b, linked, i)
bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b);
bch2_trans_verify_paths(trans);
@@ -1655,7 +1670,7 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
*
* @as: btree_update object
* @trans: btree_trans object
- * @path: path that points to current node
+ * @path_idx: path that points to current node
* @b: node to insert keys into
* @keys: list of keys to insert
* @flags: transaction commit flags
@@ -1667,10 +1682,11 @@ bch2_btree_insert_keys_interior(struct btree_update *as,
* for leaf nodes -- inserts into interior nodes have to be atomic.
*/
static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *trans,
- struct btree_path *path, struct btree *b,
+ btree_path_idx_t path_idx, struct btree *b,
struct keylist *keys, unsigned flags)
{
struct bch_fs *c = as->c;
+ struct btree_path *path = trans->paths + path_idx;
int old_u64s = le16_to_cpu(btree_bset_last(b)->u64s);
int old_live_u64s = b->nr.live_u64s;
int live_u64s_added, u64s_added;
@@ -1688,7 +1704,7 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t
bch2_btree_node_prep_for_write(trans, path, b);
- if (!bch2_btree_node_insert_fits(c, b, bch2_keylist_u64s(keys))) {
+ if (!bch2_btree_node_insert_fits(b, bch2_keylist_u64s(keys))) {
bch2_btree_node_unlock_write(trans, path, b);
goto split;
}
@@ -1723,19 +1739,22 @@ split:
return btree_trans_restart(trans, BCH_ERR_transaction_restart_split_race);
}
- return btree_split(as, trans, path, b, keys, flags);
+ return btree_split(as, trans, path_idx, b, keys, flags);
}
int bch2_btree_split_leaf(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path,
unsigned flags)
{
- struct btree *b = path_l(path)->b;
+ /* btree_split & merge may both cause paths array to be reallocated */
+
+ struct btree *b = path_l(trans->paths + path)->b;
struct btree_update *as;
unsigned l;
int ret = 0;
- as = bch2_btree_update_start(trans, path, path->level,
+ as = bch2_btree_update_start(trans, trans->paths + path,
+ trans->paths[path].level,
true, flags);
if (IS_ERR(as))
return PTR_ERR(as);
@@ -1748,20 +1767,21 @@ int bch2_btree_split_leaf(struct btree_trans *trans,
bch2_btree_update_done(as, trans);
- for (l = path->level + 1; btree_node_intent_locked(path, l) && !ret; l++)
+ for (l = trans->paths[path].level + 1;
+ btree_node_intent_locked(&trans->paths[path], l) && !ret;
+ l++)
ret = bch2_foreground_maybe_merge(trans, path, l, flags);
return ret;
}
int __bch2_foreground_maybe_merge(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path,
unsigned level,
unsigned flags,
enum btree_node_sibling sib)
{
struct bch_fs *c = trans->c;
- struct btree_path *sib_path = NULL, *new_path = NULL;
struct btree_update *as;
struct bkey_format_state new_s;
struct bkey_format new_f;
@@ -1769,13 +1789,15 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
struct btree *b, *m, *n, *prev, *next, *parent;
struct bpos sib_pos;
size_t sib_u64s;
+ enum btree_id btree = trans->paths[path].btree_id;
+ btree_path_idx_t sib_path = 0, new_path = 0;
u64 start_time = local_clock();
int ret = 0;
- BUG_ON(!path->should_be_locked);
- BUG_ON(!btree_node_locked(path, level));
+ BUG_ON(!trans->paths[path].should_be_locked);
+ BUG_ON(!btree_node_locked(&trans->paths[path], level));
- b = path->l[level].b;
+ b = trans->paths[path].l[level].b;
if ((sib == btree_prev_sib && bpos_eq(b->data->min_key, POS_MIN)) ||
(sib == btree_next_sib && bpos_eq(b->data->max_key, SPOS_MAX))) {
@@ -1787,18 +1809,18 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
? bpos_predecessor(b->data->min_key)
: bpos_successor(b->data->max_key);
- sib_path = bch2_path_get(trans, path->btree_id, sib_pos,
+ sib_path = bch2_path_get(trans, btree, sib_pos,
U8_MAX, level, BTREE_ITER_INTENT, _THIS_IP_);
ret = bch2_btree_path_traverse(trans, sib_path, false);
if (ret)
goto err;
- btree_path_set_should_be_locked(sib_path);
+ btree_path_set_should_be_locked(trans->paths + sib_path);
- m = sib_path->l[level].b;
+ m = trans->paths[sib_path].l[level].b;
- if (btree_node_parent(path, b) !=
- btree_node_parent(sib_path, m)) {
+ if (btree_node_parent(trans->paths + path, b) !=
+ btree_node_parent(trans->paths + sib_path, m)) {
b->sib_u64s[sib] = U16_MAX;
goto out;
}
@@ -1851,14 +1873,14 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
if (b->sib_u64s[sib] > c->btree_foreground_merge_threshold)
goto out;
- parent = btree_node_parent(path, b);
- as = bch2_btree_update_start(trans, path, level, false,
- BTREE_INSERT_NOFAIL|flags);
+ parent = btree_node_parent(trans->paths + path, b);
+ as = bch2_btree_update_start(trans, trans->paths + path, level, false,
+ BCH_TRANS_COMMIT_no_enospc|flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto err;
- trace_and_count(c, btree_node_merge, c, b);
+ trace_and_count(c, btree_node_merge, trans, b);
bch2_btree_interior_update_will_free_node(as, b);
bch2_btree_interior_update_will_free_node(as, m);
@@ -1882,10 +1904,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_add_new_node(as, n);
six_unlock_write(&n->c.lock);
- new_path = get_unlocked_mut_path(trans, path->btree_id, n->c.level, n->key.k.p);
+ new_path = get_unlocked_mut_path(trans, btree, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, new_path, n);
+ mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + new_path, n);
bkey_init(&delete.k);
delete.k.p = prev->key.k.p;
@@ -1903,10 +1925,10 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans,
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
- bch2_btree_node_free_inmem(trans, path, b);
- bch2_btree_node_free_inmem(trans, sib_path, m);
+ bch2_btree_node_free_inmem(trans, trans->paths + path, b);
+ bch2_btree_node_free_inmem(trans, trans->paths + sib_path, m);
- bch2_trans_node_add(trans, n);
+ bch2_trans_node_add(trans, trans->paths + path, n);
bch2_trans_verify_paths(trans);
@@ -1934,16 +1956,16 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
unsigned flags)
{
struct bch_fs *c = trans->c;
- struct btree_path *new_path = NULL;
struct btree *n, *parent;
struct btree_update *as;
+ btree_path_idx_t new_path = 0;
int ret;
- flags |= BTREE_INSERT_NOFAIL;
+ flags |= BCH_TRANS_COMMIT_no_enospc;
- parent = btree_node_parent(iter->path, b);
- as = bch2_btree_update_start(trans, iter->path, b->c.level,
- false, flags);
+ struct btree_path *path = btree_iter_path(trans, iter);
+ parent = btree_node_parent(path, b);
+ as = bch2_btree_update_start(trans, path, b->c.level, false, flags);
ret = PTR_ERR_OR_ZERO(as);
if (ret)
goto out;
@@ -1958,27 +1980,27 @@ int bch2_btree_node_rewrite(struct btree_trans *trans,
new_path = get_unlocked_mut_path(trans, iter->btree_id, n->c.level, n->key.k.p);
six_lock_increment(&n->c.lock, SIX_LOCK_intent);
- mark_btree_node_locked(trans, new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
- bch2_btree_path_level_init(trans, new_path, n);
+ mark_btree_node_locked(trans, trans->paths + new_path, n->c.level, BTREE_NODE_INTENT_LOCKED);
+ bch2_btree_path_level_init(trans, trans->paths + new_path, n);
- trace_and_count(c, btree_node_rewrite, c, b);
+ trace_and_count(c, btree_node_rewrite, trans, b);
if (parent) {
bch2_keylist_add(&as->parent_keys, &n->key);
- ret = bch2_btree_insert_node(as, trans, iter->path, parent,
- &as->parent_keys, flags);
+ ret = bch2_btree_insert_node(as, trans, iter->path,
+ parent, &as->parent_keys, flags);
if (ret)
goto err;
} else {
- bch2_btree_set_root(as, trans, iter->path, n);
+ bch2_btree_set_root(as, trans, btree_iter_path(trans, iter), n);
}
bch2_btree_update_get_open_buckets(as, n);
bch2_btree_node_write(c, n, SIX_LOCK_intent, 0);
- bch2_btree_node_free_inmem(trans, iter->path, b);
+ bch2_btree_node_free_inmem(trans, btree_iter_path(trans, iter), b);
- bch2_trans_node_add(trans, n);
+ bch2_trans_node_add(trans, trans->paths + iter->path, n);
six_unlock_intent(&n->c.lock);
bch2_btree_update_done(as, trans);
@@ -2047,8 +2069,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work)
ret = bch2_trans_do(c, NULL, NULL, 0,
async_btree_node_rewrite_trans(trans, a));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite);
kfree(a);
}
@@ -2071,7 +2092,7 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
a->seq = b->data->keys.seq;
INIT_WORK(&a->work, async_btree_node_rewrite_work);
- if (unlikely(!test_bit(BCH_FS_MAY_GO_RW, &c->flags))) {
+ if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) {
mutex_lock(&c->pending_node_rewrites_lock);
list_add(&a->list, &c->pending_node_rewrites);
mutex_unlock(&c->pending_node_rewrites_lock);
@@ -2079,15 +2100,15 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b)
}
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) {
- if (test_bit(BCH_FS_STARTED, &c->flags)) {
+ if (test_bit(BCH_FS_started, &c->flags)) {
bch_err(c, "%s: error getting c->writes ref", __func__);
kfree(a);
return;
}
ret = bch2_fs_read_write_early(c);
+ bch_err_msg(c, ret, "going read-write");
if (ret) {
- bch_err_msg(c, ret, "going read-write");
kfree(a);
return;
}
@@ -2138,13 +2159,12 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
int ret;
if (!skip_triggers) {
- ret = bch2_trans_mark_old(trans, b->c.btree_id, b->c.level + 1,
- bkey_i_to_s_c(&b->key), 0);
- if (ret)
- return ret;
-
- ret = bch2_trans_mark_new(trans, b->c.btree_id, b->c.level + 1,
- new_key, 0);
+ ret = bch2_key_trigger_old(trans, b->c.btree_id, b->c.level + 1,
+ bkey_i_to_s_c(&b->key),
+ BTREE_TRIGGER_TRANSACTIONAL) ?:
+ bch2_key_trigger_new(trans, b->c.btree_id, b->c.level + 1,
+ bkey_i_to_s(new_key),
+ BTREE_TRIGGER_TRANSACTIONAL);
if (ret)
return ret;
}
@@ -2156,7 +2176,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
BUG_ON(ret);
}
- parent = btree_node_parent(iter->path, b);
+ parent = btree_node_parent(btree_iter_path(trans, iter), b);
if (parent) {
bch2_trans_copy_iter(&iter2, iter);
@@ -2164,10 +2184,11 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
iter2.flags & BTREE_ITER_INTENT,
_THIS_IP_);
- BUG_ON(iter2.path->level != b->c.level);
- BUG_ON(!bpos_eq(iter2.path->pos, new_key->k.p));
+ struct btree_path *path2 = btree_iter_path(trans, &iter2);
+ BUG_ON(path2->level != b->c.level);
+ BUG_ON(!bpos_eq(path2->pos, new_key->k.p));
- btree_path_set_level_up(trans, iter2.path);
+ btree_path_set_level_up(trans, path2);
trans->paths_sorted = false;
@@ -2178,23 +2199,23 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
} else {
BUG_ON(btree_node_root(c, b) != b);
- ret = darray_make_room(&trans->extra_journal_entries,
+ struct jset_entry *e = bch2_trans_jset_entry_alloc(trans,
jset_u64s(new_key->k.u64s));
+ ret = PTR_ERR_OR_ZERO(e);
if (ret)
return ret;
- journal_entry_set((void *) &darray_top(trans->extra_journal_entries),
+ journal_entry_set(e,
BCH_JSET_ENTRY_btree_root,
b->c.btree_id, b->c.level,
new_key, new_key->k.u64s);
- trans->extra_journal_entries.nr += jset_u64s(new_key->k.u64s);
}
ret = bch2_trans_commit(trans, NULL, NULL, commit_flags);
if (ret)
goto err;
- bch2_btree_node_lock_write_nofail(trans, iter->path, &b->c);
+ bch2_btree_node_lock_write_nofail(trans, btree_iter_path(trans, iter), &b->c);
if (new_hash) {
mutex_lock(&c->btree_cache.lock);
@@ -2209,7 +2230,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans,
bkey_copy(&b->key, new_key);
}
- bch2_btree_node_unlock_write(trans, iter->path, b);
+ bch2_btree_node_unlock_write(trans, btree_iter_path(trans, iter), b);
out:
bch2_trans_iter_exit(trans, &iter2);
return ret;
@@ -2228,7 +2249,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
{
struct bch_fs *c = trans->c;
struct btree *new_hash = NULL;
- struct btree_path *path = iter->path;
+ struct btree_path *path = btree_iter_path(trans, iter);
struct closure cl;
int ret = 0;
@@ -2243,7 +2264,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
* btree_iter_traverse():
*/
if (btree_ptr_hash_val(new_key) != b->hash_val) {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
if (ret) {
ret = drop_locks_do(trans, (closure_sync(&cl), 0));
if (ret)
@@ -2267,7 +2288,7 @@ int bch2_btree_node_update_key(struct btree_trans *trans, struct btree_iter *ite
six_unlock_intent(&new_hash->c.lock);
}
closure_sync(&cl);
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
return ret;
}
@@ -2286,7 +2307,7 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans,
goto out;
/* has node been freed? */
- if (iter.path->l[b->c.level].b != b) {
+ if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) {
/* node has been freed: */
BUG_ON(!btree_node_dying(b));
goto out;
@@ -2328,12 +2349,12 @@ static int __bch2_btree_root_alloc(struct btree_trans *trans, enum btree_id id)
closure_init_stack(&cl);
do {
- ret = bch2_btree_cache_cannibalize_lock(c, &cl);
+ ret = bch2_btree_cache_cannibalize_lock(trans, &cl);
closure_sync(&cl);
} while (ret);
b = bch2_btree_node_mem_alloc(trans, false);
- bch2_btree_cache_cannibalize_unlock(c);
+ bch2_btree_cache_cannibalize_unlock(trans);
set_btree_node_fake(b);
set_btree_node_need_rewrite(b);
diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h
index a6668992a272..c593c925d1e3 100644
--- a/fs/bcachefs/btree_update_interior.h
+++ b/fs/bcachefs/btree_update_interior.h
@@ -117,16 +117,17 @@ struct btree *__bch2_btree_node_alloc_replacement(struct btree_update *,
struct btree *,
struct bkey_format);
-int bch2_btree_split_leaf(struct btree_trans *, struct btree_path *, unsigned);
+int bch2_btree_split_leaf(struct btree_trans *, btree_path_idx_t, unsigned);
-int __bch2_foreground_maybe_merge(struct btree_trans *, struct btree_path *,
+int __bch2_foreground_maybe_merge(struct btree_trans *, btree_path_idx_t,
unsigned, unsigned, enum btree_node_sibling);
static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path_idx,
unsigned level, unsigned flags,
enum btree_node_sibling sib)
{
+ struct btree_path *path = trans->paths + path_idx;
struct btree *b;
EBUG_ON(!btree_node_locked(path, level));
@@ -135,11 +136,11 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans,
if (b->sib_u64s[sib] > trans->c->btree_foreground_merge_threshold)
return 0;
- return __bch2_foreground_maybe_merge(trans, path, level, flags, sib);
+ return __bch2_foreground_maybe_merge(trans, path_idx, level, flags, sib);
}
static inline int bch2_foreground_maybe_merge(struct btree_trans *trans,
- struct btree_path *path,
+ btree_path_idx_t path,
unsigned level,
unsigned flags)
{
@@ -183,21 +184,19 @@ static inline void btree_node_reset_sib_u64s(struct btree *b)
b->sib_u64s[1] = b->nr.live_u64s;
}
-static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
+static inline void *btree_data_end(struct btree *b)
{
- return (void *) b->data + btree_bytes(c);
+ return (void *) b->data + btree_buf_bytes(b);
}
-static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
- struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_start(struct btree *b)
{
- return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+ return (void *) ((u64 *) btree_data_end(b) - b->whiteout_u64s);
}
-static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
- struct btree *b)
+static inline struct bkey_packed *unwritten_whiteouts_end(struct btree *b)
{
- return btree_data_end(c, b);
+ return btree_data_end(b);
}
static inline void *write_block(struct btree *b)
@@ -220,13 +219,11 @@ static inline bool bkey_written(struct btree *b, struct bkey_packed *k)
return __btree_addr_written(b, k);
}
-static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
- struct btree *b,
- void *end)
+static inline ssize_t __bch2_btree_u64s_remaining(struct btree *b, void *end)
{
ssize_t used = bset_byte_offset(b, end) / sizeof(u64) +
b->whiteout_u64s;
- ssize_t total = c->opts.btree_node_size >> 3;
+ ssize_t total = btree_buf_bytes(b) >> 3;
/* Always leave one extra u64 for bch2_varint_decode: */
used++;
@@ -234,10 +231,9 @@ static inline ssize_t __bch_btree_u64s_remaining(struct bch_fs *c,
return total - used;
}
-static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
- struct btree *b)
+static inline size_t bch2_btree_keys_u64s_remaining(struct btree *b)
{
- ssize_t remaining = __bch_btree_u64s_remaining(c, b,
+ ssize_t remaining = __bch2_btree_u64s_remaining(b,
btree_bkey_last(b, bset_tree_last(b)));
BUG_ON(remaining < 0);
@@ -259,14 +255,13 @@ static inline unsigned btree_write_set_buffer(struct btree *b)
return 8 << BTREE_WRITE_SET_U64s_BITS;
}
-static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
- struct btree *b)
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c, struct btree *b)
{
struct bset_tree *t = bset_tree_last(b);
struct btree_node_entry *bne = max(write_block(b),
(void *) btree_bkey_last(b, bset_tree_last(b)));
ssize_t remaining_space =
- __bch_btree_u64s_remaining(c, b, bne->keys.start);
+ __bch2_btree_u64s_remaining(b, bne->keys.start);
if (unlikely(bset_written(b, bset(b, t)))) {
if (remaining_space > (ssize_t) (block_bytes(c) >> 3))
@@ -280,12 +275,11 @@ static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
return NULL;
}
-static inline void push_whiteout(struct bch_fs *c, struct btree *b,
- struct bpos pos)
+static inline void push_whiteout(struct btree *b, struct bpos pos)
{
struct bkey_packed k;
- BUG_ON(bch_btree_keys_u64s_remaining(c, b) < BKEY_U64s);
+ BUG_ON(bch2_btree_keys_u64s_remaining(b) < BKEY_U64s);
EBUG_ON(btree_node_just_written(b));
if (!bkey_pack_pos(&k, pos, b)) {
@@ -298,20 +292,19 @@ static inline void push_whiteout(struct bch_fs *c, struct btree *b,
k.needs_whiteout = true;
b->whiteout_u64s += k.u64s;
- bkey_p_copy(unwritten_whiteouts_start(c, b), &k);
+ bkey_p_copy(unwritten_whiteouts_start(b), &k);
}
/*
* write lock must be held on @b (else the dirty bset that we were going to
* insert into could be written out from under us)
*/
-static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
- struct btree *b, unsigned u64s)
+static inline bool bch2_btree_node_insert_fits(struct btree *b, unsigned u64s)
{
if (unlikely(btree_node_need_rewrite(b)))
return false;
- return u64s <= bch_btree_keys_u64s_remaining(c, b);
+ return u64s <= bch2_btree_keys_u64s_remaining(b);
}
void bch2_btree_updates_to_text(struct printbuf *, struct bch_fs *);
diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c
index 4e6241db518b..ac7844861966 100644
--- a/fs/bcachefs/btree_write_buffer.c
+++ b/fs/bcachefs/btree_write_buffer.c
@@ -7,45 +7,143 @@
#include "btree_write_buffer.h"
#include "error.h"
#include "journal.h"
+#include "journal_io.h"
#include "journal_reclaim.h"
-#include <linux/sort.h>
+#include <linux/prefetch.h>
-static int btree_write_buffered_key_cmp(const void *_l, const void *_r)
+static int bch2_btree_write_buffer_journal_flush(struct journal *,
+ struct journal_entry_pin *, u64);
+
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *, struct journal_buf *);
+
+static inline bool __wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
+ return (cmp_int(l->hi, r->hi) ?:
+ cmp_int(l->mi, r->mi) ?:
+ cmp_int(l->lo, r->lo)) >= 0;
+}
- return cmp_int(l->btree, r->btree) ?:
- bpos_cmp(l->k.k.p, r->k.k.p) ?:
- cmp_int(l->journal_seq, r->journal_seq) ?:
- cmp_int(l->journal_offset, r->journal_offset);
+static inline bool wb_key_ref_cmp(const struct wb_key_ref *l, const struct wb_key_ref *r)
+{
+#ifdef CONFIG_X86_64
+ int cmp;
+
+ asm("mov (%[l]), %%rax;"
+ "sub (%[r]), %%rax;"
+ "mov 8(%[l]), %%rax;"
+ "sbb 8(%[r]), %%rax;"
+ "mov 16(%[l]), %%rax;"
+ "sbb 16(%[r]), %%rax;"
+ : "=@ccae" (cmp)
+ : [l] "r" (l), [r] "r" (r)
+ : "rax", "cc");
+
+ EBUG_ON(cmp != __wb_key_ref_cmp(l, r));
+ return cmp;
+#else
+ return __wb_key_ref_cmp(l, r);
+#endif
}
-static int btree_write_buffered_journal_cmp(const void *_l, const void *_r)
+/* Compare excluding idx, the low 24 bits: */
+static inline bool wb_key_eq(const void *_l, const void *_r)
{
- const struct btree_write_buffered_key *l = _l;
- const struct btree_write_buffered_key *r = _r;
+ const struct wb_key_ref *l = _l;
+ const struct wb_key_ref *r = _r;
- return cmp_int(l->journal_seq, r->journal_seq);
+ return !((l->hi ^ r->hi)|
+ (l->mi ^ r->mi)|
+ ((l->lo >> 24) ^ (r->lo >> 24)));
}
-static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
- struct btree_iter *iter,
- struct btree_write_buffered_key *wb,
- unsigned commit_flags,
- bool *write_locked,
- size_t *fast)
+static noinline void wb_sort(struct wb_key_ref *base, size_t num)
+{
+ size_t n = num, a = num / 2;
+
+ if (!a) /* num < 2 || size == 0 */
+ return;
+
+ for (;;) {
+ size_t b, c, d;
+
+ if (a) /* Building heap: sift down --a */
+ --a;
+ else if (--n) /* Sorting: Extract root to --n */
+ swap(base[0], base[n]);
+ else /* Sort complete */
+ break;
+
+ /*
+ * Sift element at "a" down into heap. This is the
+ * "bottom-up" variant, which significantly reduces
+ * calls to cmp_func(): we find the sift-down path all
+ * the way to the leaves (one compare per level), then
+ * backtrack to find where to insert the target element.
+ *
+ * Because elements tend to sift down close to the leaves,
+ * this uses fewer compares than doing two per level
+ * on the way down. (A bit more than half as many on
+ * average, 3/4 worst-case.)
+ */
+ for (b = a; c = 2*b + 1, (d = c + 1) < n;)
+ b = wb_key_ref_cmp(base + c, base + d) ? c : d;
+ if (d == n) /* Special case last leaf with no sibling */
+ b = c;
+
+ /* Now backtrack from "b" to the correct location for "a" */
+ while (b != a && wb_key_ref_cmp(base + a, base + b))
+ b = (b - 1) / 2;
+ c = b; /* Where "a" belongs */
+ while (b != a) { /* Shift it into place */
+ b = (b - 1) / 2;
+ swap(base[b], base[c]);
+ }
+ }
+}
+
+static noinline int wb_flush_one_slowpath(struct btree_trans *trans,
+ struct btree_iter *iter,
+ struct btree_write_buffered_key *wb)
+{
+ struct btree_path *path = btree_iter_path(trans, iter);
+
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+
+ trans->journal_res.seq = wb->journal_seq;
+
+ return bch2_trans_update(trans, iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
+ bch2_trans_commit(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim);
+}
+
+static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *iter,
+ struct btree_write_buffered_key *wb,
+ bool *write_locked, size_t *fast)
{
- struct bch_fs *c = trans->c;
struct btree_path *path;
int ret;
+ EBUG_ON(!wb->journal_seq);
+ EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq);
+ EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq);
+
ret = bch2_btree_iter_traverse(iter);
if (ret)
return ret;
- path = iter->path;
+ /*
+ * We can't clone a path that has write locks: unshare it now, before
+ * set_pos and traverse():
+ */
+ if (btree_iter_path(trans, iter)->ref > 1)
+ iter->path = __bch2_btree_path_make_mut(trans, iter->path, true, _THIS_IP_);
+
+ path = btree_iter_path(trans, iter);
if (!*write_locked) {
ret = bch2_btree_node_lock_write(trans, path, &path->l[0].b->c);
@@ -56,52 +154,14 @@ static int bch2_btree_write_buffer_flush_one(struct btree_trans *trans,
*write_locked = true;
}
- if (!bch2_btree_node_insert_fits(c, path->l[0].b, wb->k.k.u64s)) {
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ if (unlikely(!bch2_btree_node_insert_fits(path->l[0].b, wb->k.k.u64s))) {
*write_locked = false;
- goto trans_commit;
+ return wb_flush_one_slowpath(trans, iter, wb);
}
bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq);
(*fast)++;
-
- if (path->ref > 1) {
- /*
- * We can't clone a path that has write locks: if the path is
- * shared, unlock before set_pos(), traverse():
- */
- bch2_btree_node_unlock_write(trans, path, path->l[0].b);
- *write_locked = false;
- }
return 0;
-trans_commit:
- return bch2_trans_update_seq(trans, wb->journal_seq, iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(trans, NULL, NULL,
- commit_flags|
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM);
-}
-
-static union btree_write_buffer_state btree_write_buffer_switch(struct btree_write_buffer *wb)
-{
- union btree_write_buffer_state old, new;
- u64 v = READ_ONCE(wb->state.v);
-
- do {
- old.v = new.v = v;
-
- new.nr = 0;
- new.idx++;
- } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
-
- while (old.idx == 0 ? wb->state.ref0 : wb->state.ref1)
- cpu_relax();
-
- smp_mb();
-
- return old;
}
/*
@@ -124,41 +184,87 @@ btree_write_buffered_insert(struct btree_trans *trans,
bch2_trans_iter_init(trans, &iter, wb->btree, bkey_start_pos(&wb->k.k),
BTREE_ITER_CACHED|BTREE_ITER_INTENT);
+ trans->journal_res.seq = wb->journal_seq;
+
ret = bch2_btree_iter_traverse(&iter) ?:
- bch2_trans_update_seq(trans, wb->journal_seq, &iter, &wb->k,
- BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ bch2_trans_update(trans, &iter, &wb->k,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_flags,
- bool locked)
+static void move_keys_from_inc_to_flushing(struct btree_write_buffer *wb)
+{
+ struct bch_fs *c = container_of(wb, struct bch_fs, btree_write_buffer);
+ struct journal *j = &c->journal;
+
+ if (!wb->inc.keys.nr)
+ return;
+
+ bch2_journal_pin_add(j, wb->inc.keys.data[0].journal_seq, &wb->flushing.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ darray_resize(&wb->flushing.keys, min_t(size_t, 1U << 20, wb->flushing.keys.nr + wb->inc.keys.nr));
+ darray_resize(&wb->sorted, wb->flushing.keys.size);
+
+ if (!wb->flushing.keys.nr && wb->sorted.size >= wb->inc.keys.nr) {
+ swap(wb->flushing.keys, wb->inc.keys);
+ goto out;
+ }
+
+ size_t nr = min(darray_room(wb->flushing.keys),
+ wb->sorted.size - wb->flushing.keys.nr);
+ nr = min(nr, wb->inc.keys.nr);
+
+ memcpy(&darray_top(wb->flushing.keys),
+ wb->inc.keys.data,
+ sizeof(wb->inc.keys.data[0]) * nr);
+
+ memmove(wb->inc.keys.data,
+ wb->inc.keys.data + nr,
+ sizeof(wb->inc.keys.data[0]) * (wb->inc.keys.nr - nr));
+
+ wb->flushing.keys.nr += nr;
+ wb->inc.keys.nr -= nr;
+out:
+ if (!wb->inc.keys.nr)
+ bch2_journal_pin_drop(j, &wb->inc.pin);
+ else
+ bch2_journal_pin_update(j, wb->inc.keys.data[0].journal_seq, &wb->inc.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ if (j->watermark) {
+ spin_lock(&j->lock);
+ bch2_journal_set_watermark(j);
+ spin_unlock(&j->lock);
+ }
+
+ BUG_ON(wb->sorted.size < wb->flushing.keys.nr);
+}
+
+static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
struct journal *j = &c->journal;
struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct journal_entry_pin pin;
- struct btree_write_buffered_key *i, *keys;
struct btree_iter iter = { NULL };
- size_t nr = 0, skipped = 0, fast = 0, slowpath = 0;
+ size_t skipped = 0, fast = 0, slowpath = 0;
bool write_locked = false;
- union btree_write_buffer_state s;
int ret = 0;
- memset(&pin, 0, sizeof(pin));
-
- if (!locked && !mutex_trylock(&wb->flush_lock))
- return 0;
-
- bch2_journal_pin_copy(j, &pin, &wb->journal_pin, NULL);
- bch2_journal_pin_drop(j, &wb->journal_pin);
+ bch2_trans_unlock(trans);
+ bch2_trans_begin(trans);
- s = btree_write_buffer_switch(wb);
- keys = wb->keys[s.idx];
- nr = s.nr;
+ mutex_lock(&wb->inc.lock);
+ move_keys_from_inc_to_flushing(wb);
+ mutex_unlock(&wb->inc.lock);
- if (race_fault())
- goto slowpath;
+ for (size_t i = 0; i < wb->flushing.keys.nr; i++) {
+ wb->sorted.data[i].idx = i;
+ wb->sorted.data[i].btree = wb->flushing.keys.data[i].btree;
+ memcpy(&wb->sorted.data[i].pos, &wb->flushing.keys.data[i].k.k.p, sizeof(struct bpos));
+ }
+ wb->sorted.nr = wb->flushing.keys.nr;
/*
* We first sort so that we can detect and skip redundant updates, and
@@ -168,208 +274,373 @@ int __bch2_btree_write_buffer_flush(struct btree_trans *trans, unsigned commit_f
* However, since we're not flushing in the order they appear in the
* journal we won't be able to drop our journal pin until everything is
* flushed - which means this could deadlock the journal if we weren't
- * passing BTREE_INSERT_JOURNAL_RECLAIM. This causes the update to fail
+ * passing BCH_TRANS_COMMIT_journal_reclaim. This causes the update to fail
* if it would block taking a journal reservation.
*
* If that happens, simply skip the key so we can optimistically insert
* as many keys as possible in the fast path.
*/
- sort(keys, nr, sizeof(keys[0]),
- btree_write_buffered_key_cmp, NULL);
+ wb_sort(wb->sorted.data, wb->sorted.nr);
+
+ darray_for_each(wb->sorted, i) {
+ struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx];
+
+ for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++)
+ prefetch(&wb->flushing.keys.data[n->idx]);
+
+ BUG_ON(!k->journal_seq);
+
+ if (i + 1 < &darray_top(wb->sorted) &&
+ wb_key_eq(i, i + 1)) {
+ struct btree_write_buffered_key *n = &wb->flushing.keys.data[i[1].idx];
- for (i = keys; i < keys + nr; i++) {
- if (i + 1 < keys + nr &&
- i[0].btree == i[1].btree &&
- bpos_eq(i[0].k.k.p, i[1].k.k.p)) {
skipped++;
- i->journal_seq = 0;
+ n->journal_seq = min_t(u64, n->journal_seq, k->journal_seq);
+ k->journal_seq = 0;
continue;
}
- if (write_locked &&
- (iter.path->btree_id != i->btree ||
- bpos_gt(i->k.k.p, iter.path->l[0].b->key.k.p))) {
- bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
- write_locked = false;
+ if (write_locked) {
+ struct btree_path *path = btree_iter_path(trans, &iter);
+
+ if (path->btree_id != i->btree ||
+ bpos_gt(k->k.k.p, path->l[0].b->key.k.p)) {
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ write_locked = false;
+ }
}
- if (!iter.path || iter.path->btree_id != i->btree) {
+ if (!iter.path || iter.btree_id != k->btree) {
bch2_trans_iter_exit(trans, &iter);
- bch2_trans_iter_init(trans, &iter, i->btree, i->k.k.p,
+ bch2_trans_iter_init(trans, &iter, k->btree, k->k.k.p,
BTREE_ITER_INTENT|BTREE_ITER_ALL_SNAPSHOTS);
}
- bch2_btree_iter_set_pos(&iter, i->k.k.p);
- iter.path->preserve = false;
+ bch2_btree_iter_set_pos(&iter, k->k.k.p);
+ btree_iter_path(trans, &iter)->preserve = false;
do {
- ret = bch2_btree_write_buffer_flush_one(trans, &iter, i,
- commit_flags, &write_locked, &fast);
+ if (race_fault()) {
+ ret = -BCH_ERR_journal_reclaim_would_deadlock;
+ break;
+ }
+
+ ret = wb_flush_one(trans, &iter, k, &write_locked, &fast);
if (!write_locked)
bch2_trans_begin(trans);
} while (bch2_err_matches(ret, BCH_ERR_transaction_restart));
- if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
+ if (!ret) {
+ k->journal_seq = 0;
+ } else if (ret == -BCH_ERR_journal_reclaim_would_deadlock) {
slowpath++;
- continue;
- }
- if (ret)
+ ret = 0;
+ } else
break;
-
- i->journal_seq = 0;
}
- if (write_locked)
- bch2_btree_node_unlock_write(trans, iter.path, iter.path->l[0].b);
+ if (write_locked) {
+ struct btree_path *path = btree_iter_path(trans, &iter);
+ bch2_btree_node_unlock_write(trans, path, path->l[0].b);
+ }
bch2_trans_iter_exit(trans, &iter);
- trace_write_buffer_flush(trans, nr, skipped, fast, wb->size);
-
- if (slowpath)
- goto slowpath;
+ if (ret)
+ goto err;
+ if (slowpath) {
+ /*
+ * Flush in the order they were present in the journal, so that
+ * we can release journal pins:
+ * The fastpath zapped the seq of keys that were successfully flushed so
+ * we can skip those here.
+ */
+ trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr);
+
+ darray_for_each(wb->flushing.keys, i) {
+ if (!i->journal_seq)
+ continue;
+
+ bch2_journal_pin_update(j, i->journal_seq, &wb->flushing.pin,
+ bch2_btree_write_buffer_journal_flush);
+
+ bch2_trans_begin(trans);
+
+ ret = commit_do(trans, NULL, NULL,
+ BCH_WATERMARK_reclaim|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_no_journal_res|
+ BCH_TRANS_COMMIT_journal_reclaim,
+ btree_write_buffered_insert(trans, i));
+ if (ret)
+ goto err;
+ }
+ }
+err:
bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret));
-out:
- bch2_journal_pin_drop(j, &pin);
- mutex_unlock(&wb->flush_lock);
+ trace_write_buffer_flush(trans, wb->flushing.keys.nr, skipped, fast, 0);
+ bch2_journal_pin_drop(j, &wb->flushing.pin);
+ wb->flushing.keys.nr = 0;
return ret;
-slowpath:
- trace_write_buffer_flush_slowpath(trans, i - keys, nr);
+}
- /*
- * Now sort the rest by journal seq and bump the journal pin as we go.
- * The slowpath zapped the seq of keys that were successfully flushed so
- * we can skip those here.
- */
- sort(keys, nr, sizeof(keys[0]),
- btree_write_buffered_journal_cmp,
- NULL);
+static int fetch_wb_keys_from_journal(struct bch_fs *c, u64 seq)
+{
+ struct journal *j = &c->journal;
+ struct journal_buf *buf;
+ int ret = 0;
- commit_flags &= ~BCH_WATERMARK_MASK;
- commit_flags |= BCH_WATERMARK_reclaim;
+ while (!ret && (buf = bch2_next_write_buffer_flush_journal_buf(j, seq))) {
+ ret = bch2_journal_keys_to_write_buffer(c, buf);
+ mutex_unlock(&j->buf_lock);
+ }
- for (i = keys; i < keys + nr; i++) {
- if (!i->journal_seq)
- continue;
+ return ret;
+}
- if (i->journal_seq > pin.seq) {
- struct journal_entry_pin pin2;
+static int btree_write_buffer_flush_seq(struct btree_trans *trans, u64 seq)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret = 0, fetch_from_journal_err;
- memset(&pin2, 0, sizeof(pin2));
+ do {
+ bch2_trans_unlock(trans);
- bch2_journal_pin_add(j, i->journal_seq, &pin2, NULL);
- bch2_journal_pin_drop(j, &pin);
- bch2_journal_pin_copy(j, &pin, &pin2, NULL);
- bch2_journal_pin_drop(j, &pin2);
- }
+ fetch_from_journal_err = fetch_wb_keys_from_journal(c, seq);
- ret = commit_do(trans, NULL, NULL,
- commit_flags|
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_JOURNAL_RECLAIM,
- btree_write_buffered_insert(trans, i));
- if (bch2_fs_fatal_err_on(ret, c, "%s: insert error %s", __func__, bch2_err_str(ret)))
- break;
- }
+ /*
+ * On memory allocation failure, bch2_btree_write_buffer_flush_locked()
+ * is not guaranteed to empty wb->inc:
+ */
+ mutex_lock(&wb->flushing.lock);
+ ret = bch2_btree_write_buffer_flush_locked(trans);
+ mutex_unlock(&wb->flushing.lock);
+ } while (!ret &&
+ (fetch_from_journal_err ||
+ (wb->inc.pin.seq && wb->inc.pin.seq <= seq) ||
+ (wb->flushing.pin.seq && wb->flushing.pin.seq <= seq)));
- goto out;
+ return ret;
}
-int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
+static int bch2_btree_write_buffer_journal_flush(struct journal *j,
+ struct journal_entry_pin *_pin, u64 seq)
{
- bch2_trans_unlock(trans);
- mutex_lock(&trans->c->btree_write_buffer.flush_lock);
- return __bch2_btree_write_buffer_flush(trans, 0, true);
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
+ return bch2_trans_run(c, btree_write_buffer_flush_seq(trans, seq));
}
-int bch2_btree_write_buffer_flush(struct btree_trans *trans)
+int bch2_btree_write_buffer_flush_sync(struct btree_trans *trans)
{
- return __bch2_btree_write_buffer_flush(trans, 0, false);
+ struct bch_fs *c = trans->c;
+
+ trace_and_count(c, write_buffer_flush_sync, trans, _RET_IP_);
+
+ return btree_write_buffer_flush_seq(trans, journal_cur_seq(&c->journal));
}
-static int bch2_btree_write_buffer_journal_flush(struct journal *j,
- struct journal_entry_pin *_pin, u64 seq)
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *trans)
{
- struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ struct bch_fs *c = trans->c;
struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret = 0;
- mutex_lock(&wb->flush_lock);
+ if (mutex_trylock(&wb->flushing.lock)) {
+ ret = bch2_btree_write_buffer_flush_locked(trans);
+ mutex_unlock(&wb->flushing.lock);
+ }
- return bch2_trans_run(c,
- __bch2_btree_write_buffer_flush(trans, BTREE_INSERT_NOCHECK_RW, true));
+ return ret;
}
-static inline u64 btree_write_buffer_ref(int idx)
+int bch2_btree_write_buffer_tryflush(struct btree_trans *trans)
{
- return ((union btree_write_buffer_state) {
- .ref0 = idx == 0,
- .ref1 = idx == 1,
- }).v;
+ struct bch_fs *c = trans->c;
+
+ if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer))
+ return -BCH_ERR_erofs_no_writes;
+
+ int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans);
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+ return ret;
}
-int bch2_btree_insert_keys_write_buffer(struct btree_trans *trans)
+static void bch2_btree_write_buffer_flush_work(struct work_struct *work)
{
- struct bch_fs *c = trans->c;
+ struct bch_fs *c = container_of(work, struct bch_fs, btree_write_buffer.flush_work);
struct btree_write_buffer *wb = &c->btree_write_buffer;
- struct btree_write_buffered_key *i;
- union btree_write_buffer_state old, new;
- int ret = 0;
- u64 v;
+ int ret;
+
+ mutex_lock(&wb->flushing.lock);
+ do {
+ ret = bch2_trans_run(c, bch2_btree_write_buffer_flush_locked(trans));
+ } while (!ret && bch2_btree_write_buffer_should_flush(c));
+ mutex_unlock(&wb->flushing.lock);
+
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+}
- trans_for_each_wb_update(trans, i) {
- EBUG_ON(i->k.k.u64s > BTREE_WRITE_BUFERED_U64s_MAX);
+int bch2_journal_key_to_wb_slowpath(struct bch_fs *c,
+ struct journal_keys_to_wb *dst,
+ enum btree_id btree, struct bkey_i *k)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+ int ret;
+retry:
+ ret = darray_make_room_gfp(&dst->wb->keys, 1, GFP_KERNEL);
+ if (!ret && dst->wb == &wb->flushing)
+ ret = darray_resize(&wb->sorted, wb->flushing.keys.size);
+
+ if (unlikely(ret)) {
+ if (dst->wb == &c->btree_write_buffer.flushing) {
+ mutex_unlock(&dst->wb->lock);
+ dst->wb = &c->btree_write_buffer.inc;
+ bch2_journal_pin_add(&c->journal, dst->seq, &dst->wb->pin,
+ bch2_btree_write_buffer_journal_flush);
+ goto retry;
+ }
- i->journal_seq = trans->journal_res.seq;
- i->journal_offset = trans->journal_res.offset;
+ return ret;
}
- preempt_disable();
- v = READ_ONCE(wb->state.v);
- do {
- old.v = new.v = v;
+ dst->room = darray_room(dst->wb->keys);
+ if (dst->wb == &wb->flushing)
+ dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+ BUG_ON(!dst->room);
+ BUG_ON(!dst->seq);
+
+ struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
+ wb_k->journal_seq = dst->seq;
+ wb_k->btree = btree;
+ bkey_copy(&wb_k->k, k);
+ dst->wb->keys.nr++;
+ dst->room--;
+ return 0;
+}
+
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *c, struct journal_keys_to_wb *dst, u64 seq)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ if (mutex_trylock(&wb->flushing.lock)) {
+ mutex_lock(&wb->inc.lock);
+ move_keys_from_inc_to_flushing(wb);
- new.v += btree_write_buffer_ref(new.idx);
- new.nr += trans->nr_wb_updates;
- if (new.nr > wb->size) {
- ret = -BCH_ERR_btree_insert_need_flush_buffer;
- goto out;
+ /*
+ * Attempt to skip wb->inc, and add keys directly to
+ * wb->flushing, saving us a copy later:
+ */
+
+ if (!wb->inc.keys.nr) {
+ dst->wb = &wb->flushing;
+ } else {
+ mutex_unlock(&wb->flushing.lock);
+ dst->wb = &wb->inc;
}
- } while ((v = atomic64_cmpxchg_acquire(&wb->state.counter, old.v, new.v)) != old.v);
+ } else {
+ mutex_lock(&wb->inc.lock);
+ dst->wb = &wb->inc;
+ }
- memcpy(wb->keys[new.idx] + old.nr,
- trans->wb_updates,
- sizeof(trans->wb_updates[0]) * trans->nr_wb_updates);
+ dst->room = darray_room(dst->wb->keys);
+ if (dst->wb == &wb->flushing)
+ dst->room = min(dst->room, wb->sorted.size - wb->flushing.keys.nr);
+ dst->seq = seq;
- bch2_journal_pin_add(&c->journal, trans->journal_res.seq, &wb->journal_pin,
+ bch2_journal_pin_add(&c->journal, seq, &dst->wb->pin,
bch2_btree_write_buffer_journal_flush);
+}
- atomic64_sub_return_release(btree_write_buffer_ref(new.idx), &wb->state.counter);
+void bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_to_wb *dst)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ if (!dst->wb->keys.nr)
+ bch2_journal_pin_drop(&c->journal, &dst->wb->pin);
+
+ if (bch2_btree_write_buffer_should_flush(c) &&
+ __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) &&
+ !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work))
+ bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer);
+
+ if (dst->wb == &wb->flushing)
+ mutex_unlock(&wb->flushing.lock);
+ mutex_unlock(&wb->inc.lock);
+}
+
+static int bch2_journal_keys_to_write_buffer(struct bch_fs *c, struct journal_buf *buf)
+{
+ struct journal_keys_to_wb dst;
+ struct jset_entry *entry;
+ struct bkey_i *k;
+ int ret = 0;
+
+ bch2_journal_keys_to_write_buffer_start(c, &dst, le64_to_cpu(buf->data->seq));
+
+ for_each_jset_entry_type(entry, buf->data, BCH_JSET_ENTRY_write_buffer_keys) {
+ jset_entry_for_each_key(entry, k) {
+ ret = bch2_journal_key_to_wb(c, &dst, entry->btree_id, k);
+ if (ret)
+ goto out;
+ }
+
+ entry->type = BCH_JSET_ENTRY_btree_keys;
+ }
+
+ buf->need_flush_to_write_buffer = false;
out:
- preempt_enable();
+ bch2_journal_keys_to_write_buffer_end(c, &dst);
+ return ret;
+}
+
+static int wb_keys_resize(struct btree_write_buffer_keys *wb, size_t new_size)
+{
+ if (wb->keys.size >= new_size)
+ return 0;
+
+ if (!mutex_trylock(&wb->lock))
+ return -EINTR;
+
+ int ret = darray_resize(&wb->keys, new_size);
+ mutex_unlock(&wb->lock);
return ret;
}
+int bch2_btree_write_buffer_resize(struct bch_fs *c, size_t new_size)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ return wb_keys_resize(&wb->flushing, new_size) ?:
+ wb_keys_resize(&wb->inc, new_size);
+}
+
void bch2_fs_btree_write_buffer_exit(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
- BUG_ON(wb->state.nr && !bch2_journal_error(&c->journal));
+ BUG_ON((wb->inc.keys.nr || wb->flushing.keys.nr) &&
+ !bch2_journal_error(&c->journal));
- kvfree(wb->keys[1]);
- kvfree(wb->keys[0]);
+ darray_exit(&wb->sorted);
+ darray_exit(&wb->flushing.keys);
+ darray_exit(&wb->inc.keys);
}
int bch2_fs_btree_write_buffer_init(struct bch_fs *c)
{
struct btree_write_buffer *wb = &c->btree_write_buffer;
- mutex_init(&wb->flush_lock);
- wb->size = c->opts.btree_write_buffer_size;
+ mutex_init(&wb->inc.lock);
+ mutex_init(&wb->flushing.lock);
+ INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work);
- wb->keys[0] = kvmalloc_array(wb->size, sizeof(*wb->keys[0]), GFP_KERNEL);
- wb->keys[1] = kvmalloc_array(wb->size, sizeof(*wb->keys[1]), GFP_KERNEL);
- if (!wb->keys[0] || !wb->keys[1])
- return -BCH_ERR_ENOMEM_fs_btree_write_buffer_init;
+ /* Will be resized by journal as needed: */
+ unsigned initial_size = 1 << 16;
- return 0;
+ return darray_make_room(&wb->inc.keys, initial_size) ?:
+ darray_make_room(&wb->flushing.keys, initial_size) ?:
+ darray_make_room(&wb->sorted, initial_size);
}
diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h
index 322df1c8304e..eebcd2b15249 100644
--- a/fs/bcachefs/btree_write_buffer.h
+++ b/fs/bcachefs/btree_write_buffer.h
@@ -2,12 +2,59 @@
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_H
-int __bch2_btree_write_buffer_flush(struct btree_trans *, unsigned, bool);
+#include "bkey.h"
+
+static inline bool bch2_btree_write_buffer_should_flush(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ return wb->inc.keys.nr + wb->flushing.keys.nr > wb->inc.keys.size / 4;
+}
+
+static inline bool bch2_btree_write_buffer_must_wait(struct bch_fs *c)
+{
+ struct btree_write_buffer *wb = &c->btree_write_buffer;
+
+ return wb->inc.keys.nr > wb->inc.keys.size * 3 / 4;
+}
+
+struct btree_trans;
int bch2_btree_write_buffer_flush_sync(struct btree_trans *);
-int bch2_btree_write_buffer_flush(struct btree_trans *);
+int bch2_btree_write_buffer_flush_nocheck_rw(struct btree_trans *);
+int bch2_btree_write_buffer_tryflush(struct btree_trans *);
+
+struct journal_keys_to_wb {
+ struct btree_write_buffer_keys *wb;
+ size_t room;
+ u64 seq;
+};
+
+int bch2_journal_key_to_wb_slowpath(struct bch_fs *,
+ struct journal_keys_to_wb *,
+ enum btree_id, struct bkey_i *);
+
+static inline int bch2_journal_key_to_wb(struct bch_fs *c,
+ struct journal_keys_to_wb *dst,
+ enum btree_id btree, struct bkey_i *k)
+{
+ EBUG_ON(!dst->seq);
+
+ if (unlikely(!dst->room))
+ return bch2_journal_key_to_wb_slowpath(c, dst, btree, k);
+
+ struct btree_write_buffered_key *wb_k = &darray_top(dst->wb->keys);
+ wb_k->journal_seq = dst->seq;
+ wb_k->btree = btree;
+ bkey_copy(&wb_k->k, k);
+ dst->wb->keys.nr++;
+ dst->room--;
+ return 0;
+}
-int bch2_btree_insert_keys_write_buffer(struct btree_trans *);
+void bch2_journal_keys_to_write_buffer_start(struct bch_fs *, struct journal_keys_to_wb *, u64);
+void bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_to_wb *);
+int bch2_btree_write_buffer_resize(struct bch_fs *, size_t);
void bch2_fs_btree_write_buffer_exit(struct bch_fs *);
int bch2_fs_btree_write_buffer_init(struct bch_fs *);
diff --git a/fs/bcachefs/btree_write_buffer_types.h b/fs/bcachefs/btree_write_buffer_types.h
index 99993ba77aea..9b9433de9c36 100644
--- a/fs/bcachefs/btree_write_buffer_types.h
+++ b/fs/bcachefs/btree_write_buffer_types.h
@@ -2,43 +2,56 @@
#ifndef _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
#define _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H
+#include "darray.h"
#include "journal_types.h"
#define BTREE_WRITE_BUFERED_VAL_U64s_MAX 4
#define BTREE_WRITE_BUFERED_U64s_MAX (BKEY_U64s + BTREE_WRITE_BUFERED_VAL_U64s_MAX)
-struct btree_write_buffered_key {
- u64 journal_seq;
- unsigned journal_offset;
- enum btree_id btree;
- __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
-};
-
-union btree_write_buffer_state {
+struct wb_key_ref {
+union {
struct {
- atomic64_t counter;
- };
-
- struct {
- u64 v;
- };
-
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ unsigned idx:24;
+ u8 pos[sizeof(struct bpos)];
+ enum btree_id btree:8;
+#else
+ enum btree_id btree:8;
+ u8 pos[sizeof(struct bpos)];
+ unsigned idx:24;
+#endif
+ } __packed;
struct {
- u64 nr:23;
- u64 idx:1;
- u64 ref0:20;
- u64 ref1:20;
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
+ u64 lo;
+ u64 mi;
+ u64 hi;
+#else
+ u64 hi;
+ u64 mi;
+ u64 lo;
+#endif
};
};
+};
-struct btree_write_buffer {
- struct mutex flush_lock;
- struct journal_entry_pin journal_pin;
+struct btree_write_buffered_key {
+ enum btree_id btree:8;
+ u64 journal_seq:56;
+ __BKEY_PADDED(k, BTREE_WRITE_BUFERED_VAL_U64s_MAX);
+};
- union btree_write_buffer_state state;
- size_t size;
+struct btree_write_buffer_keys {
+ DARRAY(struct btree_write_buffered_key) keys;
+ struct journal_entry_pin pin;
+ struct mutex lock;
+};
- struct btree_write_buffered_key *keys[2];
+struct btree_write_buffer {
+ DARRAY(struct wb_key_ref) sorted;
+ struct btree_write_buffer_keys inc;
+ struct btree_write_buffer_keys flushing;
+ struct work_struct flush_work;
};
#endif /* _BCACHEFS_BTREE_WRITE_BUFFER_TYPES_H */
diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c
index 5a91d3189fcf..54f7826ac498 100644
--- a/fs/bcachefs/buckets.c
+++ b/fs/bcachefs/buckets.c
@@ -25,7 +25,7 @@
#include <linux/preempt.h>
-static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
+static inline void fs_usage_data_type_to_base(struct bch_fs_usage_base *fs_usage,
enum bch_data_type data_type,
s64 sectors)
{
@@ -47,31 +47,27 @@ static inline void fs_usage_data_type_to_base(struct bch_fs_usage *fs_usage,
void bch2_fs_usage_initialize(struct bch_fs *c)
{
- struct bch_fs_usage *usage;
- struct bch_dev *ca;
- unsigned i;
-
percpu_down_write(&c->mark_lock);
- usage = c->usage_base;
+ struct bch_fs_usage *usage = c->usage_base;
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
- for (i = 0; i < BCH_REPLICAS_MAX; i++)
- usage->reserved += usage->persistent_reserved[i];
+ for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++)
+ usage->b.reserved += usage->persistent_reserved[i];
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ for (unsigned i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
- fs_usage_data_type_to_base(usage, e->data_type, usage->replicas[i]);
+ fs_usage_data_type_to_base(&usage->b, e->data_type, usage->replicas[i]);
}
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bch_dev_usage dev = bch2_dev_usage_read(ca);
- usage->hidden += (dev.d[BCH_DATA_sb].buckets +
- dev.d[BCH_DATA_journal].buckets) *
+ usage->b.hidden += (dev.d[BCH_DATA_sb].buckets +
+ dev.d[BCH_DATA_journal].buckets) *
ca->mi.bucket_size;
}
@@ -158,8 +154,7 @@ retry:
void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
{
- struct bch_dev *ca;
- unsigned i, u64s = fs_usage_u64s(c);
+ unsigned u64s = fs_usage_u64s(c);
BUG_ON(idx >= ARRAY_SIZE(c->usage));
@@ -171,7 +166,7 @@ void bch2_fs_usage_acc_to_base(struct bch_fs *c, unsigned idx)
percpu_memset(c->usage[idx], 0, u64s * sizeof(u64));
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL) {
+ for_each_member_device_rcu(c, ca, NULL) {
u64s = dev_usage_u64s();
acc_u64s_percpu((u64 *) ca->usage_base,
@@ -193,15 +188,15 @@ void bch2_fs_usage_to_text(struct printbuf *out,
prt_printf(out, "capacity:\t\t\t%llu\n", c->capacity);
prt_printf(out, "hidden:\t\t\t\t%llu\n",
- fs_usage->u.hidden);
+ fs_usage->u.b.hidden);
prt_printf(out, "data:\t\t\t\t%llu\n",
- fs_usage->u.data);
+ fs_usage->u.b.data);
prt_printf(out, "cached:\t\t\t\t%llu\n",
- fs_usage->u.cached);
+ fs_usage->u.b.cached);
prt_printf(out, "reserved:\t\t\t%llu\n",
- fs_usage->u.reserved);
+ fs_usage->u.b.reserved);
prt_printf(out, "nr_inodes:\t\t\t%llu\n",
- fs_usage->u.nr_inodes);
+ fs_usage->u.b.nr_inodes);
prt_printf(out, "online reserved:\t\t%llu\n",
fs_usage->online_reserved);
@@ -214,7 +209,7 @@ void bch2_fs_usage_to_text(struct printbuf *out,
}
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
prt_printf(out, "\t");
@@ -230,10 +225,10 @@ static u64 reserve_factor(u64 r)
u64 bch2_fs_sectors_used(struct bch_fs *c, struct bch_fs_usage_online *fs_usage)
{
- return min(fs_usage->u.hidden +
- fs_usage->u.btree +
- fs_usage->u.data +
- reserve_factor(fs_usage->u.reserved +
+ return min(fs_usage->u.b.hidden +
+ fs_usage->u.b.btree +
+ fs_usage->u.b.data +
+ reserve_factor(fs_usage->u.b.reserved +
fs_usage->online_reserved),
c->capacity);
}
@@ -245,17 +240,17 @@ __bch2_fs_usage_read_short(struct bch_fs *c)
u64 data, reserved;
ret.capacity = c->capacity -
- bch2_fs_usage_read_one(c, &c->usage_base->hidden);
+ bch2_fs_usage_read_one(c, &c->usage_base->b.hidden);
- data = bch2_fs_usage_read_one(c, &c->usage_base->data) +
- bch2_fs_usage_read_one(c, &c->usage_base->btree);
- reserved = bch2_fs_usage_read_one(c, &c->usage_base->reserved) +
+ data = bch2_fs_usage_read_one(c, &c->usage_base->b.data) +
+ bch2_fs_usage_read_one(c, &c->usage_base->b.btree);
+ reserved = bch2_fs_usage_read_one(c, &c->usage_base->b.reserved) +
percpu_u64_get(c->online_reserved);
ret.used = min(ret.capacity, data + reserve_factor(reserved));
ret.free = ret.capacity - ret.used;
- ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->nr_inodes);
+ ret.nr_inodes = bch2_fs_usage_read_one(c, &c->usage_base->b.nr_inodes);
return ret;
}
@@ -277,18 +272,34 @@ void bch2_dev_usage_init(struct bch_dev *ca)
ca->usage_base->d[BCH_DATA_free].buckets = ca->mi.nbuckets - ca->mi.first_bucket;
}
-static inline int bucket_sectors_fragmented(struct bch_dev *ca,
- struct bch_alloc_v4 a)
+void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev_usage *usage)
{
- return a.dirty_sectors
- ? max(0, (int) ca->mi.bucket_size - (int) a.dirty_sectors)
- : 0;
+ prt_tab(out);
+ prt_str(out, "buckets");
+ prt_tab_rjust(out);
+ prt_str(out, "sectors");
+ prt_tab_rjust(out);
+ prt_str(out, "fragmented");
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ for (unsigned i = 0; i < BCH_DATA_NR; i++) {
+ bch2_prt_data_type(out, i);
+ prt_tab(out);
+ prt_u64(out, usage->d[i].buckets);
+ prt_tab_rjust(out);
+ prt_u64(out, usage->d[i].sectors);
+ prt_tab_rjust(out);
+ prt_u64(out, usage->d[i].fragmented);
+ prt_tab_rjust(out);
+ prt_newline(out);
+ }
}
-static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
- struct bch_alloc_v4 old,
- struct bch_alloc_v4 new,
- u64 journal_seq, bool gc)
+void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
+ const struct bch_alloc_v4 *old,
+ const struct bch_alloc_v4 *new,
+ u64 journal_seq, bool gc)
{
struct bch_fs_usage *fs_usage;
struct bch_dev_usage *u;
@@ -296,56 +307,51 @@ static void bch2_dev_usage_update(struct bch_fs *c, struct bch_dev *ca,
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
- if (data_type_is_hidden(old.data_type))
- fs_usage->hidden -= ca->mi.bucket_size;
- if (data_type_is_hidden(new.data_type))
- fs_usage->hidden += ca->mi.bucket_size;
+ if (data_type_is_hidden(old->data_type))
+ fs_usage->b.hidden -= ca->mi.bucket_size;
+ if (data_type_is_hidden(new->data_type))
+ fs_usage->b.hidden += ca->mi.bucket_size;
u = dev_usage_ptr(ca, journal_seq, gc);
- u->d[old.data_type].buckets--;
- u->d[new.data_type].buckets++;
-
- u->buckets_ec -= (int) !!old.stripe;
- u->buckets_ec += (int) !!new.stripe;
+ u->d[old->data_type].buckets--;
+ u->d[new->data_type].buckets++;
- u->d[old.data_type].sectors -= old.dirty_sectors;
- u->d[new.data_type].sectors += new.dirty_sectors;
+ u->d[old->data_type].sectors -= bch2_bucket_sectors_dirty(*old);
+ u->d[new->data_type].sectors += bch2_bucket_sectors_dirty(*new);
- u->d[BCH_DATA_cached].sectors += new.cached_sectors;
- u->d[BCH_DATA_cached].sectors -= old.cached_sectors;
+ u->d[BCH_DATA_cached].sectors += new->cached_sectors;
+ u->d[BCH_DATA_cached].sectors -= old->cached_sectors;
- u->d[old.data_type].fragmented -= bucket_sectors_fragmented(ca, old);
- u->d[new.data_type].fragmented += bucket_sectors_fragmented(ca, new);
+ u->d[old->data_type].fragmented -= bch2_bucket_sectors_fragmented(ca, *old);
+ u->d[new->data_type].fragmented += bch2_bucket_sectors_fragmented(ca, *new);
preempt_enable();
}
-static void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
- struct bucket old, struct bucket new,
- u64 journal_seq, bool gc)
+static inline struct bch_alloc_v4 bucket_m_to_alloc(struct bucket b)
{
- struct bch_alloc_v4 old_a = {
- .gen = old.gen,
- .data_type = old.data_type,
- .dirty_sectors = old.dirty_sectors,
- .cached_sectors = old.cached_sectors,
- .stripe = old.stripe,
- };
- struct bch_alloc_v4 new_a = {
- .gen = new.gen,
- .data_type = new.data_type,
- .dirty_sectors = new.dirty_sectors,
- .cached_sectors = new.cached_sectors,
- .stripe = new.stripe,
+ return (struct bch_alloc_v4) {
+ .gen = b.gen,
+ .data_type = b.data_type,
+ .dirty_sectors = b.dirty_sectors,
+ .cached_sectors = b.cached_sectors,
+ .stripe = b.stripe,
};
+}
- bch2_dev_usage_update(c, ca, old_a, new_a, journal_seq, gc);
+void bch2_dev_usage_update_m(struct bch_fs *c, struct bch_dev *ca,
+ struct bucket *old, struct bucket *new)
+{
+ struct bch_alloc_v4 old_a = bucket_m_to_alloc(*old);
+ struct bch_alloc_v4 new_a = bucket_m_to_alloc(*new);
+
+ bch2_dev_usage_update(c, ca, &old_a, &new_a, 0, true);
}
static inline int __update_replicas(struct bch_fs *c,
struct bch_fs_usage *fs_usage,
- struct bch_replicas_entry *r,
+ struct bch_replicas_entry_v1 *r,
s64 sectors)
{
int idx = bch2_replicas_entry_idx(c, r);
@@ -353,14 +359,14 @@ static inline int __update_replicas(struct bch_fs *c,
if (idx < 0)
return -1;
- fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
return 0;
}
-static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
- struct bch_replicas_entry *r, s64 sectors,
- unsigned journal_seq, bool gc)
+int bch2_update_replicas(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_replicas_entry_v1 *r, s64 sectors,
+ unsigned journal_seq, bool gc)
{
struct bch_fs_usage *fs_usage;
int idx, ret = 0;
@@ -388,7 +394,7 @@ static inline int update_replicas(struct bch_fs *c, struct bkey_s_c k,
preempt_disable();
fs_usage = fs_usage_ptr(c, journal_seq, gc);
- fs_usage_data_type_to_base(fs_usage, r->data_type, sectors);
+ fs_usage_data_type_to_base(&fs_usage->b, r->data_type, sectors);
fs_usage->replicas[idx] += sectors;
preempt_enable();
err:
@@ -407,7 +413,7 @@ static inline int update_cached_sectors(struct bch_fs *c,
bch2_replicas_entry_cached(&r.e, dev);
- return update_replicas(c, k, &r.e, sectors, journal_seq, gc);
+ return bch2_update_replicas(c, k, &r.e, sectors, journal_seq, gc);
}
static int __replicas_deltas_realloc(struct btree_trans *trans, unsigned more,
@@ -453,9 +459,9 @@ int bch2_replicas_deltas_realloc(struct btree_trans *trans, unsigned more)
__replicas_deltas_realloc(trans, more, _gfp));
}
-static inline int update_replicas_list(struct btree_trans *trans,
- struct bch_replicas_entry *r,
- s64 sectors)
+int bch2_update_replicas_list(struct btree_trans *trans,
+ struct bch_replicas_entry_v1 *r,
+ s64 sectors)
{
struct replicas_delta_list *d;
struct replicas_delta *n;
@@ -481,139 +487,13 @@ static inline int update_replicas_list(struct btree_trans *trans,
return 0;
}
-static inline int update_cached_sectors_list(struct btree_trans *trans,
- unsigned dev, s64 sectors)
+int bch2_update_cached_sectors_list(struct btree_trans *trans, unsigned dev, s64 sectors)
{
struct bch_replicas_padded r;
bch2_replicas_entry_cached(&r.e, dev);
- return update_replicas_list(trans, &r.e, sectors);
-}
-
-int bch2_mark_alloc(struct btree_trans *trans,
- enum btree_id btree, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- bool gc = flags & BTREE_TRIGGER_GC;
- u64 journal_seq = trans->journal_res.seq;
- u64 bucket_journal_seq;
- struct bch_fs *c = trans->c;
- struct bch_alloc_v4 old_a_convert, new_a_convert;
- const struct bch_alloc_v4 *old_a, *new_a;
- struct bch_dev *ca;
- int ret = 0;
-
- /*
- * alloc btree is read in by bch2_alloc_read, not gc:
- */
- if ((flags & BTREE_TRIGGER_GC) &&
- !(flags & BTREE_TRIGGER_BUCKET_INVALIDATE))
- return 0;
-
- if (bch2_trans_inconsistent_on(!bch2_dev_bucket_exists(c, new.k->p), trans,
- "alloc key for invalid device or bucket"))
- return -EIO;
-
- ca = bch_dev_bkey_exists(c, new.k->p.inode);
-
- old_a = bch2_alloc_to_v4(old, &old_a_convert);
- new_a = bch2_alloc_to_v4(new, &new_a_convert);
-
- bucket_journal_seq = new_a->journal_seq;
-
- if ((flags & BTREE_TRIGGER_INSERT) &&
- data_type_is_empty(old_a->data_type) !=
- data_type_is_empty(new_a->data_type) &&
- new.k->type == KEY_TYPE_alloc_v4) {
- struct bch_alloc_v4 *v = (struct bch_alloc_v4 *) new.v;
-
- EBUG_ON(!journal_seq);
-
- /*
- * If the btree updates referring to a bucket weren't flushed
- * before the bucket became empty again, then the we don't have
- * to wait on a journal flush before we can reuse the bucket:
- */
- v->journal_seq = bucket_journal_seq =
- data_type_is_empty(new_a->data_type) &&
- (journal_seq == v->journal_seq ||
- bch2_journal_noflush_seq(&c->journal, v->journal_seq))
- ? 0 : journal_seq;
- }
-
- if (!data_type_is_empty(old_a->data_type) &&
- data_type_is_empty(new_a->data_type) &&
- bucket_journal_seq) {
- ret = bch2_set_bucket_needs_journal_commit(&c->buckets_waiting_for_journal,
- c->journal.flushed_seq_ondisk,
- new.k->p.inode, new.k->p.offset,
- bucket_journal_seq);
- if (ret) {
- bch2_fs_fatal_error(c,
- "error setting bucket_needs_journal_commit: %i", ret);
- return ret;
- }
- }
-
- percpu_down_read(&c->mark_lock);
- if (!gc && new_a->gen != old_a->gen)
- *bucket_gen(ca, new.k->p.offset) = new_a->gen;
-
- bch2_dev_usage_update(c, ca, *old_a, *new_a, journal_seq, gc);
-
- if (gc) {
- struct bucket *g = gc_bucket(ca, new.k->p.offset);
-
- bucket_lock(g);
-
- g->gen_valid = 1;
- g->gen = new_a->gen;
- g->data_type = new_a->data_type;
- g->stripe = new_a->stripe;
- g->stripe_redundancy = new_a->stripe_redundancy;
- g->dirty_sectors = new_a->dirty_sectors;
- g->cached_sectors = new_a->cached_sectors;
-
- bucket_unlock(g);
- }
- percpu_up_read(&c->mark_lock);
-
- /*
- * need to know if we're getting called from the invalidate path or
- * not:
- */
-
- if ((flags & BTREE_TRIGGER_BUCKET_INVALIDATE) &&
- old_a->cached_sectors) {
- ret = update_cached_sectors(c, new, ca->dev_idx,
- -((s64) old_a->cached_sectors),
- journal_seq, gc);
- if (ret) {
- bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
- __func__);
- return ret;
- }
- }
-
- if (new_a->data_type == BCH_DATA_free &&
- (!new_a->journal_seq || new_a->journal_seq < c->journal.flushed_seq_ondisk))
- closure_wake_up(&c->freelist_wait);
-
- if (new_a->data_type == BCH_DATA_need_discard &&
- (!bucket_journal_seq || bucket_journal_seq < c->journal.flushed_seq_ondisk))
- bch2_do_discards(c);
-
- if (old_a->data_type != BCH_DATA_cached &&
- new_a->data_type == BCH_DATA_cached &&
- should_invalidate_buckets(ca, bch2_dev_usage_read(ca)))
- bch2_do_invalidates(c);
-
- if (new_a->data_type == BCH_DATA_need_gc_gens)
- bch2_do_gc_gens(c);
-
- return 0;
+ return bch2_update_replicas_list(trans, &r.e, sectors);
}
int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
@@ -643,8 +523,8 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (bch2_fs_inconsistent_on(g->data_type &&
g->data_type != data_type, c,
"different types of data in same bucket: %s, %s",
- bch2_data_types[g->data_type],
- bch2_data_types[data_type])) {
+ bch2_data_type_str(g->data_type),
+ bch2_data_type_str(data_type))) {
ret = -EIO;
goto err;
}
@@ -652,37 +532,33 @@ int bch2_mark_metadata_bucket(struct bch_fs *c, struct bch_dev *ca,
if (bch2_fs_inconsistent_on((u64) g->dirty_sectors + sectors > ca->mi.bucket_size, c,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %u > bucket size",
ca->dev_idx, b, g->gen,
- bch2_data_types[g->data_type ?: data_type],
+ bch2_data_type_str(g->data_type ?: data_type),
g->dirty_sectors, sectors)) {
ret = -EIO;
goto err;
}
-
g->data_type = data_type;
g->dirty_sectors += sectors;
new = *g;
err:
bucket_unlock(g);
if (!ret)
- bch2_dev_usage_update_m(c, ca, old, new, 0, true);
+ bch2_dev_usage_update_m(c, ca, &old, &new);
percpu_up_read(&c->mark_lock);
return ret;
}
-static int check_bucket_ref(struct btree_trans *trans,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum bch_data_type ptr_data_type,
- u8 b_gen, u8 bucket_data_type,
- u32 dirty_sectors, u32 cached_sectors)
+int bch2_check_bucket_ref(struct btree_trans *trans,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 b_gen, u8 bucket_data_type,
+ u32 bucket_sectors)
{
struct bch_fs *c = trans->c;
struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
size_t bucket_nr = PTR_BUCKET_NR(ca, ptr);
- u32 bucket_sectors = !ptr->cached
- ? dirty_sectors
- : cached_sectors;
struct printbuf buf = PRINTBUF;
int ret = 0;
@@ -699,7 +575,7 @@ static int check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EIO;
@@ -712,7 +588,7 @@ static int check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -727,7 +603,7 @@ static int check_bucket_ref(struct btree_trans *trans,
"while marking %s",
ptr->dev, bucket_nr, b_gen,
*bucket_gen(ca, bucket_nr),
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
ptr->gen,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -748,8 +624,8 @@ static int check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type],
- bch2_data_types[ptr_data_type],
+ bch2_data_type_str(bucket_data_type),
+ bch2_data_type_str(ptr_data_type),
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
ret = -EIO;
@@ -762,7 +638,7 @@ static int check_bucket_ref(struct btree_trans *trans,
"bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n"
"while marking %s",
ptr->dev, bucket_nr, b_gen,
- bch2_data_types[bucket_data_type ?: ptr_data_type],
+ bch2_data_type_str(bucket_data_type ?: ptr_data_type),
bucket_sectors, sectors,
(printbuf_reset(&buf),
bch2_bkey_val_to_text(&buf, c, k), buf.buf));
@@ -777,508 +653,6 @@ err:
goto out;
}
-static int mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c k,
- unsigned ptr_idx,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- u64 journal_seq = trans->journal_res.seq;
- const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- unsigned nr_data = s->nr_blocks - s->nr_redundant;
- bool parity = ptr_idx >= nr_data;
- enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
- s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
- const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
- struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
- struct bucket old, new, *g;
- struct printbuf buf = PRINTBUF;
- int ret = 0;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- /* * XXX doesn't handle deletion */
-
- percpu_down_read(&c->mark_lock);
- g = PTR_GC_BUCKET(ca, ptr);
-
- if (g->dirty_sectors ||
- (g->stripe && g->stripe != k.k->p.offset)) {
- bch2_fs_inconsistent(c,
- "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
- ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
- ret = -EINVAL;
- goto err;
- }
-
- bucket_lock(g);
- old = *g;
-
- ret = check_bucket_ref(trans, k, ptr, sectors, data_type,
- g->gen, g->data_type,
- g->dirty_sectors, g->cached_sectors);
- if (ret)
- goto err;
-
- g->data_type = data_type;
- g->dirty_sectors += sectors;
-
- g->stripe = k.k->p.offset;
- g->stripe_redundancy = s->nr_redundant;
- new = *g;
-err:
- bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
- percpu_up_read(&c->mark_lock);
- printbuf_exit(&buf);
- return ret;
-}
-
-static int __mark_pointer(struct btree_trans *trans,
- struct bkey_s_c k,
- const struct bch_extent_ptr *ptr,
- s64 sectors, enum bch_data_type ptr_data_type,
- u8 bucket_gen, u8 *bucket_data_type,
- u32 *dirty_sectors, u32 *cached_sectors)
-{
- u32 *dst_sectors = !ptr->cached
- ? dirty_sectors
- : cached_sectors;
- int ret = check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
- bucket_gen, *bucket_data_type,
- *dirty_sectors, *cached_sectors);
-
- if (ret)
- return ret;
-
- *dst_sectors += sectors;
-
- if (!*dirty_sectors && !*cached_sectors)
- *bucket_data_type = 0;
- else if (*bucket_data_type != BCH_DATA_stripe)
- *bucket_data_type = ptr_data_type;
-
- return 0;
-}
-
-static int bch2_mark_pointer(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k,
- struct extent_ptr_decoded p,
- s64 sectors,
- unsigned flags)
-{
- u64 journal_seq = trans->journal_res.seq;
- struct bch_fs *c = trans->c;
- struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
- struct bucket old, new, *g;
- enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
- u8 bucket_data_type;
- int ret = 0;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- percpu_down_read(&c->mark_lock);
- g = PTR_GC_BUCKET(ca, &p.ptr);
- bucket_lock(g);
- old = *g;
-
- bucket_data_type = g->data_type;
- ret = __mark_pointer(trans, k, &p.ptr, sectors,
- data_type, g->gen,
- &bucket_data_type,
- &g->dirty_sectors,
- &g->cached_sectors);
- if (!ret)
- g->data_type = bucket_data_type;
-
- new = *g;
- bucket_unlock(g);
- if (!ret)
- bch2_dev_usage_update_m(c, ca, old, new, journal_seq, true);
- percpu_up_read(&c->mark_lock);
-
- return ret;
-}
-
-static int bch2_mark_stripe_ptr(struct btree_trans *trans,
- struct bkey_s_c k,
- struct bch_extent_stripe_ptr p,
- enum bch_data_type data_type,
- s64 sectors,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bch_replicas_padded r;
- struct gc_stripe *m;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- m = genradix_ptr_alloc(&c->gc_stripes, p.idx, GFP_KERNEL);
- if (!m) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu",
- (u64) p.idx);
- return -BCH_ERR_ENOMEM_mark_stripe_ptr;
- }
-
- mutex_lock(&c->ec_stripes_heap_lock);
-
- if (!m || !m->alive) {
- mutex_unlock(&c->ec_stripes_heap_lock);
- bch_err_ratelimited(c, "pointer to nonexistent stripe %llu",
- (u64) p.idx);
- bch2_inconsistent_error(c);
- return -EIO;
- }
-
- m->block_sectors[p.block] += sectors;
-
- r = m->r;
- mutex_unlock(&c->ec_stripes_heap_lock);
-
- r.e.data_type = data_type;
- update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
-
- return 0;
-}
-
-static int __mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
-{
- u64 journal_seq = trans->journal_res.seq;
- struct bch_fs *c = trans->c;
- struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- struct bch_replicas_padded r;
- enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
- ? BCH_DATA_btree
- : BCH_DATA_user;
- s64 sectors = bkey_is_btree_ptr(k.k)
- ? btree_sectors(c)
- : k.k->size;
- s64 dirty_sectors = 0;
- bool stale;
- int ret;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- r.e.data_type = data_type;
- r.e.nr_devs = 0;
- r.e.nr_required = 1;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors = ptr_disk_sectors(sectors, p);
-
- if (flags & BTREE_TRIGGER_OVERWRITE)
- disk_sectors = -disk_sectors;
-
- ret = bch2_mark_pointer(trans, btree_id, level, k, p, disk_sectors, flags);
- if (ret < 0)
- return ret;
-
- stale = ret > 0;
-
- if (p.ptr.cached) {
- if (!stale) {
- ret = update_cached_sectors(c, k, p.ptr.dev,
- disk_sectors, journal_seq, true);
- if (ret) {
- bch2_fs_fatal_error(c, "%s(): no replicas entry while updating cached sectors",
- __func__);
- return ret;
- }
- }
- } else if (!p.has_ec) {
- dirty_sectors += disk_sectors;
- r.e.devs[r.e.nr_devs++] = p.ptr.dev;
- } else {
- ret = bch2_mark_stripe_ptr(trans, k, p.ec, data_type,
- disk_sectors, flags);
- if (ret)
- return ret;
-
- /*
- * There may be other dirty pointers in this extent, but
- * if so they're not required for mounting if we have an
- * erasure coded pointer in this extent:
- */
- r.e.nr_required = 0;
- }
- }
-
- if (r.e.nr_devs) {
- ret = update_replicas(c, k, &r.e, dirty_sectors, journal_seq, true);
- if (ret) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, k);
- bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
- printbuf_exit(&buf);
- return ret;
- }
- }
-
- return 0;
-}
-
-int bch2_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- return mem_trigger_run_overwrite_then_insert(__mark_extent, trans, btree_id, level, old, new, flags);
-}
-
-int bch2_mark_stripe(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- bool gc = flags & BTREE_TRIGGER_GC;
- u64 journal_seq = trans->journal_res.seq;
- struct bch_fs *c = trans->c;
- u64 idx = new.k->p.offset;
- const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(old).v : NULL;
- const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
- ? bkey_s_c_to_stripe(new).v : NULL;
- unsigned i;
- int ret;
-
- BUG_ON(gc && old_s);
-
- if (!gc) {
- struct stripe *m = genradix_ptr(&c->stripes, idx);
-
- if (!m) {
- struct printbuf buf1 = PRINTBUF;
- struct printbuf buf2 = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf1, c, old);
- bch2_bkey_val_to_text(&buf2, c, new);
- bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
- "old %s\n"
- "new %s", idx, buf1.buf, buf2.buf);
- printbuf_exit(&buf2);
- printbuf_exit(&buf1);
- bch2_inconsistent_error(c);
- return -1;
- }
-
- if (!new_s) {
- bch2_stripes_heap_del(c, m, idx);
-
- memset(m, 0, sizeof(*m));
- } else {
- m->sectors = le16_to_cpu(new_s->sectors);
- m->algorithm = new_s->algorithm;
- m->nr_blocks = new_s->nr_blocks;
- m->nr_redundant = new_s->nr_redundant;
- m->blocks_nonempty = 0;
-
- for (i = 0; i < new_s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
-
- if (!old_s)
- bch2_stripes_heap_insert(c, m, idx);
- else
- bch2_stripes_heap_update(c, m, idx);
- }
- } else {
- struct gc_stripe *m =
- genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
-
- if (!m) {
- bch_err(c, "error allocating memory for gc_stripes, idx %llu",
- idx);
- return -BCH_ERR_ENOMEM_mark_stripe;
- }
- /*
- * This will be wrong when we bring back runtime gc: we should
- * be unmarking the old key and then marking the new key
- */
- m->alive = true;
- m->sectors = le16_to_cpu(new_s->sectors);
- m->nr_blocks = new_s->nr_blocks;
- m->nr_redundant = new_s->nr_redundant;
-
- for (i = 0; i < new_s->nr_blocks; i++)
- m->ptrs[i] = new_s->ptrs[i];
-
- bch2_bkey_to_replicas(&m->r.e, new);
-
- /*
- * gc recalculates this field from stripe ptr
- * references:
- */
- memset(m->block_sectors, 0, sizeof(m->block_sectors));
-
- for (i = 0; i < new_s->nr_blocks; i++) {
- ret = mark_stripe_bucket(trans, new, i, flags);
- if (ret)
- return ret;
- }
-
- ret = update_replicas(c, new, &m->r.e,
- ((s64) m->sectors * m->nr_redundant),
- journal_seq, gc);
- if (ret) {
- struct printbuf buf = PRINTBUF;
-
- bch2_bkey_val_to_text(&buf, c, new);
- bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
- printbuf_exit(&buf);
- return ret;
- }
- }
-
- return 0;
-}
-
-static int __mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bch_fs_usage *fs_usage;
- unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
- s64 sectors = (s64) k.k->size;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- if (flags & BTREE_TRIGGER_OVERWRITE)
- sectors = -sectors;
- sectors *= replicas;
-
- percpu_down_read(&c->mark_lock);
- preempt_disable();
-
- fs_usage = fs_usage_ptr(c, trans->journal_res.seq, flags & BTREE_TRIGGER_GC);
- replicas = clamp_t(unsigned, replicas, 1,
- ARRAY_SIZE(fs_usage->persistent_reserved));
-
- fs_usage->reserved += sectors;
- fs_usage->persistent_reserved[replicas - 1] += sectors;
-
- preempt_enable();
- percpu_up_read(&c->mark_lock);
-
- return 0;
-}
-
-int bch2_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- return mem_trigger_run_overwrite_then_insert(__mark_reservation, trans, btree_id, level, old, new, flags);
-}
-
-static s64 __bch2_mark_reflink_p(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 start, u64 end,
- u64 *idx, unsigned flags, size_t r_idx)
-{
- struct bch_fs *c = trans->c;
- struct reflink_gc *r;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- u64 next_idx = end;
- s64 ret = 0;
- struct printbuf buf = PRINTBUF;
-
- if (r_idx >= c->reflink_gc_nr)
- goto not_found;
-
- r = genradix_ptr(&c->reflink_gc_table, r_idx);
- next_idx = min(next_idx, r->offset - r->size);
- if (*idx < next_idx)
- goto not_found;
-
- BUG_ON((s64) r->refcount + add < 0);
-
- r->refcount += add;
- *idx = r->offset;
- return 0;
-not_found:
- if (fsck_err(c, reflink_p_to_missing_reflink_v,
- "pointer to missing indirect extent\n"
- " %s\n"
- " missing range %llu-%llu",
- (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
- *idx, next_idx)) {
- struct bkey_i_error *new;
-
- new = bch2_trans_kmalloc(trans, sizeof(*new));
- ret = PTR_ERR_OR_ZERO(new);
- if (ret)
- goto err;
-
- bkey_init(&new->k);
- new->k.type = KEY_TYPE_error;
- new->k.p = bkey_start_pos(p.k);
- new->k.p.offset += *idx - start;
- bch2_key_resize(&new->k, next_idx - *idx);
- ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, &new->k_i,
- BTREE_TRIGGER_NORUN);
- }
-
- *idx = next_idx;
-err:
-fsck_err:
- printbuf_exit(&buf);
- return ret;
-}
-
-static int __mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- struct reflink_gc *ref;
- size_t l, r, m;
- u64 idx = le64_to_cpu(p.v->idx), start = idx;
- u64 end = le64_to_cpu(p.v->idx) + p.k->size;
- int ret = 0;
-
- BUG_ON(!(flags & BTREE_TRIGGER_GC));
-
- if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_reflink_p_fix) {
- idx -= le32_to_cpu(p.v->front_pad);
- end += le32_to_cpu(p.v->back_pad);
- }
-
- l = 0;
- r = c->reflink_gc_nr;
- while (l < r) {
- m = l + (r - l) / 2;
-
- ref = genradix_ptr(&c->reflink_gc_table, m);
- if (ref->offset <= idx)
- l = m + 1;
- else
- r = m;
- }
-
- while (idx < end && !ret)
- ret = __bch2_mark_reflink_p(trans, p, start, end,
- &idx, flags, l++);
-
- return ret;
-}
-
-int bch2_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- return mem_trigger_run_overwrite_then_insert(__mark_reflink_p, trans, btree_id, level, old, new, flags);
-}
-
void bch2_trans_fs_usage_revert(struct btree_trans *trans,
struct replicas_delta_list *deltas)
{
@@ -1303,11 +677,11 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
BUG_ON(__update_replicas(c, dst, &d->r, -d->delta));
}
- dst->nr_inodes -= deltas->nr_inodes;
+ dst->b.nr_inodes -= deltas->nr_inodes;
for (i = 0; i < BCH_REPLICAS_MAX; i++) {
added -= deltas->persistent_reserved[i];
- dst->reserved -= deltas->persistent_reserved[i];
+ dst->b.reserved -= deltas->persistent_reserved[i];
dst->persistent_reserved[i] -= deltas->persistent_reserved[i];
}
@@ -1320,48 +694,25 @@ void bch2_trans_fs_usage_revert(struct btree_trans *trans,
percpu_up_read(&c->mark_lock);
}
-int bch2_trans_fs_usage_apply(struct btree_trans *trans,
- struct replicas_delta_list *deltas)
+void bch2_trans_account_disk_usage_change(struct btree_trans *trans)
{
struct bch_fs *c = trans->c;
+ u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
static int warned_disk_usage = 0;
bool warn = false;
- u64 disk_res_sectors = trans->disk_res ? trans->disk_res->sectors : 0;
- struct replicas_delta *d, *d2;
- struct replicas_delta *top = (void *) deltas->d + deltas->used;
- struct bch_fs_usage *dst;
- s64 added = 0, should_not_have_added;
- unsigned i;
percpu_down_read(&c->mark_lock);
preempt_disable();
- dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+ struct bch_fs_usage_base *dst = &fs_usage_ptr(c, trans->journal_res.seq, false)->b;
+ struct bch_fs_usage_base *src = &trans->fs_usage_delta;
- for (d = deltas->d; d != top; d = replicas_delta_next(d)) {
- switch (d->r.data_type) {
- case BCH_DATA_btree:
- case BCH_DATA_user:
- case BCH_DATA_parity:
- added += d->delta;
- }
-
- if (__update_replicas(c, dst, &d->r, d->delta))
- goto need_mark;
- }
-
- dst->nr_inodes += deltas->nr_inodes;
-
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
- added += deltas->persistent_reserved[i];
- dst->reserved += deltas->persistent_reserved[i];
- dst->persistent_reserved[i] += deltas->persistent_reserved[i];
- }
+ s64 added = src->btree + src->data + src->reserved;
/*
* Not allowed to reduce sectors_available except by getting a
* reservation:
*/
- should_not_have_added = added - (s64) disk_res_sectors;
+ s64 should_not_have_added = added - (s64) disk_res_sectors;
if (unlikely(should_not_have_added > 0)) {
u64 old, new, v = atomic64_read(&c->sectors_available);
@@ -1380,6 +731,13 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
this_cpu_sub(*c->online_reserved, added);
}
+ dst->hidden += src->hidden;
+ dst->btree += src->btree;
+ dst->data += src->data;
+ dst->cached += src->cached;
+ dst->reserved += src->reserved;
+ dst->nr_inodes += src->nr_inodes;
+
preempt_enable();
percpu_up_read(&c->mark_lock);
@@ -1387,6 +745,34 @@ int bch2_trans_fs_usage_apply(struct btree_trans *trans,
bch2_trans_inconsistent(trans,
"disk usage increased %lli more than %llu sectors reserved)",
should_not_have_added, disk_res_sectors);
+}
+
+int bch2_trans_fs_usage_apply(struct btree_trans *trans,
+ struct replicas_delta_list *deltas)
+{
+ struct bch_fs *c = trans->c;
+ struct replicas_delta *d, *d2;
+ struct replicas_delta *top = (void *) deltas->d + deltas->used;
+ struct bch_fs_usage *dst;
+ unsigned i;
+
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
+ dst = fs_usage_ptr(c, trans->journal_res.seq, false);
+
+ for (d = deltas->d; d != top; d = replicas_delta_next(d))
+ if (__update_replicas(c, dst, &d->r, d->delta))
+ goto need_mark;
+
+ dst->b.nr_inodes += deltas->nr_inodes;
+
+ for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ dst->b.reserved += deltas->persistent_reserved[i];
+ dst->persistent_reserved[i] += deltas->persistent_reserved[i];
+ }
+
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
return 0;
need_mark:
/* revert changes: */
@@ -1398,92 +784,184 @@ need_mark:
return -1;
}
-/* trans_mark: */
+/* KEY_TYPE_extent: */
+
+static int __mark_pointer(struct btree_trans *trans,
+ struct bkey_s_c k,
+ const struct bch_extent_ptr *ptr,
+ s64 sectors, enum bch_data_type ptr_data_type,
+ u8 bucket_gen, u8 *bucket_data_type,
+ u32 *dirty_sectors, u32 *cached_sectors)
+{
+ u32 *dst_sectors = !ptr->cached
+ ? dirty_sectors
+ : cached_sectors;
+ int ret = bch2_check_bucket_ref(trans, k, ptr, sectors, ptr_data_type,
+ bucket_gen, *bucket_data_type, *dst_sectors);
+
+ if (ret)
+ return ret;
+
+ *dst_sectors += sectors;
+
+ if (!*dirty_sectors && !*cached_sectors)
+ *bucket_data_type = 0;
+ else if (*bucket_data_type != BCH_DATA_stripe)
+ *bucket_data_type = ptr_data_type;
+
+ return 0;
+}
-static inline int bch2_trans_mark_pointer(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, struct extent_ptr_decoded p,
- unsigned flags)
+static int bch2_trigger_pointer(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, struct extent_ptr_decoded p,
+ s64 *sectors,
+ unsigned flags)
{
bool insert = !(flags & BTREE_TRIGGER_OVERWRITE);
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
struct bpos bucket;
struct bch_backpointer bp;
- s64 sectors;
- int ret;
bch2_extent_ptr_to_bp(trans->c, btree_id, level, k, p, &bucket, &bp);
- sectors = bp.bucket_len;
- if (!insert)
- sectors = -sectors;
+ *sectors = insert ? bp.bucket_len : -((s64) bp.bucket_len);
- a = bch2_trans_start_alloc_update(trans, &iter, bucket);
- if (IS_ERR(a))
- return PTR_ERR(a);
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, &iter, bucket);
+ int ret = PTR_ERR_OR_ZERO(a);
+ if (ret)
+ return ret;
- ret = __mark_pointer(trans, k, &p.ptr, sectors, bp.data_type,
- a->v.gen, &a->v.data_type,
- &a->v.dirty_sectors, &a->v.cached_sectors) ?:
- bch2_trans_update(trans, &iter, &a->k_i, 0);
- bch2_trans_iter_exit(trans, &iter);
+ ret = __mark_pointer(trans, k, &p.ptr, *sectors, bp.data_type,
+ a->v.gen, &a->v.data_type,
+ &a->v.dirty_sectors, &a->v.cached_sectors) ?:
+ bch2_trans_update(trans, &iter, &a->k_i, 0);
+ bch2_trans_iter_exit(trans, &iter);
- if (ret)
- return ret;
-
- if (!p.ptr.cached) {
- ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
if (ret)
return ret;
+
+ if (!p.ptr.cached) {
+ ret = bch2_bucket_backpointer_mod(trans, bucket, bp, k, insert);
+ if (ret)
+ return ret;
+ }
+ }
+
+ if (flags & BTREE_TRIGGER_GC) {
+ struct bch_fs *c = trans->c;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, p.ptr.dev);
+ enum bch_data_type data_type = bkey_ptr_data_type(btree_id, level, k, p);
+
+ percpu_down_read(&c->mark_lock);
+ struct bucket *g = PTR_GC_BUCKET(ca, &p.ptr);
+ bucket_lock(g);
+ struct bucket old = *g;
+
+ u8 bucket_data_type = g->data_type;
+ int ret = __mark_pointer(trans, k, &p.ptr, *sectors,
+ data_type, g->gen,
+ &bucket_data_type,
+ &g->dirty_sectors,
+ &g->cached_sectors);
+ if (ret) {
+ bucket_unlock(g);
+ percpu_up_read(&c->mark_lock);
+ return ret;
+ }
+
+ g->data_type = bucket_data_type;
+ struct bucket new = *g;
+ bucket_unlock(g);
+ bch2_dev_usage_update_m(c, ca, &old, &new);
+ percpu_up_read(&c->mark_lock);
}
return 0;
}
-static int bch2_trans_mark_stripe_ptr(struct btree_trans *trans,
- struct extent_ptr_decoded p,
- s64 sectors, enum bch_data_type data_type)
+static int bch2_trigger_stripe_ptr(struct btree_trans *trans,
+ struct bkey_s_c k,
+ struct extent_ptr_decoded p,
+ enum bch_data_type data_type,
+ s64 sectors, unsigned flags)
{
- struct btree_iter iter;
- struct bkey_i_stripe *s;
- struct bch_replicas_padded r;
- int ret = 0;
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct btree_iter iter;
+ struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter,
+ BTREE_ID_stripes, POS(0, p.ec.idx),
+ BTREE_ITER_WITH_UPDATES, stripe);
+ int ret = PTR_ERR_OR_ZERO(s);
+ if (unlikely(ret)) {
+ bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
+ "pointer to nonexistent stripe %llu",
+ (u64) p.ec.idx);
+ goto err;
+ }
- s = bch2_bkey_get_mut_typed(trans, &iter,
- BTREE_ID_stripes, POS(0, p.ec.idx),
- BTREE_ITER_WITH_UPDATES, stripe);
- ret = PTR_ERR_OR_ZERO(s);
- if (unlikely(ret)) {
- bch2_trans_inconsistent_on(bch2_err_matches(ret, ENOENT), trans,
- "pointer to nonexistent stripe %llu",
- (u64) p.ec.idx);
- goto err;
- }
+ if (!bch2_ptr_matches_stripe(&s->v, p)) {
+ bch2_trans_inconsistent(trans,
+ "stripe pointer doesn't match stripe %llu",
+ (u64) p.ec.idx);
+ ret = -EIO;
+ goto err;
+ }
- if (!bch2_ptr_matches_stripe(&s->v, p)) {
- bch2_trans_inconsistent(trans,
- "stripe pointer doesn't match stripe %llu",
- (u64) p.ec.idx);
- ret = -EIO;
- goto err;
+ stripe_blockcount_set(&s->v, p.ec.block,
+ stripe_blockcount_get(&s->v, p.ec.block) +
+ sectors);
+
+ struct bch_replicas_padded r;
+ bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
+ r.e.data_type = data_type;
+ ret = bch2_update_replicas_list(trans, &r.e, sectors);
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
}
- stripe_blockcount_set(&s->v, p.ec.block,
- stripe_blockcount_get(&s->v, p.ec.block) +
- sectors);
+ if (flags & BTREE_TRIGGER_GC) {
+ struct bch_fs *c = trans->c;
- bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(&s->k_i));
- r.e.data_type = data_type;
- ret = update_replicas_list(trans, &r.e, sectors);
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL);
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ (u64) p.ec.idx);
+ return -BCH_ERR_ENOMEM_mark_stripe_ptr;
+ }
+
+ mutex_lock(&c->ec_stripes_heap_lock);
+
+ if (!m || !m->alive) {
+ mutex_unlock(&c->ec_stripes_heap_lock);
+ struct printbuf buf = PRINTBUF;
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s",
+ (u64) p.ec.idx, buf.buf);
+ printbuf_exit(&buf);
+ bch2_inconsistent_error(c);
+ return -EIO;
+ }
+
+ m->block_sectors[p.ec.block] += sectors;
+
+ struct bch_replicas_padded r = m->r;
+ mutex_unlock(&c->ec_stripes_heap_lock);
+
+ r.e.data_type = data_type;
+ bch2_update_replicas(c, k, &r.e, sectors, trans->journal_res.seq, true);
+ }
+
+ return 0;
}
-static int __trans_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+static int __trigger_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
+ bool gc = flags & BTREE_TRIGGER_GC;
struct bch_fs *c = trans->c;
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -1492,11 +970,7 @@ static int __trans_mark_extent(struct btree_trans *trans,
enum bch_data_type data_type = bkey_is_btree_ptr(k.k)
? BCH_DATA_btree
: BCH_DATA_user;
- s64 sectors = bkey_is_btree_ptr(k.k)
- ? btree_sectors(c)
- : k.k->size;
s64 dirty_sectors = 0;
- bool stale;
int ret = 0;
r.e.data_type = data_type;
@@ -1504,21 +978,20 @@ static int __trans_mark_extent(struct btree_trans *trans,
r.e.nr_required = 1;
bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- s64 disk_sectors = ptr_disk_sectors(sectors, p);
-
- if (flags & BTREE_TRIGGER_OVERWRITE)
- disk_sectors = -disk_sectors;
-
- ret = bch2_trans_mark_pointer(trans, btree_id, level, k, p, flags);
+ s64 disk_sectors;
+ ret = bch2_trigger_pointer(trans, btree_id, level, k, p, &disk_sectors, flags);
if (ret < 0)
return ret;
- stale = ret > 0;
+ bool stale = ret > 0;
if (p.ptr.cached) {
if (!stale) {
- ret = update_cached_sectors_list(trans, p.ptr.dev,
- disk_sectors);
+ ret = !gc
+ ? bch2_update_cached_sectors_list(trans, p.ptr.dev, disk_sectors)
+ : update_cached_sectors(c, k, p.ptr.dev, disk_sectors, 0, true);
+ bch2_fs_fatal_err_on(ret && gc, c, "%s(): no replicas entry while updating cached sectors",
+ __func__);
if (ret)
return ret;
}
@@ -1526,324 +999,122 @@ static int __trans_mark_extent(struct btree_trans *trans,
dirty_sectors += disk_sectors;
r.e.devs[r.e.nr_devs++] = p.ptr.dev;
} else {
- ret = bch2_trans_mark_stripe_ptr(trans, p,
- disk_sectors, data_type);
+ ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags);
if (ret)
return ret;
+ /*
+ * There may be other dirty pointers in this extent, but
+ * if so they're not required for mounting if we have an
+ * erasure coded pointer in this extent:
+ */
r.e.nr_required = 0;
}
}
- if (r.e.nr_devs)
- ret = update_replicas_list(trans, &r.e, dirty_sectors);
-
- return ret;
-}
-
-int bch2_trans_mark_extent(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- int mod = (int) bch2_bkey_needs_rebalance(c, bkey_i_to_s_c(new)) -
- (int) bch2_bkey_needs_rebalance(c, old);
+ if (r.e.nr_devs) {
+ ret = !gc
+ ? bch2_update_replicas_list(trans, &r.e, dirty_sectors)
+ : bch2_update_replicas(c, k, &r.e, dirty_sectors, 0, true);
+ if (unlikely(ret && gc)) {
+ struct printbuf buf = PRINTBUF;
- if (mod) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new->k.p, mod > 0);
+ bch2_bkey_val_to_text(&buf, c, k);
+ bch2_fs_fatal_error(c, "%s(): no replicas entry for %s", __func__, buf.buf);
+ printbuf_exit(&buf);
+ }
if (ret)
return ret;
}
- return trigger_run_overwrite_then_insert(__trans_mark_extent, trans, btree_id, level, old, new, flags);
-}
-
-static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
- struct bkey_s_c_stripe s,
- unsigned idx, bool deleting)
-{
- struct bch_fs *c = trans->c;
- const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
- struct btree_iter iter;
- struct bkey_i_alloc_v4 *a;
- enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
- ? BCH_DATA_parity : 0;
- s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
- int ret = 0;
-
- if (deleting)
- sectors = -sectors;
-
- a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
- if (IS_ERR(a))
- return PTR_ERR(a);
-
- ret = check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
- a->v.gen, a->v.data_type,
- a->v.dirty_sectors, a->v.cached_sectors);
- if (ret)
- goto err;
-
- if (!deleting) {
- if (bch2_trans_inconsistent_on(a->v.stripe ||
- a->v.stripe_redundancy, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
- a->v.dirty_sectors,
- a->v.stripe, s.k->p.offset)) {
- ret = -EIO;
- goto err;
- }
-
- if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
- "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
- a->v.dirty_sectors,
- s.k->p.offset)) {
- ret = -EIO;
- goto err;
- }
-
- a->v.stripe = s.k->p.offset;
- a->v.stripe_redundancy = s.v->nr_redundant;
- a->v.data_type = BCH_DATA_stripe;
- } else {
- if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
- a->v.stripe_redundancy != s.v->nr_redundant, trans,
- "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
- iter.pos.inode, iter.pos.offset, a->v.gen,
- s.k->p.offset, a->v.stripe)) {
- ret = -EIO;
- goto err;
- }
-
- a->v.stripe = 0;
- a->v.stripe_redundancy = 0;
- a->v.data_type = alloc_data_type(a->v, BCH_DATA_user);
- }
-
- a->v.dirty_sectors += sectors;
- if (data_type)
- a->v.data_type = !deleting ? data_type : 0;
-
- ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
- if (ret)
- goto err;
-err:
- bch2_trans_iter_exit(trans, &iter);
- return ret;
+ return 0;
}
-int bch2_trans_mark_stripe(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+int bch2_trigger_extent(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
- const struct bch_stripe *old_s = NULL;
- struct bch_stripe *new_s = NULL;
- struct bch_replicas_padded r;
- unsigned i, nr_blocks;
- int ret = 0;
-
- if (old.k->type == KEY_TYPE_stripe)
- old_s = bkey_s_c_to_stripe(old).v;
- if (new->k.type == KEY_TYPE_stripe)
- new_s = &bkey_i_to_stripe(new)->v;
-
- /*
- * If the pointers aren't changing, we don't need to do anything:
- */
- if (new_s && old_s &&
- new_s->nr_blocks == old_s->nr_blocks &&
- new_s->nr_redundant == old_s->nr_redundant &&
- !memcmp(old_s->ptrs, new_s->ptrs,
- new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+ struct bkey_ptrs_c new_ptrs = bch2_bkey_ptrs_c(new.s_c);
+ struct bkey_ptrs_c old_ptrs = bch2_bkey_ptrs_c(old);
+ unsigned new_ptrs_bytes = (void *) new_ptrs.end - (void *) new_ptrs.start;
+ unsigned old_ptrs_bytes = (void *) old_ptrs.end - (void *) old_ptrs.start;
+
+ /* if pointers aren't changing - nothing to do: */
+ if (new_ptrs_bytes == old_ptrs_bytes &&
+ !memcmp(new_ptrs.start,
+ old_ptrs.start,
+ new_ptrs_bytes))
return 0;
- BUG_ON(new_s && old_s &&
- (new_s->nr_blocks != old_s->nr_blocks ||
- new_s->nr_redundant != old_s->nr_redundant));
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ struct bch_fs *c = trans->c;
+ int mod = (int) bch2_bkey_needs_rebalance(c, new.s_c) -
+ (int) bch2_bkey_needs_rebalance(c, old);
- nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
-
- if (new_s) {
- s64 sectors = le16_to_cpu(new_s->sectors);
-
- bch2_bkey_to_replicas(&r.e, bkey_i_to_s_c(new));
- ret = update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
- if (ret)
- return ret;
- }
-
- if (old_s) {
- s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
-
- bch2_bkey_to_replicas(&r.e, old);
- ret = update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
- if (ret)
- return ret;
- }
-
- for (i = 0; i < nr_blocks; i++) {
- if (new_s && old_s &&
- !memcmp(&new_s->ptrs[i],
- &old_s->ptrs[i],
- sizeof(new_s->ptrs[i])))
- continue;
-
- if (new_s) {
- ret = bch2_trans_mark_stripe_bucket(trans,
- bkey_i_to_s_c_stripe(new), i, false);
- if (ret)
- break;
- }
-
- if (old_s) {
- ret = bch2_trans_mark_stripe_bucket(trans,
- bkey_s_c_to_stripe(old), i, true);
+ if (mod) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_rebalance_work, new.k->p, mod > 0);
if (ret)
- break;
+ return ret;
}
}
- return ret;
+ if (flags & (BTREE_TRIGGER_TRANSACTIONAL|BTREE_TRIGGER_GC))
+ return trigger_run_overwrite_then_insert(__trigger_extent, trans, btree_id, level, old, new, flags);
+
+ return 0;
}
-static int __trans_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+/* KEY_TYPE_reservation */
+
+static int __trigger_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
{
+ struct bch_fs *c = trans->c;
unsigned replicas = bkey_s_c_to_reservation(k).v->nr_replicas;
- s64 sectors = (s64) k.k->size;
- struct replicas_delta_list *d;
- int ret;
+ s64 sectors = (s64) k.k->size * replicas;
if (flags & BTREE_TRIGGER_OVERWRITE)
sectors = -sectors;
- sectors *= replicas;
-
- ret = bch2_replicas_deltas_realloc(trans, 0);
- if (ret)
- return ret;
- d = trans->fs_usage_deltas;
- replicas = clamp_t(unsigned, replicas, 1,
- ARRAY_SIZE(d->persistent_reserved));
-
- d->persistent_reserved[replicas - 1] += sectors;
- return 0;
-}
-
-int bch2_trans_mark_reservation(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
-{
- return trigger_run_overwrite_then_insert(__trans_mark_reservation, trans, btree_id, level, old, new, flags);
-}
-
-static int trans_mark_reflink_p_segment(struct btree_trans *trans,
- struct bkey_s_c_reflink_p p,
- u64 *idx, unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_i *k;
- __le64 *refcount;
- int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
- struct printbuf buf = PRINTBUF;
- int ret;
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ int ret = bch2_replicas_deltas_realloc(trans, 0);
+ if (ret)
+ return ret;
- k = bch2_bkey_get_mut_noupdate(trans, &iter,
- BTREE_ID_reflink, POS(0, *idx),
- BTREE_ITER_WITH_UPDATES);
- ret = PTR_ERR_OR_ZERO(k);
- if (ret)
- goto err;
+ struct replicas_delta_list *d = trans->fs_usage_deltas;
+ replicas = min(replicas, ARRAY_SIZE(d->persistent_reserved));
- refcount = bkey_refcount(k);
- if (!refcount) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- bch2_trans_inconsistent(trans,
- "nonexistent indirect extent at %llu while marking\n %s",
- *idx, buf.buf);
- ret = -EIO;
- goto err;
+ d->persistent_reserved[replicas - 1] += sectors;
}
- if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
- bch2_bkey_val_to_text(&buf, c, p.s_c);
- bch2_trans_inconsistent(trans,
- "indirect extent refcount underflow at %llu while marking\n %s",
- *idx, buf.buf);
- ret = -EIO;
- goto err;
- }
+ if (flags & BTREE_TRIGGER_GC) {
+ percpu_down_read(&c->mark_lock);
+ preempt_disable();
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
- u64 pad;
+ struct bch_fs_usage *fs_usage = this_cpu_ptr(c->usage_gc);
- pad = max_t(s64, le32_to_cpu(v->front_pad),
- le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
- BUG_ON(pad > U32_MAX);
- v->front_pad = cpu_to_le32(pad);
+ replicas = min(replicas, ARRAY_SIZE(fs_usage->persistent_reserved));
+ fs_usage->b.reserved += sectors;
+ fs_usage->persistent_reserved[replicas - 1] += sectors;
- pad = max_t(s64, le32_to_cpu(v->back_pad),
- k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
- BUG_ON(pad > U32_MAX);
- v->back_pad = cpu_to_le32(pad);
+ preempt_enable();
+ percpu_up_read(&c->mark_lock);
}
- le64_add_cpu(refcount, add);
-
- bch2_btree_iter_set_pos_to_extent_start(&iter);
- ret = bch2_trans_update(trans, &iter, k, 0);
- if (ret)
- goto err;
-
- *idx = k->k.p.offset;
-err:
- bch2_trans_iter_exit(trans, &iter);
- printbuf_exit(&buf);
- return ret;
+ return 0;
}
-static int __trans_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c k, unsigned flags)
+int bch2_trigger_reservation(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
- struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
- u64 idx, end_idx;
- int ret = 0;
-
- idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
- end_idx = le64_to_cpu(p.v->idx) + p.k->size +
- le32_to_cpu(p.v->back_pad);
-
- while (idx < end_idx && !ret)
- ret = trans_mark_reflink_p_segment(trans, p, &idx, flags);
- return ret;
+ return trigger_run_overwrite_then_insert(__trigger_reservation, trans, btree_id, level, old, new, flags);
}
-int bch2_trans_mark_reflink_p(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
-{
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_reflink_p *v = &bkey_i_to_reflink_p(new)->v;
-
- v->front_pad = v->back_pad = 0;
- }
-
- return trigger_run_overwrite_then_insert(__trans_mark_reflink_p, trans, btree_id, level, old, new, flags);
-}
+/* Mark superblocks: */
static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
struct bch_dev *ca, size_t b,
@@ -1871,9 +1142,9 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans,
"bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n"
"while marking %s",
iter.pos.inode, iter.pos.offset, a->v.gen,
- bch2_data_types[a->v.data_type],
- bch2_data_types[type],
- bch2_data_types[type]);
+ bch2_data_type_str(a->v.data_type),
+ bch2_data_type_str(type),
+ bch2_data_type_str(type));
ret = -EIO;
goto err;
}
@@ -1974,17 +1245,13 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca)
{
int ret = bch2_trans_run(c, __bch2_trans_mark_dev_sb(trans, ca));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
int bch2_trans_mark_dev_sbs(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
int ret = bch2_trans_mark_dev_sb(c, ca);
if (ret) {
percpu_ref_put(&ca->ref);
diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h
index 21f6cb356921..6387e039f789 100644
--- a/fs/bcachefs/buckets.h
+++ b/fs/bcachefs/buckets.h
@@ -203,6 +203,7 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca)
}
void bch2_dev_usage_init(struct bch_dev *);
+void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev_usage *);
static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark)
{
@@ -301,6 +302,12 @@ u64 bch2_fs_sectors_used(struct bch_fs *, struct bch_fs_usage_online *);
struct bch_fs_usage_short
bch2_fs_usage_read_short(struct bch_fs *);
+void bch2_dev_usage_update(struct bch_fs *, struct bch_dev *,
+ const struct bch_alloc_v4 *,
+ const struct bch_alloc_v4 *, u64, bool);
+void bch2_dev_usage_update_m(struct bch_fs *, struct bch_dev *,
+ struct bucket *, struct bucket *);
+
/* key/bucket marking: */
static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
@@ -315,43 +322,41 @@ static inline struct bch_fs_usage *fs_usage_ptr(struct bch_fs *c,
: c->usage[journal_seq & JOURNAL_BUF_MASK]);
}
+int bch2_update_replicas(struct bch_fs *, struct bkey_s_c,
+ struct bch_replicas_entry_v1 *, s64,
+ unsigned, bool);
+int bch2_update_replicas_list(struct btree_trans *,
+ struct bch_replicas_entry_v1 *, s64);
+int bch2_update_cached_sectors_list(struct btree_trans *, unsigned, s64);
int bch2_replicas_deltas_realloc(struct btree_trans *, unsigned);
void bch2_fs_usage_initialize(struct bch_fs *);
+int bch2_check_bucket_ref(struct btree_trans *, struct bkey_s_c,
+ const struct bch_extent_ptr *,
+ s64, enum bch_data_type, u8, u8, u32);
+
int bch2_mark_metadata_bucket(struct bch_fs *, struct bch_dev *,
size_t, enum bch_data_type, unsigned,
struct gc_pos, unsigned);
-int bch2_mark_alloc(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_extent(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_stripe(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_reservation(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-int bch2_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
-
-int bch2_trans_mark_extent(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_stripe(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_reservation(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_trans_mark_reflink_p(struct btree_trans *, enum btree_id, unsigned, struct bkey_s_c, struct bkey_i *, unsigned);
-
-#define mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
+int bch2_trigger_extent(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
+int bch2_trigger_reservation(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
+
+#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags)\
({ \
int ret = 0; \
\
if (_old.k->type) \
ret = _fn(_trans, _btree_id, _level, _old, _flags & ~BTREE_TRIGGER_INSERT); \
if (!ret && _new.k->type) \
- ret = _fn(_trans, _btree_id, _level, _new, _flags & ~BTREE_TRIGGER_OVERWRITE); \
+ ret = _fn(_trans, _btree_id, _level, _new.s_c, _flags & ~BTREE_TRIGGER_OVERWRITE);\
ret; \
})
-#define trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, _new, _flags) \
- mem_trigger_run_overwrite_then_insert(_fn, _trans, _btree_id, _level, _old, bkey_i_to_s_c(_new), _flags)
+void bch2_trans_account_disk_usage_change(struct btree_trans *);
void bch2_trans_fs_usage_revert(struct btree_trans *, struct replicas_delta_list *);
int bch2_trans_fs_usage_apply(struct btree_trans *, struct replicas_delta_list *);
@@ -382,6 +387,21 @@ static inline bool is_superblock_bucket(struct bch_dev *ca, u64 b)
return false;
}
+static inline const char *bch2_data_type_str(enum bch_data_type type)
+{
+ return type < BCH_DATA_NR
+ ? __bch2_data_types[type]
+ : "(invalid data type)";
+}
+
+static inline void bch2_prt_data_type(struct printbuf *out, enum bch_data_type type)
+{
+ if (type < BCH_DATA_NR)
+ prt_str(out, __bch2_data_types[type]);
+ else
+ prt_printf(out, "(invalid data type %u)", type);
+}
+
/* disk reservations: */
static inline void bch2_disk_reservation_put(struct bch_fs *c,
diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h
index 2a9dab9006ef..6a31740222a7 100644
--- a/fs/bcachefs/buckets_types.h
+++ b/fs/bcachefs/buckets_types.h
@@ -33,8 +33,6 @@ struct bucket_gens {
};
struct bch_dev_usage {
- u64 buckets_ec;
-
struct {
u64 buckets;
u64 sectors; /* _compressed_ sectors: */
@@ -47,23 +45,18 @@ struct bch_dev_usage {
} d[BCH_DATA_NR];
};
-struct bch_fs_usage {
- /* all fields are in units of 512 byte sectors: */
+struct bch_fs_usage_base {
u64 hidden;
u64 btree;
u64 data;
u64 cached;
u64 reserved;
u64 nr_inodes;
+};
- /* XXX: add stats for compression ratio */
-#if 0
- u64 uncompressed;
- u64 compressed;
-#endif
-
- /* broken out: */
-
+struct bch_fs_usage {
+ /* all fields are in units of 512 byte sectors: */
+ struct bch_fs_usage_base b;
u64 persistent_reserved[BCH_REPLICAS_MAX];
u64 replicas[];
};
diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c
index 4bb88aefed12..226b39c17667 100644
--- a/fs/bcachefs/chardev.c
+++ b/fs/bcachefs/chardev.c
@@ -7,22 +7,27 @@
#include "chardev.h"
#include "journal.h"
#include "move.h"
+#include "recovery.h"
#include "replicas.h"
#include "super.h"
#include "super-io.h"
+#include "thread_with_file.h"
-#include <linux/anon_inodes.h>
#include <linux/cdev.h>
#include <linux/device.h>
-#include <linux/file.h>
#include <linux/fs.h>
#include <linux/ioctl.h>
-#include <linux/kthread.h>
#include <linux/major.h>
#include <linux/sched/task.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+__must_check
+static int copy_to_user_errcode(void __user *to, const void *from, unsigned long n)
+{
+ return copy_to_user(to, from, n) ? -EFAULT : 0;
+}
+
/* returns with ref on ca->ref */
static struct bch_dev *bch2_device_lookup(struct bch_fs *c, u64 dev,
unsigned flags)
@@ -132,8 +137,106 @@ static long bch2_ioctl_incremental(struct bch_ioctl_incremental __user *user_arg
}
#endif
+struct fsck_thread {
+ struct thread_with_stdio thr;
+ struct bch_fs *c;
+ char **devs;
+ size_t nr_devs;
+ struct bch_opts opts;
+};
+
+static void bch2_fsck_thread_exit(struct thread_with_stdio *_thr)
+{
+ struct fsck_thread *thr = container_of(_thr, struct fsck_thread, thr);
+ if (thr->devs)
+ for (size_t i = 0; i < thr->nr_devs; i++)
+ kfree(thr->devs[i]);
+ kfree(thr->devs);
+ kfree(thr);
+}
+
+static int bch2_fsck_offline_thread_fn(void *arg)
+{
+ struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+ struct bch_fs *c = bch2_fs_open(thr->devs, thr->nr_devs, thr->opts);
+
+ thr->thr.thr.ret = PTR_ERR_OR_ZERO(c);
+ if (!thr->thr.thr.ret)
+ bch2_fs_stop(c);
+
+ thread_with_stdio_done(&thr->thr);
+ return 0;
+}
+
+static long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg)
+{
+ struct bch_ioctl_fsck_offline arg;
+ struct fsck_thread *thr = NULL;
+ u64 *devs = NULL;
+ long ret = 0;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
+ return -EFAULT;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!(devs = kcalloc(arg.nr_devs, sizeof(*devs), GFP_KERNEL)) ||
+ !(thr = kzalloc(sizeof(*thr), GFP_KERNEL)) ||
+ !(thr->devs = kcalloc(arg.nr_devs, sizeof(*thr->devs), GFP_KERNEL))) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ thr->opts = bch2_opts_empty();
+ thr->nr_devs = arg.nr_devs;
+
+ if (copy_from_user(devs, &user_arg->devs[0],
+ array_size(sizeof(user_arg->devs[0]), arg.nr_devs))) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ for (size_t i = 0; i < arg.nr_devs; i++) {
+ thr->devs[i] = strndup_user((char __user *)(unsigned long) devs[i], PATH_MAX);
+ ret = PTR_ERR_OR_ZERO(thr->devs[i]);
+ if (ret)
+ goto err;
+ }
+
+ if (arg.opts) {
+ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
+
+ ret = PTR_ERR_OR_ZERO(optstr) ?:
+ bch2_parse_mount_opts(NULL, &thr->opts, optstr);
+ kfree(optstr);
+
+ if (ret)
+ goto err;
+ }
+
+ opt_set(thr->opts, stdio, (u64)(unsigned long)&thr->thr.stdio);
+
+ ret = bch2_run_thread_with_stdio(&thr->thr,
+ bch2_fsck_thread_exit,
+ bch2_fsck_offline_thread_fn);
+err:
+ if (ret < 0) {
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ pr_err("ret %s", bch2_err_str(ret));
+ }
+ kfree(devs);
+ return ret;
+}
+
static long bch2_global_ioctl(unsigned cmd, void __user *arg)
{
+ long ret;
+
switch (cmd) {
#if 0
case BCH_IOCTL_ASSEMBLE:
@@ -141,18 +244,25 @@ static long bch2_global_ioctl(unsigned cmd, void __user *arg)
case BCH_IOCTL_INCREMENTAL:
return bch2_ioctl_incremental(arg);
#endif
+ case BCH_IOCTL_FSCK_OFFLINE: {
+ ret = bch2_ioctl_fsck_offline(arg);
+ break;
+ }
default:
- return -ENOTTY;
+ ret = -ENOTTY;
+ break;
}
+
+ if (ret < 0)
+ ret = bch2_err_class(ret);
+ return ret;
}
static long bch2_ioctl_query_uuid(struct bch_fs *c,
struct bch_ioctl_query_uuid __user *user_arg)
{
- if (copy_to_user(&user_arg->uuid, &c->sb.user_uuid,
- sizeof(c->sb.user_uuid)))
- return -EFAULT;
- return 0;
+ return copy_to_user_errcode(&user_arg->uuid, &c->sb.user_uuid,
+ sizeof(c->sb.user_uuid));
}
#if 0
@@ -295,31 +405,27 @@ static long bch2_ioctl_disk_set_state(struct bch_fs *c,
}
struct bch_data_ctx {
+ struct thread_with_file thr;
+
struct bch_fs *c;
struct bch_ioctl_data arg;
struct bch_move_stats stats;
-
- int ret;
-
- struct task_struct *thread;
};
static int bch2_data_thread(void *arg)
{
- struct bch_data_ctx *ctx = arg;
-
- ctx->ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
+ struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr);
+ ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg);
ctx->stats.data_type = U8_MAX;
return 0;
}
static int bch2_data_job_release(struct inode *inode, struct file *file)
{
- struct bch_data_ctx *ctx = file->private_data;
+ struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
- kthread_stop(ctx->thread);
- put_task_struct(ctx->thread);
+ bch2_thread_with_file_exit(&ctx->thr);
kfree(ctx);
return 0;
}
@@ -327,7 +433,7 @@ static int bch2_data_job_release(struct inode *inode, struct file *file)
static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
- struct bch_data_ctx *ctx = file->private_data;
+ struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr);
struct bch_fs *c = ctx->c;
struct bch_ioctl_data_event e = {
.type = BCH_DATA_EVENT_PROGRESS,
@@ -341,10 +447,7 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf,
if (len < sizeof(e))
return -EINVAL;
- if (copy_to_user(buf, &e, sizeof(e)))
- return -EFAULT;
-
- return sizeof(e);
+ return copy_to_user_errcode(buf, &e, sizeof(e)) ?: sizeof(e);
}
static const struct file_operations bcachefs_data_ops = {
@@ -356,10 +459,8 @@ static const struct file_operations bcachefs_data_ops = {
static long bch2_ioctl_data(struct bch_fs *c,
struct bch_ioctl_data arg)
{
- struct bch_data_ctx *ctx = NULL;
- struct file *file = NULL;
- unsigned flags = O_RDONLY|O_CLOEXEC|O_NONBLOCK;
- int ret, fd = -1;
+ struct bch_data_ctx *ctx;
+ int ret;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -374,36 +475,11 @@ static long bch2_ioctl_data(struct bch_fs *c,
ctx->c = c;
ctx->arg = arg;
- ctx->thread = kthread_create(bch2_data_thread, ctx,
- "bch-data/%s", c->name);
- if (IS_ERR(ctx->thread)) {
- ret = PTR_ERR(ctx->thread);
- goto err;
- }
-
- ret = get_unused_fd_flags(flags);
+ ret = bch2_run_thread_with_file(&ctx->thr,
+ &bcachefs_data_ops,
+ bch2_data_thread);
if (ret < 0)
- goto err;
- fd = ret;
-
- file = anon_inode_getfile("[bcachefs]", &bcachefs_data_ops, ctx, flags);
- if (IS_ERR(file)) {
- ret = PTR_ERR(file);
- goto err;
- }
-
- fd_install(fd, file);
-
- get_task_struct(ctx->thread);
- wake_up_process(ctx->thread);
-
- return fd;
-err:
- if (fd >= 0)
- put_unused_fd(fd);
- if (!IS_ERR_OR_NULL(ctx->thread))
- kthread_stop(ctx->thread);
- kfree(ctx);
+ kfree(ctx);
return ret;
}
@@ -417,7 +493,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
unsigned i;
int ret = 0;
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EINVAL;
if (get_user(replica_entries_bytes, &user_arg->replica_entries_bytes))
@@ -444,7 +520,7 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
dst_end = (void *) arg->replicas + replica_entries_bytes;
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *src_e =
+ struct bch_replicas_entry_v1 *src_e =
cpu_replicas_entry(&c->replicas, i);
/* check that we have enough space for one replicas entry */
@@ -474,14 +550,15 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c,
if (ret)
goto err;
- if (copy_to_user(user_arg, arg,
- sizeof(*arg) + arg->replica_entries_bytes))
- ret = -EFAULT;
+
+ ret = copy_to_user_errcode(user_arg, arg,
+ sizeof(*arg) + arg->replica_entries_bytes);
err:
kfree(arg);
return ret;
}
+/* obsolete, didn't allow for new data types: */
static long bch2_ioctl_dev_usage(struct bch_fs *c,
struct bch_ioctl_dev_usage __user *user_arg)
{
@@ -490,7 +567,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
struct bch_dev *ca;
unsigned i;
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EINVAL;
if (copy_from_user(&arg, user_arg, sizeof(arg)))
@@ -511,7 +588,6 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
arg.state = ca->mi.state;
arg.bucket_size = ca->mi.bucket_size;
arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
- arg.buckets_ec = src.buckets_ec;
for (i = 0; i < BCH_DATA_NR; i++) {
arg.d[i].buckets = src.d[i].buckets;
@@ -521,10 +597,58 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c,
percpu_ref_put(&ca->ref);
- if (copy_to_user(user_arg, &arg, sizeof(arg)))
+ return copy_to_user_errcode(user_arg, &arg, sizeof(arg));
+}
+
+static long bch2_ioctl_dev_usage_v2(struct bch_fs *c,
+ struct bch_ioctl_dev_usage_v2 __user *user_arg)
+{
+ struct bch_ioctl_dev_usage_v2 arg;
+ struct bch_dev_usage src;
+ struct bch_dev *ca;
+ int ret = 0;
+
+ if (!test_bit(BCH_FS_started, &c->flags))
+ return -EINVAL;
+
+ if (copy_from_user(&arg, user_arg, sizeof(arg)))
return -EFAULT;
- return 0;
+ if ((arg.flags & ~BCH_BY_INDEX) ||
+ arg.pad[0] ||
+ arg.pad[1] ||
+ arg.pad[2])
+ return -EINVAL;
+
+ ca = bch2_device_lookup(c, arg.dev, arg.flags);
+ if (IS_ERR(ca))
+ return PTR_ERR(ca);
+
+ src = bch2_dev_usage_read(ca);
+
+ arg.state = ca->mi.state;
+ arg.bucket_size = ca->mi.bucket_size;
+ arg.nr_data_types = min(arg.nr_data_types, BCH_DATA_NR);
+ arg.nr_buckets = ca->mi.nbuckets - ca->mi.first_bucket;
+
+ ret = copy_to_user_errcode(user_arg, &arg, sizeof(arg));
+ if (ret)
+ goto err;
+
+ for (unsigned i = 0; i < arg.nr_data_types; i++) {
+ struct bch_ioctl_dev_usage_type t = {
+ .buckets = src.d[i].buckets,
+ .sectors = src.d[i].sectors,
+ .fragmented = src.d[i].fragmented,
+ };
+
+ ret = copy_to_user_errcode(&user_arg->d[i], &t, sizeof(t));
+ if (ret)
+ goto err;
+ }
+err:
+ percpu_ref_put(&ca->ref);
+ return ret;
}
static long bch2_ioctl_read_super(struct bch_fs *c,
@@ -561,9 +685,8 @@ static long bch2_ioctl_read_super(struct bch_fs *c,
goto err;
}
- if (copy_to_user((void __user *)(unsigned long)arg.sb, sb,
- vstruct_bytes(sb)))
- ret = -EFAULT;
+ ret = copy_to_user_errcode((void __user *)(unsigned long)arg.sb, sb,
+ vstruct_bytes(sb));
err:
if (!IS_ERR_OR_NULL(ca))
percpu_ref_put(&ca->ref);
@@ -575,8 +698,6 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
struct bch_ioctl_disk_get_idx arg)
{
dev_t dev = huge_decode_dev(arg.dev);
- struct bch_dev *ca;
- unsigned i;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -584,10 +705,10 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c,
if (!dev)
return -EINVAL;
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
if (ca->dev == dev) {
percpu_ref_put(&ca->io_ref);
- return i;
+ return ca->dev_idx;
}
return -BCH_ERR_ENOENT_dev_idx_not_found;
@@ -642,6 +763,97 @@ static long bch2_ioctl_disk_resize_journal(struct bch_fs *c,
return ret;
}
+static int bch2_fsck_online_thread_fn(void *arg)
+{
+ struct fsck_thread *thr = container_of(arg, struct fsck_thread, thr);
+ struct bch_fs *c = thr->c;
+
+ c->stdio_filter = current;
+ c->stdio = &thr->thr.stdio;
+
+ /*
+ * XXX: can we figure out a way to do this without mucking with c->opts?
+ */
+ unsigned old_fix_errors = c->opts.fix_errors;
+ if (opt_defined(thr->opts, fix_errors))
+ c->opts.fix_errors = thr->opts.fix_errors;
+ else
+ c->opts.fix_errors = FSCK_FIX_ask;
+
+ c->opts.fsck = true;
+ set_bit(BCH_FS_fsck_running, &c->flags);
+
+ c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
+ int ret = bch2_run_online_recovery_passes(c);
+
+ clear_bit(BCH_FS_fsck_running, &c->flags);
+ bch_err_fn(c, ret);
+
+ c->stdio = NULL;
+ c->stdio_filter = NULL;
+ c->opts.fix_errors = old_fix_errors;
+
+ thread_with_stdio_done(&thr->thr);
+
+ up(&c->online_fsck_mutex);
+ bch2_ro_ref_put(c);
+ return 0;
+}
+
+static long bch2_ioctl_fsck_online(struct bch_fs *c,
+ struct bch_ioctl_fsck_online arg)
+{
+ struct fsck_thread *thr = NULL;
+ long ret = 0;
+
+ if (arg.flags)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!bch2_ro_ref_tryget(c))
+ return -EROFS;
+
+ if (down_trylock(&c->online_fsck_mutex)) {
+ bch2_ro_ref_put(c);
+ return -EAGAIN;
+ }
+
+ thr = kzalloc(sizeof(*thr), GFP_KERNEL);
+ if (!thr) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ thr->c = c;
+ thr->opts = bch2_opts_empty();
+
+ if (arg.opts) {
+ char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16);
+
+ ret = PTR_ERR_OR_ZERO(optstr) ?:
+ bch2_parse_mount_opts(c, &thr->opts, optstr);
+ kfree(optstr);
+
+ if (ret)
+ goto err;
+ }
+
+ ret = bch2_run_thread_with_stdio(&thr->thr,
+ bch2_fsck_thread_exit,
+ bch2_fsck_online_thread_fn);
+err:
+ if (ret < 0) {
+ bch_err_fn(c, ret);
+ if (thr)
+ bch2_fsck_thread_exit(&thr->thr);
+ up(&c->online_fsck_mutex);
+ bch2_ro_ref_put(c);
+ }
+ return ret;
+}
+
#define BCH_IOCTL(_name, _argtype) \
do { \
_argtype i; \
@@ -663,6 +875,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
return bch2_ioctl_fs_usage(c, arg);
case BCH_IOCTL_DEV_USAGE:
return bch2_ioctl_dev_usage(c, arg);
+ case BCH_IOCTL_DEV_USAGE_V2:
+ return bch2_ioctl_dev_usage_v2(c, arg);
#if 0
case BCH_IOCTL_START:
BCH_IOCTL(start, struct bch_ioctl_start);
@@ -675,7 +889,7 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(disk_get_idx, struct bch_ioctl_disk_get_idx);
}
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EINVAL;
switch (cmd) {
@@ -695,7 +909,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg)
BCH_IOCTL(disk_resize, struct bch_ioctl_disk_resize);
case BCH_IOCTL_DISK_RESIZE_JOURNAL:
BCH_IOCTL(disk_resize_journal, struct bch_ioctl_disk_resize_journal);
-
+ case BCH_IOCTL_FSCK_ONLINE:
+ BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online);
default:
return -ENOTTY;
}
diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h
index 13998388c545..1b8c2c1016dc 100644
--- a/fs/bcachefs/checksum.h
+++ b/fs/bcachefs/checksum.h
@@ -45,6 +45,29 @@ struct bch_csum bch2_checksum(struct bch_fs *, unsigned, struct nonce,
bch2_checksum(_c, _type, _nonce, _start, vstruct_end(_i) - _start);\
})
+static inline void bch2_csum_to_text(struct printbuf *out,
+ enum bch_csum_type type,
+ struct bch_csum csum)
+{
+ const u8 *p = (u8 *) &csum;
+ unsigned bytes = type < BCH_CSUM_NR ? bch_crc_bytes[type] : 16;
+
+ for (unsigned i = 0; i < bytes; i++)
+ prt_hex_byte(out, p[i]);
+}
+
+static inline void bch2_csum_err_msg(struct printbuf *out,
+ enum bch_csum_type type,
+ struct bch_csum expected,
+ struct bch_csum got)
+{
+ prt_printf(out, "checksum error: got ");
+ bch2_csum_to_text(out, type, got);
+ prt_str(out, " should be ");
+ bch2_csum_to_text(out, type, expected);
+ prt_printf(out, " type %s", bch2_csum_types[type]);
+}
+
int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t);
int bch2_request_key(struct bch_sb *, struct bch_key *);
#ifndef __KERNEL__
diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c
index f41889093a2c..363644451106 100644
--- a/fs/bcachefs/clock.c
+++ b/fs/bcachefs/clock.c
@@ -109,7 +109,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
if (cpu_timeout != MAX_SCHEDULE_TIMEOUT)
mod_timer(&wait.cpu_timer, cpu_timeout + jiffies);
- while (1) {
+ do {
set_current_state(TASK_INTERRUPTIBLE);
if (kthread && kthread_should_stop())
break;
@@ -119,7 +119,7 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock,
schedule();
try_to_freeze();
- }
+ } while (0);
__set_current_state(TASK_RUNNING);
del_timer_sync(&wait.cpu_timer);
diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c
index 51af8ea230ed..33df8cf86bd8 100644
--- a/fs/bcachefs/compress.c
+++ b/fs/bcachefs/compress.c
@@ -572,10 +572,6 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features)
ZSTD_parameters params = zstd_get_params(zstd_max_clevel(),
c->opts.encoded_extent_max);
- /*
- * ZSTD is lying: if we allocate the size of the workspace it says it
- * requires, it returns memory allocation errors
- */
c->zstd_workspace_size = zstd_cctx_workspace_bound(&params.cParams);
struct {
diff --git a/fs/bcachefs/compress.h b/fs/bcachefs/compress.h
index 607fd5e232c9..58c2eb45570f 100644
--- a/fs/bcachefs/compress.h
+++ b/fs/bcachefs/compress.h
@@ -47,6 +47,14 @@ static inline enum bch_compression_type bch2_compression_opt_to_type(unsigned v)
return __bch2_compression_opt_to_type[bch2_compression_decode(v).type];
}
+static inline void bch2_prt_compression_type(struct printbuf *out, enum bch_compression_type type)
+{
+ if (type < BCH_COMPRESSION_TYPE_NR)
+ prt_str(out, __bch2_compression_types[type]);
+ else
+ prt_printf(out, "(invalid compression type %u)", type);
+}
+
int bch2_bio_uncompress_inplace(struct bch_fs *, struct bio *,
struct bch_extent_crc_unpacked *);
int bch2_bio_uncompress(struct bch_fs *, struct bio *, struct bio *,
diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h
index e367c625f057..4b340d13caac 100644
--- a/fs/bcachefs/darray.h
+++ b/fs/bcachefs/darray.h
@@ -20,6 +20,7 @@ struct { \
#define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0)
typedef DARRAY(char) darray_char;
+typedef DARRAY(char *) darray_str;
int __bch2_darray_resize(darray_char *, size_t, size_t, gfp_t);
@@ -81,11 +82,14 @@ static inline int __darray_make_room(darray_char *d, size_t t_size, size_t more,
#define darray_remove_item(_d, _pos) \
array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data)
+#define __darray_for_each(_d, _i) \
+ for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++)
+
#define darray_for_each(_d, _i) \
- for (_i = (_d).data; _i < (_d).data + (_d).nr; _i++)
+ for (typeof(&(_d).data[0]) _i = (_d).data; _i < (_d).data + (_d).nr; _i++)
#define darray_for_each_reverse(_d, _i) \
- for (_i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
+ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data; --_i)
#define darray_init(_d) \
do { \
diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c
index 37d6ecae8c30..4150feca42a2 100644
--- a/fs/bcachefs/data_update.c
+++ b/fs/bcachefs/data_update.c
@@ -267,19 +267,31 @@ restart_drop_extra_replicas:
goto out;
}
+ if (trace_data_update_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "\nold: ");
+ bch2_bkey_val_to_text(&buf, c, old);
+ prt_str(&buf, "\nk: ");
+ bch2_bkey_val_to_text(&buf, c, k);
+ prt_str(&buf, "\nnew: ");
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert));
+
+ trace_data_update(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, bkey_start_pos(&insert->k)) ?:
bch2_insert_snapshot_whiteouts(trans, m->btree_id,
k.k->p, insert->k.p) ?:
- bch2_bkey_set_needs_rebalance(c, insert,
- op->opts.background_target,
- op->opts.background_compression) ?:
+ bch2_bkey_set_needs_rebalance(c, insert, &op->opts) ?:
bch2_trans_update(trans, &iter, insert,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
bch2_trans_commit(trans, &op->res,
NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL|
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc|
m->data_opts.btree_insert_flags);
if (!ret) {
bch2_btree_iter_set_pos(&iter, next_pos);
@@ -300,14 +312,14 @@ next:
}
continue;
nowork:
- if (m->stats && m->stats) {
+ if (m->stats) {
BUG_ON(k.k->p.offset <= iter.pos.offset);
atomic64_inc(&m->stats->keys_raced);
atomic64_add(k.k->p.offset - iter.pos.offset,
&m->stats->sectors_raced);
}
- this_cpu_inc(c->counters[BCH_COUNTER_move_extent_fail]);
+ count_event(c, move_extent_fail);
bch2_btree_iter_advance(&iter);
goto next;
@@ -342,7 +354,6 @@ void bch2_data_update_exit(struct data_update *update)
struct bch_fs *c = update->op.c;
struct bkey_ptrs_c ptrs =
bch2_bkey_ptrs_c(bkey_i_to_s_c(update->k.k));
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr) {
if (c->opts.nocow_enabled)
@@ -363,7 +374,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
struct bio *bio = &update->op.wbio.bio;
struct bkey_i_extent *e;
struct write_point *wp;
- struct bch_extent_ptr *ptr;
struct closure cl;
struct btree_iter iter;
struct bkey_s_c k;
@@ -404,6 +414,8 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans,
continue;
}
+ bch_err_fn_ratelimited(c, ret);
+
if (ret)
return;
@@ -476,7 +488,7 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans,
return bch2_trans_relock(trans) ?:
bch2_trans_update(trans, iter, n, BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
int bch2_data_update_init(struct btree_trans *trans,
@@ -493,7 +505,6 @@ int bch2_data_update_init(struct btree_trans *trans,
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
struct extent_ptr_decoded p;
- const struct bch_extent_ptr *ptr;
unsigned i, reserve_sectors = k.k->size * data_opts.extra_replicas;
unsigned ptrs_locked = 0;
int ret = 0;
@@ -516,7 +527,7 @@ int bch2_data_update_init(struct btree_trans *trans,
BCH_WRITE_DATA_ENCODED|
BCH_WRITE_MOVE|
m->data_opts.write_flags;
- m->op.compression_opt = io_opts.background_compression ?: io_opts.compression;
+ m->op.compression_opt = background_compression(io_opts);
m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK;
bkey_for_each_ptr(ptrs, ptr)
@@ -639,7 +650,6 @@ done:
void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c
index 57c5128db173..7bdba8507fc9 100644
--- a/fs/bcachefs/debug.c
+++ b/fs/bcachefs/debug.c
@@ -44,19 +44,19 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b,
return false;
bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_sorted, btree_bytes(c)),
+ buf_pages(n_sorted, btree_buf_bytes(b)),
REQ_OP_READ|REQ_META,
GFP_NOFS,
&c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_sorted, btree_bytes(c));
+ bch2_bio_map(bio, n_sorted, btree_buf_bytes(b));
submit_bio_wait(bio);
bio_put(bio);
percpu_ref_put(&ca->io_ref);
- memcpy(n_ondisk, n_sorted, btree_bytes(c));
+ memcpy(n_ondisk, n_sorted, btree_buf_bytes(b));
v->written = 0;
if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error)
@@ -137,7 +137,7 @@ void __bch2_btree_verify(struct bch_fs *c, struct btree *b)
mutex_lock(&c->verify_lock);
if (!c->verify_ondisk) {
- c->verify_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ c->verify_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!c->verify_ondisk)
goto out;
}
@@ -199,19 +199,19 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
return;
}
- n_ondisk = kvpmalloc(btree_bytes(c), GFP_KERNEL);
+ n_ondisk = kvpmalloc(btree_buf_bytes(b), GFP_KERNEL);
if (!n_ondisk) {
prt_printf(out, "memory allocation failure\n");
goto out;
}
bio = bio_alloc_bioset(ca->disk_sb.bdev,
- buf_pages(n_ondisk, btree_bytes(c)),
+ buf_pages(n_ondisk, btree_buf_bytes(b)),
REQ_OP_READ|REQ_META,
GFP_NOFS,
&c->btree_bio);
bio->bi_iter.bi_sector = pick.ptr.offset;
- bch2_bio_map(bio, n_ondisk, btree_bytes(c));
+ bch2_bio_map(bio, n_ondisk, btree_buf_bytes(b));
ret = submit_bio_wait(bio);
if (ret) {
@@ -293,7 +293,7 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c,
out:
if (bio)
bio_put(bio);
- kvpfree(n_ondisk, btree_bytes(c));
+ kvpfree(n_ondisk, btree_buf_bytes(b));
percpu_ref_put(&ca->io_ref);
}
@@ -366,35 +366,23 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- ret = flush_buf(i);
- if (ret)
- return ret;
-
- trans = bch2_trans_get(i->c);
- ret = for_each_btree_key2(trans, iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
- bch2_bkey_val_to_text(&i->buf, i->c, k);
- prt_newline(&i->buf);
- drop_locks_do(trans, flush_buf(i));
- }));
- i->from = iter.pos;
-
- bch2_trans_put(trans);
-
- if (!ret)
- ret = flush_buf(i);
-
- return ret ?: i->ret;
+ return flush_buf(i) ?:
+ bch2_trans_run(i->c,
+ for_each_btree_key(trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ bch2_bkey_val_to_text(&i->buf, i->c, k);
+ prt_newline(&i->buf);
+ bch2_trans_unlock(trans);
+ i->from = bpos_successor(iter.pos);
+ flush_buf(i);
+ }))) ?:
+ i->ret;
}
static const struct file_operations btree_debug_ops = {
@@ -462,44 +450,32 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- ssize_t ret;
i->ubuf = buf;
i->size = size;
i->ret = 0;
- ret = flush_buf(i);
- if (ret)
- return ret;
-
- trans = bch2_trans_get(i->c);
-
- ret = for_each_btree_key2(trans, iter, i->id, i->from,
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
- struct btree_path_level *l = &iter.path->l[0];
- struct bkey_packed *_k =
- bch2_btree_node_iter_peek(&l->iter, l->b);
-
- if (bpos_gt(l->b->key.k.p, i->prev_node)) {
- bch2_btree_node_to_text(&i->buf, i->c, l->b);
- i->prev_node = l->b->key.k.p;
- }
-
- bch2_bfloat_to_text(&i->buf, l->b, _k);
- drop_locks_do(trans, flush_buf(i));
- }));
- i->from = iter.pos;
-
- bch2_trans_put(trans);
-
- if (!ret)
- ret = flush_buf(i);
-
- return ret ?: i->ret;
+ return flush_buf(i) ?:
+ bch2_trans_run(i->c,
+ for_each_btree_key(trans, iter, i->id, i->from,
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ struct btree_path_level *l =
+ &btree_iter_path(trans, &iter)->l[0];
+ struct bkey_packed *_k =
+ bch2_btree_node_iter_peek(&l->iter, l->b);
+
+ if (bpos_gt(l->b->key.k.p, i->prev_node)) {
+ bch2_btree_node_to_text(&i->buf, i->c, l->b);
+ i->prev_node = l->b->key.k.p;
+ }
+
+ bch2_bfloat_to_text(&i->buf, l->b, _k);
+ bch2_trans_unlock(trans);
+ i->from = bpos_successor(iter.pos);
+ flush_buf(i);
+ }))) ?:
+ i->ret;
}
static const struct file_operations bfloat_failed_debug_ops = {
@@ -616,7 +592,6 @@ static const struct file_operations cached_btree_nodes_ops = {
.read = bch2_cached_btree_nodes_read,
};
-#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
{
@@ -632,7 +607,9 @@ static ssize_t bch2_btree_transactions_read(struct file *file, char __user *buf,
restart:
seqmutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
- if (trans->locking_wait.task->pid <= i->iter)
+ struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+
+ if (!task || task->pid <= i->iter)
continue;
closure_get(&trans->ref);
@@ -650,11 +627,11 @@ restart:
prt_printf(&i->buf, "backtrace:");
prt_newline(&i->buf);
printbuf_indent_add(&i->buf, 2);
- bch2_prt_task_backtrace(&i->buf, trans->locking_wait.task);
+ bch2_prt_task_backtrace(&i->buf, task, 0, GFP_KERNEL);
printbuf_indent_sub(&i->buf, 2);
prt_newline(&i->buf);
- i->iter = trans->locking_wait.task->pid;
+ i->iter = task->pid;
closure_put(&trans->ref);
@@ -678,7 +655,6 @@ static const struct file_operations btree_transactions_ops = {
.release = bch2_dump_release,
.read = bch2_btree_transactions_read,
};
-#endif /* CONFIG_BCACHEFS_DEBUG_TRANSACTIONS */
static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf,
size_t size, loff_t *ppos)
@@ -717,7 +693,7 @@ static const struct file_operations journal_pins_ops = {
.read = bch2_journal_pins_read,
};
-static int lock_held_stats_open(struct inode *inode, struct file *file)
+static int btree_transaction_stats_open(struct inode *inode, struct file *file)
{
struct bch_fs *c = inode->i_private;
struct dump_iter *i;
@@ -727,7 +703,7 @@ static int lock_held_stats_open(struct inode *inode, struct file *file)
if (!i)
return -ENOMEM;
- i->iter = 0;
+ i->iter = 1;
i->c = c;
i->buf = PRINTBUF;
file->private_data = i;
@@ -735,7 +711,7 @@ static int lock_held_stats_open(struct inode *inode, struct file *file)
return 0;
}
-static int lock_held_stats_release(struct inode *inode, struct file *file)
+static int btree_transaction_stats_release(struct inode *inode, struct file *file)
{
struct dump_iter *i = file->private_data;
@@ -745,8 +721,8 @@ static int lock_held_stats_release(struct inode *inode, struct file *file)
return 0;
}
-static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
- size_t size, loff_t *ppos)
+static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf,
+ size_t size, loff_t *ppos)
{
struct dump_iter *i = file->private_data;
struct bch_fs *c = i->c;
@@ -779,6 +755,13 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
prt_printf(&i->buf, "Max mem used: %u", s->max_mem);
prt_newline(&i->buf);
+ prt_printf(&i->buf, "Transaction duration:");
+ prt_newline(&i->buf);
+
+ printbuf_indent_add(&i->buf, 2);
+ bch2_time_stats_to_text(&i->buf, &s->duration);
+ printbuf_indent_sub(&i->buf, 2);
+
if (IS_ENABLED(CONFIG_BCACHEFS_LOCK_TIME_STATS)) {
prt_printf(&i->buf, "Lock hold times:");
prt_newline(&i->buf);
@@ -810,11 +793,11 @@ static ssize_t lock_held_stats_read(struct file *file, char __user *buf,
return i->ret;
}
-static const struct file_operations lock_held_stats_op = {
- .owner = THIS_MODULE,
- .open = lock_held_stats_open,
- .release = lock_held_stats_release,
- .read = lock_held_stats_read,
+static const struct file_operations btree_transaction_stats_op = {
+ .owner = THIS_MODULE,
+ .open = btree_transaction_stats_open,
+ .release = btree_transaction_stats_release,
+ .read = btree_transaction_stats_read,
};
static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
@@ -835,7 +818,9 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf,
restart:
seqmutex_lock(&c->btree_trans_lock);
list_for_each_entry(trans, &c->btree_trans_list, list) {
- if (trans->locking_wait.task->pid <= i->iter)
+ struct task_struct *task = READ_ONCE(trans->locking_wait.task);
+
+ if (!task || task->pid <= i->iter)
continue;
closure_get(&trans->ref);
@@ -850,7 +835,7 @@ restart:
bch2_check_for_deadlock(trans, &i->buf);
- i->iter = trans->locking_wait.task->pid;
+ i->iter = task->pid;
closure_put(&trans->ref);
@@ -897,16 +882,14 @@ void bch2_fs_debug_init(struct bch_fs *c)
debugfs_create_file("cached_btree_nodes", 0400, c->fs_debug_dir,
c->btree_debug, &cached_btree_nodes_ops);
-#ifdef CONFIG_BCACHEFS_DEBUG_TRANSACTIONS
debugfs_create_file("btree_transactions", 0400, c->fs_debug_dir,
c->btree_debug, &btree_transactions_ops);
-#endif
debugfs_create_file("journal_pins", 0400, c->fs_debug_dir,
c->btree_debug, &journal_pins_ops);
debugfs_create_file("btree_transaction_stats", 0400, c->fs_debug_dir,
- c, &lock_held_stats_op);
+ c, &btree_transaction_stats_op);
debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir,
c->btree_debug, &btree_deadlock_ops);
@@ -947,8 +930,6 @@ void bch2_debug_exit(void)
int __init bch2_debug_init(void)
{
- int ret = 0;
-
bch_debug = debugfs_create_dir("bcachefs", NULL);
- return ret;
+ return 0;
}
diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c
index 2bfff0da7000..4ae1e9f002a0 100644
--- a/fs/bcachefs/dirent.c
+++ b/fs/bcachefs/dirent.c
@@ -65,7 +65,7 @@ static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
const struct qstr l_name = bch2_dirent_get_name(l);
const struct qstr *r_name = _r;
- return l_name.len - r_name->len ?: memcmp(l_name.name, r_name->name, l_name.len);
+ return !qstr_eq(l_name, *r_name);
}
static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
@@ -75,7 +75,7 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r)
const struct qstr l_name = bch2_dirent_get_name(l);
const struct qstr r_name = bch2_dirent_get_name(r);
- return l_name.len - r_name.len ?: memcmp(l_name.name, r_name.name, l_name.len);
+ return !qstr_eq(l_name, r_name);
}
static bool dirent_is_visible(subvol_inum inum, struct bkey_s_c k)
@@ -198,10 +198,39 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
return dirent;
}
+int bch2_dirent_create_snapshot(struct btree_trans *trans,
+ u64 dir, u32 snapshot,
+ const struct bch_hash_info *hash_info,
+ u8 type, const struct qstr *name, u64 dst_inum,
+ u64 *dir_offset,
+ bch_str_hash_flags_t str_hash_flags)
+{
+ subvol_inum zero_inum = { 0 };
+ struct bkey_i_dirent *dirent;
+ int ret;
+
+ dirent = dirent_create_key(trans, zero_inum, type, name, dst_inum);
+ ret = PTR_ERR_OR_ZERO(dirent);
+ if (ret)
+ return ret;
+
+ dirent->k.p.inode = dir;
+ dirent->k.p.snapshot = snapshot;
+
+ ret = bch2_hash_set_snapshot(trans, bch2_dirent_hash_desc, hash_info,
+ zero_inum, snapshot,
+ &dirent->k_i, str_hash_flags,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+ *dir_offset = dirent->k.p.offset;
+
+ return ret;
+}
+
int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
const struct bch_hash_info *hash_info,
u8 type, const struct qstr *name, u64 dst_inum,
- u64 *dir_offset, int flags)
+ u64 *dir_offset,
+ bch_str_hash_flags_t str_hash_flags)
{
struct bkey_i_dirent *dirent;
int ret;
@@ -212,7 +241,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir,
return ret;
ret = bch2_hash_set(trans, bch2_dirent_hash_desc, hash_info,
- dir, &dirent->k_i, flags);
+ dir, &dirent->k_i, str_hash_flags);
*dir_offset = dirent->k.p.offset;
return ret;
@@ -470,17 +499,11 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir,
const struct qstr *name, subvol_inum *inum)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- int ret;
-retry:
- bch2_trans_begin(trans);
+ struct btree_iter iter = { NULL };
- ret = __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info,
- name, inum, 0);
- if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
- goto retry;
- if (!ret)
- bch2_trans_iter_exit(trans, &iter);
+ int ret = lockrestart_do(trans,
+ __bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0));
+ bch2_trans_iter_exit(trans, &iter);
bch2_trans_put(trans);
return ret;
}
diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h
index 1e3431990abd..21ffeb78f02e 100644
--- a/fs/bcachefs/dirent.h
+++ b/fs/bcachefs/dirent.h
@@ -35,9 +35,14 @@ static inline unsigned dirent_val_u64s(unsigned len)
int bch2_dirent_read_target(struct btree_trans *, subvol_inum,
struct bkey_s_c_dirent, subvol_inum *);
+int bch2_dirent_create_snapshot(struct btree_trans *, u64, u32,
+ const struct bch_hash_info *, u8,
+ const struct qstr *, u64, u64 *,
+ bch_str_hash_flags_t);
int bch2_dirent_create(struct btree_trans *, subvol_inum,
const struct bch_hash_info *, u8,
- const struct qstr *, u64, u64 *, int);
+ const struct qstr *, u64, u64 *,
+ bch_str_hash_flags_t);
static inline unsigned vfs_d_type(unsigned type)
{
diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h
new file mode 100644
index 000000000000..5e116b88e814
--- /dev/null
+++ b/fs/bcachefs/dirent_format.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_DIRENT_FORMAT_H
+#define _BCACHEFS_DIRENT_FORMAT_H
+
+/*
+ * Dirents (and xattrs) have to implement string lookups; since our b-tree
+ * doesn't support arbitrary length strings for the key, we instead index by a
+ * 64 bit hash (currently truncated sha1) of the string, stored in the offset
+ * field of the key - using linear probing to resolve hash collisions. This also
+ * provides us with the readdir cookie posix requires.
+ *
+ * Linear probing requires us to use whiteouts for deletions, in the event of a
+ * collision:
+ */
+
+struct bch_dirent {
+ struct bch_val v;
+
+ /* Target inode number: */
+ union {
+ __le64 d_inum;
+ struct { /* DT_SUBVOL */
+ __le32 d_child_subvol;
+ __le32 d_parent_subvol;
+ };
+ };
+
+ /*
+ * Copy of mode bits 12-15 from the target inode - so userspace can get
+ * the filetype without having to do a stat()
+ */
+ __u8 d_type;
+
+ __u8 d_name[];
+} __packed __aligned(8);
+
+#define DT_SUBVOL 16
+#define BCH_DT_MAX 17
+
+#define BCH_NAME_MAX 512
+
+#endif /* _BCACHEFS_DIRENT_FORMAT_H */
diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c
index 4d0cb0ccff32..06a7df529b40 100644
--- a/fs/bcachefs/disk_groups.c
+++ b/fs/bcachefs/disk_groups.c
@@ -89,19 +89,14 @@ err:
void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
{
- struct bch_disk_groups_cpu *g;
- struct bch_dev *ca;
- int i;
- unsigned iter;
-
out->atomic++;
rcu_read_lock();
- g = rcu_dereference(c->disk_groups);
+ struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups);
if (!g)
goto out;
- for (i = 0; i < g->nr; i++) {
+ for (unsigned i = 0; i < g->nr; i++) {
if (i)
prt_printf(out, " ");
@@ -111,7 +106,7 @@ void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c)
}
prt_printf(out, "[parent %d devs", g->entries[i].parent);
- for_each_member_device_rcu(ca, c, iter, &g->entries[i].devs)
+ for_each_member_device_rcu(c, ca, &g->entries[i].devs)
prt_printf(out, " %s", ca->name);
prt_printf(out, "]");
}
@@ -562,7 +557,7 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v)
: NULL;
if (ca && percpu_ref_tryget(&ca->io_ref)) {
- prt_printf(out, "/dev/%pg", ca->disk_sb.bdev);
+ prt_printf(out, "/dev/%s", ca->name);
percpu_ref_put(&ca->io_ref);
} else if (ca) {
prt_printf(out, "offline device %u", t.dev);
diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c
index 2a77de18c004..d503af270024 100644
--- a/fs/bcachefs/ec.c
+++ b/fs/bcachefs/ec.c
@@ -3,6 +3,7 @@
/* erasure coding */
#include "bcachefs.h"
+#include "alloc_background.h"
#include "alloc_foreground.h"
#include "backpointers.h"
#include "bkey_buf.h"
@@ -156,12 +157,311 @@ void bch2_stripe_to_text(struct printbuf *out, struct bch_fs *c,
}
}
+/* Triggers: */
+
+static int bch2_trans_mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c_stripe s,
+ unsigned idx, bool deleting)
+{
+ struct bch_fs *c = trans->c;
+ const struct bch_extent_ptr *ptr = &s.v->ptrs[idx];
+ struct btree_iter iter;
+ struct bkey_i_alloc_v4 *a;
+ enum bch_data_type data_type = idx >= s.v->nr_blocks - s.v->nr_redundant
+ ? BCH_DATA_parity : 0;
+ s64 sectors = data_type ? le16_to_cpu(s.v->sectors) : 0;
+ int ret = 0;
+
+ if (deleting)
+ sectors = -sectors;
+
+ a = bch2_trans_start_alloc_update(trans, &iter, PTR_BUCKET_POS(c, ptr));
+ if (IS_ERR(a))
+ return PTR_ERR(a);
+
+ ret = bch2_check_bucket_ref(trans, s.s_c, ptr, sectors, data_type,
+ a->v.gen, a->v.data_type,
+ a->v.dirty_sectors);
+ if (ret)
+ goto err;
+
+ if (!deleting) {
+ if (bch2_trans_inconsistent_on(a->v.stripe ||
+ a->v.stripe_redundancy, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: multiple stripes using same bucket (%u, %llu)",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_type_str(a->v.data_type),
+ a->v.dirty_sectors,
+ a->v.stripe, s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ if (bch2_trans_inconsistent_on(data_type && a->v.dirty_sectors, trans,
+ "bucket %llu:%llu gen %u data type %s dirty_sectors %u: data already in stripe bucket %llu",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ bch2_data_type_str(a->v.data_type),
+ a->v.dirty_sectors,
+ s.k->p.offset)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ a->v.stripe = s.k->p.offset;
+ a->v.stripe_redundancy = s.v->nr_redundant;
+ a->v.data_type = BCH_DATA_stripe;
+ } else {
+ if (bch2_trans_inconsistent_on(a->v.stripe != s.k->p.offset ||
+ a->v.stripe_redundancy != s.v->nr_redundant, trans,
+ "bucket %llu:%llu gen %u: not marked as stripe when deleting stripe %llu (got %u)",
+ iter.pos.inode, iter.pos.offset, a->v.gen,
+ s.k->p.offset, a->v.stripe)) {
+ ret = -EIO;
+ goto err;
+ }
+
+ a->v.stripe = 0;
+ a->v.stripe_redundancy = 0;
+ a->v.data_type = alloc_data_type(a->v, BCH_DATA_user);
+ }
+
+ a->v.dirty_sectors += sectors;
+ if (data_type)
+ a->v.data_type = !deleting ? data_type : 0;
+
+ ret = bch2_trans_update(trans, &iter, &a->k_i, 0);
+ if (ret)
+ goto err;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ return ret;
+}
+
+static int mark_stripe_bucket(struct btree_trans *trans,
+ struct bkey_s_c k,
+ unsigned ptr_idx,
+ unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
+ unsigned nr_data = s->nr_blocks - s->nr_redundant;
+ bool parity = ptr_idx >= nr_data;
+ enum bch_data_type data_type = parity ? BCH_DATA_parity : BCH_DATA_stripe;
+ s64 sectors = parity ? le16_to_cpu(s->sectors) : 0;
+ const struct bch_extent_ptr *ptr = s->ptrs + ptr_idx;
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bucket old, new, *g;
+ struct printbuf buf = PRINTBUF;
+ int ret = 0;
+
+ BUG_ON(!(flags & BTREE_TRIGGER_GC));
+
+ /* * XXX doesn't handle deletion */
+
+ percpu_down_read(&c->mark_lock);
+ g = PTR_GC_BUCKET(ca, ptr);
+
+ if (g->dirty_sectors ||
+ (g->stripe && g->stripe != k.k->p.offset)) {
+ bch2_fs_inconsistent(c,
+ "bucket %u:%zu gen %u: multiple stripes using same bucket\n%s",
+ ptr->dev, PTR_BUCKET_NR(ca, ptr), g->gen,
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
+ ret = -EINVAL;
+ goto err;
+ }
+
+ bucket_lock(g);
+ old = *g;
+
+ ret = bch2_check_bucket_ref(trans, k, ptr, sectors, data_type,
+ g->gen, g->data_type,
+ g->dirty_sectors);
+ if (ret)
+ goto err;
+
+ g->data_type = data_type;
+ g->dirty_sectors += sectors;
+
+ g->stripe = k.k->p.offset;
+ g->stripe_redundancy = s->nr_redundant;
+ new = *g;
+err:
+ bucket_unlock(g);
+ if (!ret)
+ bch2_dev_usage_update_m(c, ca, &old, &new);
+ percpu_up_read(&c->mark_lock);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+int bch2_trigger_stripe(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s _new,
+ unsigned flags)
+{
+ struct bkey_s_c new = _new.s_c;
+ struct bch_fs *c = trans->c;
+ u64 idx = new.k->p.offset;
+ const struct bch_stripe *old_s = old.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(old).v : NULL;
+ const struct bch_stripe *new_s = new.k->type == KEY_TYPE_stripe
+ ? bkey_s_c_to_stripe(new).v : NULL;
+
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ /*
+ * If the pointers aren't changing, we don't need to do anything:
+ */
+ if (new_s && old_s &&
+ new_s->nr_blocks == old_s->nr_blocks &&
+ new_s->nr_redundant == old_s->nr_redundant &&
+ !memcmp(old_s->ptrs, new_s->ptrs,
+ new_s->nr_blocks * sizeof(struct bch_extent_ptr)))
+ return 0;
+
+ BUG_ON(new_s && old_s &&
+ (new_s->nr_blocks != old_s->nr_blocks ||
+ new_s->nr_redundant != old_s->nr_redundant));
+
+ if (new_s) {
+ s64 sectors = le16_to_cpu(new_s->sectors);
+
+ struct bch_replicas_padded r;
+ bch2_bkey_to_replicas(&r.e, new);
+ int ret = bch2_update_replicas_list(trans, &r.e, sectors * new_s->nr_redundant);
+ if (ret)
+ return ret;
+ }
+
+ if (old_s) {
+ s64 sectors = -((s64) le16_to_cpu(old_s->sectors));
+
+ struct bch_replicas_padded r;
+ bch2_bkey_to_replicas(&r.e, old);
+ int ret = bch2_update_replicas_list(trans, &r.e, sectors * old_s->nr_redundant);
+ if (ret)
+ return ret;
+ }
+
+ unsigned nr_blocks = new_s ? new_s->nr_blocks : old_s->nr_blocks;
+ for (unsigned i = 0; i < nr_blocks; i++) {
+ if (new_s && old_s &&
+ !memcmp(&new_s->ptrs[i],
+ &old_s->ptrs[i],
+ sizeof(new_s->ptrs[i])))
+ continue;
+
+ if (new_s) {
+ int ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(new), i, false);
+ if (ret)
+ return ret;
+ }
+
+ if (old_s) {
+ int ret = bch2_trans_mark_stripe_bucket(trans,
+ bkey_s_c_to_stripe(old), i, true);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ if (flags & BTREE_TRIGGER_ATOMIC) {
+ struct stripe *m = genradix_ptr(&c->stripes, idx);
+
+ if (!m) {
+ struct printbuf buf1 = PRINTBUF;
+ struct printbuf buf2 = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf1, c, old);
+ bch2_bkey_val_to_text(&buf2, c, new);
+ bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n"
+ "old %s\n"
+ "new %s", idx, buf1.buf, buf2.buf);
+ printbuf_exit(&buf2);
+ printbuf_exit(&buf1);
+ bch2_inconsistent_error(c);
+ return -1;
+ }
+
+ if (!new_s) {
+ bch2_stripes_heap_del(c, m, idx);
+
+ memset(m, 0, sizeof(*m));
+ } else {
+ m->sectors = le16_to_cpu(new_s->sectors);
+ m->algorithm = new_s->algorithm;
+ m->nr_blocks = new_s->nr_blocks;
+ m->nr_redundant = new_s->nr_redundant;
+ m->blocks_nonempty = 0;
+
+ for (unsigned i = 0; i < new_s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(new_s, i);
+
+ if (!old_s)
+ bch2_stripes_heap_insert(c, m, idx);
+ else
+ bch2_stripes_heap_update(c, m, idx);
+ }
+ }
+
+ if (flags & BTREE_TRIGGER_GC) {
+ struct gc_stripe *m =
+ genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL);
+
+ if (!m) {
+ bch_err(c, "error allocating memory for gc_stripes, idx %llu",
+ idx);
+ return -BCH_ERR_ENOMEM_mark_stripe;
+ }
+ /*
+ * This will be wrong when we bring back runtime gc: we should
+ * be unmarking the old key and then marking the new key
+ */
+ m->alive = true;
+ m->sectors = le16_to_cpu(new_s->sectors);
+ m->nr_blocks = new_s->nr_blocks;
+ m->nr_redundant = new_s->nr_redundant;
+
+ for (unsigned i = 0; i < new_s->nr_blocks; i++)
+ m->ptrs[i] = new_s->ptrs[i];
+
+ bch2_bkey_to_replicas(&m->r.e, new);
+
+ /*
+ * gc recalculates this field from stripe ptr
+ * references:
+ */
+ memset(m->block_sectors, 0, sizeof(m->block_sectors));
+
+ for (unsigned i = 0; i < new_s->nr_blocks; i++) {
+ int ret = mark_stripe_bucket(trans, new, i, flags);
+ if (ret)
+ return ret;
+ }
+
+ int ret = bch2_update_replicas(c, new, &m->r.e,
+ ((s64) m->sectors * m->nr_redundant),
+ 0, true);
+ if (ret) {
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, new);
+ bch2_fs_fatal_error(c, "no replicas entry for %s", buf.buf);
+ printbuf_exit(&buf);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
/* returns blocknr in stripe that we matched: */
static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s,
struct bkey_s_c k, unsigned *block)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
unsigned i, nr_data = s->nr_blocks - s->nr_redundant;
bkey_for_each_ptr(ptrs, ptr)
@@ -791,28 +1091,22 @@ static void ec_stripe_delete_work(struct work_struct *work)
{
struct bch_fs *c =
container_of(work, struct bch_fs, ec_stripe_delete_work);
- struct btree_trans *trans = bch2_trans_get(c);
- int ret;
- u64 idx;
while (1) {
mutex_lock(&c->ec_stripes_heap_lock);
- idx = stripe_idx_to_delete(c);
+ u64 idx = stripe_idx_to_delete(c);
mutex_unlock(&c->ec_stripes_heap_lock);
if (!idx)
break;
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
- ec_stripe_delete(trans, idx));
- if (ret) {
- bch_err_fn(c, ret);
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ ec_stripe_delete(trans, idx));
+ bch_err_fn(c, ret);
+ if (ret)
break;
- }
}
- bch2_trans_put(trans);
-
bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete);
}
@@ -983,8 +1277,8 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b
while (1) {
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
ec_stripe_update_extent(trans, bucket_pos, bucket.gen,
s, &bp_pos));
if (ret)
@@ -1005,7 +1299,7 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s)
unsigned i, nr_data = v->nr_blocks - v->nr_redundant;
int ret = 0;
- ret = bch2_btree_write_buffer_flush(trans);
+ ret = bch2_btree_write_buffer_flush_sync(trans);
if (ret)
goto err;
@@ -1121,21 +1415,20 @@ static void ec_stripe_create(struct ec_stripe_new *s)
}
ret = bch2_trans_do(c, &s->res, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc,
ec_stripe_key_update(trans,
bkey_i_to_stripe(&s->new_stripe.key),
!s->have_existing_stripe));
+ bch_err_msg(c, ret, "creating stripe key");
if (ret) {
- bch_err(c, "error creating stripe: error creating stripe key");
goto err;
}
ret = ec_stripe_update_extents(c, &s->new_stripe);
- if (ret) {
- bch_err_msg(c, ret, "creating stripe: error updating pointers");
+ bch_err_msg(c, ret, "error updating extents");
+ if (ret)
goto err;
- }
err:
bch2_disk_reservation_put(c, &s->res);
@@ -1250,18 +1543,17 @@ static int unsigned_cmp(const void *_l, const void *_r)
static unsigned pick_blocksize(struct bch_fs *c,
struct bch_devs_mask *devs)
{
- struct bch_dev *ca;
- unsigned i, nr = 0, sizes[BCH_SB_MEMBERS_MAX];
+ unsigned nr = 0, sizes[BCH_SB_MEMBERS_MAX];
struct {
unsigned nr, size;
} cur = { 0, 0 }, best = { 0, 0 };
- for_each_member_device_rcu(ca, c, i, devs)
+ for_each_member_device_rcu(c, ca, devs)
sizes[nr++] = ca->mi.bucket_size;
sort(sizes, nr, sizeof(unsigned), unsigned_cmp, NULL);
- for (i = 0; i < nr; i++) {
+ for (unsigned i = 0; i < nr; i++) {
if (sizes[i] != cur.size) {
if (cur.nr > best.nr)
best = cur;
@@ -1344,8 +1636,6 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
enum bch_watermark watermark)
{
struct ec_stripe_head *h;
- struct bch_dev *ca;
- unsigned i;
h = kzalloc(sizeof(*h), GFP_KERNEL);
if (!h)
@@ -1362,13 +1652,13 @@ ec_new_stripe_head_alloc(struct bch_fs *c, unsigned target,
rcu_read_lock();
h->devs = target_rw_devs(c, BCH_DATA_user, target);
- for_each_member_device_rcu(ca, c, i, &h->devs)
+ for_each_member_device_rcu(c, ca, &h->devs)
if (!ca->mi.durability)
- __clear_bit(i, h->devs.d);
+ __clear_bit(ca->dev_idx, h->devs.d);
h->blocksize = pick_blocksize(c, &h->devs);
- for_each_member_device_rcu(ca, c, i, &h->devs)
+ for_each_member_device_rcu(c, ca, &h->devs)
if (ca->mi.bucket_size == h->blocksize)
h->nr_active_devs++;
@@ -1415,7 +1705,7 @@ __bch2_ec_stripe_head_get(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
- if (test_bit(BCH_FS_GOING_RO, &c->flags)) {
+ if (test_bit(BCH_FS_going_ro, &c->flags)) {
h = ERR_PTR(-BCH_ERR_erofs_no_writes);
goto found;
}
@@ -1833,44 +2123,32 @@ void bch2_fs_ec_flush(struct bch_fs *c)
int bch2_stripes_read(struct bch_fs *c)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- const struct bch_stripe *s;
- struct stripe *m;
- unsigned i;
- int ret;
-
- for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
- BTREE_ITER_PREFETCH, k, ret) {
- if (k.k->type != KEY_TYPE_stripe)
- continue;
-
- ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
- if (ret)
- break;
-
- s = bkey_s_c_to_stripe(k).v;
-
- m = genradix_ptr(&c->stripes, k.k->p.offset);
- m->sectors = le16_to_cpu(s->sectors);
- m->algorithm = s->algorithm;
- m->nr_blocks = s->nr_blocks;
- m->nr_redundant = s->nr_redundant;
- m->blocks_nonempty = 0;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN,
+ BTREE_ITER_PREFETCH, k, ({
+ if (k.k->type != KEY_TYPE_stripe)
+ continue;
- for (i = 0; i < s->nr_blocks; i++)
- m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+ ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL);
+ if (ret)
+ break;
- bch2_stripes_heap_insert(c, m, k.k->p.offset);
- }
- bch2_trans_iter_exit(trans, &iter);
+ const struct bch_stripe *s = bkey_s_c_to_stripe(k).v;
- bch2_trans_put(trans);
+ struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset);
+ m->sectors = le16_to_cpu(s->sectors);
+ m->algorithm = s->algorithm;
+ m->nr_blocks = s->nr_blocks;
+ m->nr_redundant = s->nr_redundant;
+ m->blocks_nonempty = 0;
- if (ret)
- bch_err_fn(c, ret);
+ for (unsigned i = 0; i < s->nr_blocks; i++)
+ m->blocks_nonempty += !!stripe_blockcount_get(s, i);
+ bch2_stripes_heap_insert(c, m, k.k->p.offset);
+ 0;
+ })));
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h
index 7d0237c9819f..f4369b02e805 100644
--- a/fs/bcachefs/ec.h
+++ b/fs/bcachefs/ec.h
@@ -12,13 +12,14 @@ int bch2_stripe_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_stripe_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
+int bch2_trigger_stripe(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_stripe ((struct bkey_ops) { \
.key_invalid = bch2_stripe_invalid, \
.val_to_text = bch2_stripe_to_text, \
.swab = bch2_ptr_swab, \
- .trans_trigger = bch2_trans_mark_stripe, \
- .atomic_trigger = bch2_mark_stripe, \
+ .trigger = bch2_trigger_stripe, \
.min_val_size = 8, \
})
diff --git a/fs/bcachefs/ec_format.h b/fs/bcachefs/ec_format.h
new file mode 100644
index 000000000000..44ce88ba08d7
--- /dev/null
+++ b/fs/bcachefs/ec_format.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EC_FORMAT_H
+#define _BCACHEFS_EC_FORMAT_H
+
+struct bch_stripe {
+ struct bch_val v;
+ __le16 sectors;
+ __u8 algorithm;
+ __u8 nr_blocks;
+ __u8 nr_redundant;
+
+ __u8 csum_granularity_bits;
+ __u8 csum_type;
+ __u8 pad;
+
+ struct bch_extent_ptr ptrs[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_EC_FORMAT_H */
diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h
index e2b02a82de32..976426da3a12 100644
--- a/fs/bcachefs/ec_types.h
+++ b/fs/bcachefs/ec_types.h
@@ -5,7 +5,7 @@
#include "bcachefs_format.h"
struct bch_replicas_padded {
- struct bch_replicas_entry e;
+ struct bch_replicas_entry_v1 e;
u8 pad[BCH_BKEY_PTRS_MAX];
};
diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h
index 9ce29681eec9..8c40c2067a04 100644
--- a/fs/bcachefs/errcode.h
+++ b/fs/bcachefs/errcode.h
@@ -73,7 +73,6 @@
x(ENOMEM, ENOMEM_fsck_add_nlink) \
x(ENOMEM, ENOMEM_journal_key_insert) \
x(ENOMEM, ENOMEM_journal_keys_sort) \
- x(ENOMEM, ENOMEM_journal_replay) \
x(ENOMEM, ENOMEM_read_superblock_clean) \
x(ENOMEM, ENOMEM_fs_alloc) \
x(ENOMEM, ENOMEM_fs_name_alloc) \
@@ -152,7 +151,6 @@
x(BCH_ERR_btree_insert_fail, btree_insert_need_mark_replicas) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_res) \
x(BCH_ERR_btree_insert_fail, btree_insert_need_journal_reclaim) \
- x(BCH_ERR_btree_insert_fail, btree_insert_need_flush_buffer) \
x(0, backpointer_to_overwritten_btree_node) \
x(0, lock_fail_root_changed) \
x(0, journal_reclaim_would_deadlock) \
@@ -172,10 +170,12 @@
x(EINVAL, device_size_too_small) \
x(EINVAL, device_not_a_member_of_filesystem) \
x(EINVAL, device_has_been_removed) \
+ x(EINVAL, device_splitbrain) \
x(EINVAL, device_already_online) \
x(EINVAL, insufficient_devices_to_start) \
x(EINVAL, invalid) \
x(EINVAL, internal_fsck_err) \
+ x(EINVAL, opt_parse_error) \
x(EROFS, erofs_trans_commit) \
x(EROFS, erofs_no_writes) \
x(EROFS, erofs_journal_err) \
@@ -224,6 +224,8 @@
x(BCH_ERR_invalid, invalid_bkey) \
x(BCH_ERR_operation_blocked, nocow_lock_blocked) \
x(EIO, btree_node_read_err) \
+ x(EIO, sb_not_downgraded) \
+ x(EIO, btree_write_all_failed) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \
x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \
@@ -235,6 +237,7 @@
x(BCH_ERR_nopromote, nopromote_unwritten) \
x(BCH_ERR_nopromote, nopromote_congested) \
x(BCH_ERR_nopromote, nopromote_in_flight) \
+ x(BCH_ERR_nopromote, nopromote_no_writes) \
x(BCH_ERR_nopromote, nopromote_enomem)
enum bch_errcode {
diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c
index 25cf78a7b946..d32c8bebe46c 100644
--- a/fs/bcachefs/error.c
+++ b/fs/bcachefs/error.c
@@ -2,12 +2,13 @@
#include "bcachefs.h"
#include "error.h"
#include "super.h"
+#include "thread_with_file.h"
#define FSCK_ERR_RATELIMIT_NR 10
bool bch2_inconsistent_error(struct bch_fs *c)
{
- set_bit(BCH_FS_ERROR, &c->flags);
+ set_bit(BCH_FS_error, &c->flags);
switch (c->opts.errors) {
case BCH_ON_ERROR_continue:
@@ -26,8 +27,8 @@ bool bch2_inconsistent_error(struct bch_fs *c)
void bch2_topology_error(struct bch_fs *c)
{
- set_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags);
- if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ set_bit(BCH_FS_topology_error, &c->flags);
+ if (!test_bit(BCH_FS_fsck_running, &c->flags))
bch2_inconsistent_error(c);
}
@@ -69,40 +70,66 @@ enum ask_yn {
YN_ALLYES,
};
+static enum ask_yn parse_yn_response(char *buf)
+{
+ buf = strim(buf);
+
+ if (strlen(buf) == 1)
+ switch (buf[0]) {
+ case 'n':
+ return YN_NO;
+ case 'y':
+ return YN_YES;
+ case 'N':
+ return YN_ALLNO;
+ case 'Y':
+ return YN_ALLYES;
+ }
+ return -1;
+}
+
#ifdef __KERNEL__
-#define bch2_fsck_ask_yn() YN_NO
+static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
+{
+ struct stdio_redirect *stdio = c->stdio;
+
+ if (c->stdio_filter && c->stdio_filter != current)
+ stdio = NULL;
+
+ if (!stdio)
+ return YN_NO;
+
+ char buf[100];
+ int ret;
+
+ do {
+ bch2_print(c, " (y,n, or Y,N for all errors of this type) ");
+
+ int r = bch2_stdio_redirect_readline(stdio, buf, sizeof(buf) - 1);
+ if (r < 0)
+ return YN_NO;
+ buf[r] = '\0';
+ } while ((ret = parse_yn_response(buf)) < 0);
+
+ return ret;
+}
#else
#include "tools-util.h"
-enum ask_yn bch2_fsck_ask_yn(void)
+static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c)
{
char *buf = NULL;
size_t buflen = 0;
- bool ret;
+ int ret;
- while (true) {
+ do {
fputs(" (y,n, or Y,N for all errors of this type) ", stdout);
fflush(stdout);
if (getline(&buf, &buflen, stdin) < 0)
die("error reading from standard input");
-
- strim(buf);
- if (strlen(buf) != 1)
- continue;
-
- switch (buf[0]) {
- case 'n':
- return YN_NO;
- case 'y':
- return YN_YES;
- case 'N':
- return YN_ALLNO;
- case 'Y':
- return YN_ALLYES;
- }
- }
+ } while ((ret = parse_yn_response(buf)) < 0);
free(buf);
return ret;
@@ -114,7 +141,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt)
{
struct fsck_err_state *s;
- if (test_bit(BCH_FS_FSCK_DONE, &c->flags))
+ if (!test_bit(BCH_FS_fsck_running, &c->flags))
return NULL;
list_for_each_entry(s, &c->fsck_error_msgs, list)
@@ -152,7 +179,8 @@ int bch2_fsck_err(struct bch_fs *c,
struct printbuf buf = PRINTBUF, *out = &buf;
int ret = -BCH_ERR_fsck_ignore;
- if (test_bit(err, c->sb.errors_silent))
+ if ((flags & FSCK_CAN_FIX) &&
+ test_bit(err, c->sb.errors_silent))
return -BCH_ERR_fsck_fix;
bch2_sb_error_count(c, err);
@@ -196,7 +224,7 @@ int bch2_fsck_err(struct bch_fs *c,
prt_printf(out, bch2_log_msg(c, ""));
#endif
- if (test_bit(BCH_FS_FSCK_DONE, &c->flags)) {
+ if (!test_bit(BCH_FS_fsck_running, &c->flags)) {
if (c->opts.errors != BCH_ON_ERROR_continue ||
!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) {
prt_str(out, ", shutting down");
@@ -221,10 +249,13 @@ int bch2_fsck_err(struct bch_fs *c,
int ask;
prt_str(out, ": fix?");
- bch2_print_string_as_lines(KERN_ERR, out->buf);
+ if (bch2_fs_stdio_redirect(c))
+ bch2_print(c, "%s", out->buf);
+ else
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
print = false;
- ask = bch2_fsck_ask_yn();
+ ask = bch2_fsck_ask_yn(c);
if (ask >= YN_ALLNO && s)
s->fix = ask == YN_ALLNO
@@ -253,10 +284,14 @@ int bch2_fsck_err(struct bch_fs *c,
!(flags & FSCK_CAN_IGNORE)))
ret = -BCH_ERR_fsck_errors_not_fixed;
- if (print)
- bch2_print_string_as_lines(KERN_ERR, out->buf);
+ if (print) {
+ if (bch2_fs_stdio_redirect(c))
+ bch2_print(c, "%s\n", out->buf);
+ else
+ bch2_print_string_as_lines(KERN_ERR, out->buf);
+ }
- if (!test_bit(BCH_FS_FSCK_DONE, &c->flags) &&
+ if (test_bit(BCH_FS_fsck_running, &c->flags) &&
(ret != -BCH_ERR_fsck_fix &&
ret != -BCH_ERR_fsck_ignore))
bch_err(c, "Unable to continue, halting");
@@ -274,10 +309,10 @@ int bch2_fsck_err(struct bch_fs *c,
bch2_inconsistent_error(c);
if (ret == -BCH_ERR_fsck_fix) {
- set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+ set_bit(BCH_FS_errors_fixed, &c->flags);
} else {
- set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
- set_bit(BCH_FS_ERROR, &c->flags);
+ set_bit(BCH_FS_errors_not_fixed, &c->flags);
+ set_bit(BCH_FS_error, &c->flags);
}
return ret;
diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c
index 21af6fb8cecf..b9033bb4f11c 100644
--- a/fs/bcachefs/extent_update.c
+++ b/fs/bcachefs/extent_update.c
@@ -100,7 +100,7 @@ static int count_iters_for_insert(struct btree_trans *trans,
return ret2 ?: ret;
}
-#define EXTENT_ITERS_MAX (BTREE_ITER_MAX / 3)
+#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3)
int bch2_extent_atomic_end(struct btree_trans *trans,
struct btree_iter *iter,
diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c
index 9d8afcb5979a..61395b113df9 100644
--- a/fs/bcachefs/extents.c
+++ b/fs/bcachefs/extents.c
@@ -8,6 +8,7 @@
#include "bcachefs.h"
#include "bkey_methods.h"
+#include "btree_cache.h"
#include "btree_gc.h"
#include "btree_io.h"
#include "btree_iter.h"
@@ -843,7 +844,6 @@ void bch2_bkey_drop_device_noerror(struct bkey_s k, unsigned dev)
const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned dev)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr)
if (ptr->dev == dev)
@@ -855,7 +855,6 @@ const struct bch_extent_ptr *bch2_bkey_has_device_c(struct bkey_s_c k, unsigned
bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr)
if (bch2_dev_in_target(c, ptr->dev, target) &&
@@ -1020,12 +1019,12 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c,
struct bch_extent_crc_unpacked crc =
bch2_extent_crc_unpack(k.k, entry_to_crc(entry));
- prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress %s",
+ prt_printf(out, "crc: c_size %u size %u offset %u nonce %u csum %s compress ",
crc.compressed_size,
crc.uncompressed_size,
crc.offset, crc.nonce,
- bch2_csum_types[crc.csum_type],
- bch2_compression_types[crc.compression_type]);
+ bch2_csum_types[crc.csum_type]);
+ bch2_prt_compression_type(out, crc.compression_type);
break;
}
case BCH_EXTENT_ENTRY_stripe_ptr: {
@@ -1065,7 +1064,6 @@ static int extent_ptr_invalid(struct bch_fs *c,
struct printbuf *err)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr2;
u64 bucket;
u32 bucket_offset;
struct bch_dev *ca;
@@ -1307,7 +1305,6 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, struct bkey_s_c k,
}
incompressible:
if (target && bch2_target_accepts_data(c, BCH_DATA_user, target)) {
- const struct bch_extent_ptr *ptr;
unsigned i = 0;
bkey_for_each_ptr(ptrs, ptr) {
@@ -1338,10 +1335,12 @@ bool bch2_bkey_needs_rebalance(struct bch_fs *c, struct bkey_s_c k)
}
int bch2_bkey_set_needs_rebalance(struct bch_fs *c, struct bkey_i *_k,
- unsigned target, unsigned compression)
+ struct bch_io_opts *opts)
{
struct bkey_s k = bkey_i_to_s(_k);
struct bch_extent_rebalance *r;
+ unsigned target = opts->background_target;
+ unsigned compression = background_compression(*opts);
bool needs_rebalance;
if (!bkey_extent_is_direct_data(k.k))
diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h
index a2ce8a3be13c..6bf839d69e84 100644
--- a/fs/bcachefs/extents.h
+++ b/fs/bcachefs/extents.h
@@ -300,7 +300,7 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k)
bkey_extent_entry_for_each_from(_p, _entry, _p.start)
#define __bkey_for_each_ptr(_start, _end, _ptr) \
- for ((_ptr) = (_start); \
+ for (typeof(_start) (_ptr) = (_start); \
((_ptr) = __bkey_ptr_next(_ptr, _end)); \
(_ptr)++)
@@ -415,8 +415,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
.key_invalid = bch2_btree_ptr_invalid, \
.val_to_text = bch2_btree_ptr_to_text, \
.swab = bch2_ptr_swab, \
- .trans_trigger = bch2_trans_mark_extent, \
- .atomic_trigger = bch2_mark_extent, \
+ .trigger = bch2_trigger_extent, \
})
#define bch2_bkey_ops_btree_ptr_v2 ((struct bkey_ops) { \
@@ -424,8 +423,7 @@ void bch2_btree_ptr_v2_compat(enum btree_id, unsigned, unsigned,
.val_to_text = bch2_btree_ptr_v2_to_text, \
.swab = bch2_ptr_swab, \
.compat = bch2_btree_ptr_v2_compat, \
- .trans_trigger = bch2_trans_mark_extent, \
- .atomic_trigger = bch2_mark_extent, \
+ .trigger = bch2_trigger_extent, \
.min_val_size = 40, \
})
@@ -439,8 +437,7 @@ bool bch2_extent_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.swab = bch2_ptr_swab, \
.key_normalize = bch2_extent_normalize, \
.key_merge = bch2_extent_merge, \
- .trans_trigger = bch2_trans_mark_extent, \
- .atomic_trigger = bch2_mark_extent, \
+ .trigger = bch2_trigger_extent, \
})
/* KEY_TYPE_reservation: */
@@ -454,8 +451,7 @@ bool bch2_reservation_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
.key_invalid = bch2_reservation_invalid, \
.val_to_text = bch2_reservation_to_text, \
.key_merge = bch2_reservation_merge, \
- .trans_trigger = bch2_trans_mark_reservation, \
- .atomic_trigger = bch2_mark_reservation, \
+ .trigger = bch2_trigger_reservation, \
.min_val_size = 8, \
})
@@ -547,7 +543,6 @@ static inline bool bkey_extent_is_allocation(const struct bkey *k)
static inline bool bkey_extent_is_unwritten(struct bkey_s_c k)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(ptrs, ptr)
if (ptr->unwritten)
@@ -565,10 +560,9 @@ static inline struct bch_devs_list bch2_bkey_devs(struct bkey_s_c k)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(p, ptr)
- ret.devs[ret.nr++] = ptr->dev;
+ ret.data[ret.nr++] = ptr->dev;
return ret;
}
@@ -577,11 +571,10 @@ static inline struct bch_devs_list bch2_bkey_dirty_devs(struct bkey_s_c k)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(p, ptr)
if (!ptr->cached)
- ret.devs[ret.nr++] = ptr->dev;
+ ret.data[ret.nr++] = ptr->dev;
return ret;
}
@@ -590,11 +583,10 @@ static inline struct bch_devs_list bch2_bkey_cached_devs(struct bkey_s_c k)
{
struct bch_devs_list ret = (struct bch_devs_list) { 0 };
struct bkey_ptrs_c p = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
bkey_for_each_ptr(p, ptr)
if (ptr->cached)
- ret.devs[ret.nr++] = ptr->dev;
+ ret.data[ret.nr++] = ptr->dev;
return ret;
}
@@ -716,7 +708,7 @@ unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *, struct bkey_s_c,
bool bch2_bkey_needs_rebalance(struct bch_fs *, struct bkey_s_c);
int bch2_bkey_set_needs_rebalance(struct bch_fs *, struct bkey_i *,
- unsigned, unsigned);
+ struct bch_io_opts *);
/* Generic extent code: */
diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h
new file mode 100644
index 000000000000..3bd2fdbb0817
--- /dev/null
+++ b/fs/bcachefs/extents_format.h
@@ -0,0 +1,295 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_EXTENTS_FORMAT_H
+#define _BCACHEFS_EXTENTS_FORMAT_H
+
+/*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that is currently
+ * live. The size field in struct bkey records the current (live) size of the
+ * extent, and is also used to mean "size of region on disk that we point to" in
+ * this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8 (except we're encoding a
+ * type, not a size), encoding the type in the position of the first set bit:
+ *
+ * bch_extent_crc32 - 0b1
+ * bch_extent_ptr - 0b10
+ * bch_extent_crc64 - 0b100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
+
+#define BCH_EXTENT_ENTRY_TYPES() \
+ x(ptr, 0) \
+ x(crc32, 1) \
+ x(crc64, 2) \
+ x(crc128, 3) \
+ x(stripe_ptr, 4) \
+ x(rebalance, 5)
+#define BCH_EXTENT_ENTRY_MAX 6
+
+enum bch_extent_entry_type {
+#define x(f, n) BCH_EXTENT_ENTRY_##f = n,
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+/* Compressed/uncompressed size are stored biased by 1: */
+struct bch_extent_crc32 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 type:2,
+ _compressed_size:7,
+ _uncompressed_size:7,
+ offset:7,
+ _unused:1,
+ csum_type:4,
+ compression_type:4;
+ __u32 csum;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u32 csum;
+ __u32 compression_type:4,
+ csum_type:4,
+ _unused:1,
+ offset:7,
+ _uncompressed_size:7,
+ _compressed_size:7,
+ type:2;
+#endif
+} __packed __aligned(8);
+
+#define CRC32_SIZE_MAX (1U << 7)
+#define CRC32_NONCE_MAX 0
+
+struct bch_extent_crc64 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:3,
+ _compressed_size:9,
+ _uncompressed_size:9,
+ offset:9,
+ nonce:10,
+ csum_type:4,
+ compression_type:4,
+ csum_hi:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 csum_hi:16,
+ compression_type:4,
+ csum_type:4,
+ nonce:10,
+ offset:9,
+ _uncompressed_size:9,
+ _compressed_size:9,
+ type:3;
+#endif
+ __u64 csum_lo;
+} __packed __aligned(8);
+
+#define CRC64_SIZE_MAX (1U << 9)
+#define CRC64_NONCE_MAX ((1U << 10) - 1)
+
+struct bch_extent_crc128 {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:4,
+ _compressed_size:13,
+ _uncompressed_size:13,
+ offset:13,
+ nonce:13,
+ csum_type:4,
+ compression_type:4;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 compression_type:4,
+ csum_type:4,
+ nonce:13,
+ offset:13,
+ _uncompressed_size:13,
+ _compressed_size:13,
+ type:4;
+#endif
+ struct bch_csum csum;
+} __packed __aligned(8);
+
+#define CRC128_SIZE_MAX (1U << 13)
+#define CRC128_NONCE_MAX ((1U << 13) - 1)
+
+/*
+ * @reservation - pointer hasn't been written to, just reserved
+ */
+struct bch_extent_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:1,
+ cached:1,
+ unused:1,
+ unwritten:1,
+ offset:44, /* 8 petabytes */
+ dev:8,
+ gen:8;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 gen:8,
+ dev:8,
+ offset:44,
+ unwritten:1,
+ unused:1,
+ cached:1,
+ type:1;
+#endif
+} __packed __aligned(8);
+
+struct bch_extent_stripe_ptr {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:5,
+ block:8,
+ redundancy:4,
+ idx:47;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 idx:47,
+ redundancy:4,
+ block:8,
+ type:5;
+#endif
+};
+
+struct bch_extent_rebalance {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 type:6,
+ unused:34,
+ compression:8, /* enum bch_compression_opt */
+ target:16;
+#elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 target:16,
+ compression:8,
+ unused:34,
+ type:6;
+#endif
+};
+
+union bch_extent_entry {
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ || __BITS_PER_LONG == 64
+ unsigned long type;
+#elif __BITS_PER_LONG == 32
+ struct {
+ unsigned long pad;
+ unsigned long type;
+ };
+#else
+#error edit for your odd byteorder.
+#endif
+
+#define x(f, n) struct bch_extent_##f f;
+ BCH_EXTENT_ENTRY_TYPES()
+#undef x
+};
+
+struct bch_btree_ptr {
+ struct bch_val v;
+
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+struct bch_btree_ptr_v2 {
+ struct bch_val v;
+
+ __u64 mem_ptr;
+ __le64 seq;
+ __le16 sectors_written;
+ __le16 flags;
+ struct bpos min_key;
+ __u64 _data[0];
+ struct bch_extent_ptr start[];
+} __packed __aligned(8);
+
+LE16_BITMASK(BTREE_PTR_RANGE_UPDATED, struct bch_btree_ptr_v2, flags, 0, 1);
+
+struct bch_extent {
+ struct bch_val v;
+
+ __u64 _data[0];
+ union bch_extent_entry start[];
+} __packed __aligned(8);
+
+/* Maximum size (in u64s) a single pointer could be: */
+#define BKEY_EXTENT_PTR_U64s_MAX\
+ ((sizeof(struct bch_extent_crc128) + \
+ sizeof(struct bch_extent_ptr)) / sizeof(__u64))
+
+/* Maximum possible size of an entire extent value: */
+#define BKEY_EXTENT_VAL_U64s_MAX \
+ (1 + BKEY_EXTENT_PTR_U64s_MAX * (BCH_REPLICAS_MAX + 1))
+
+/* * Maximum possible size of an entire extent, key + value: */
+#define BKEY_EXTENT_U64s_MAX (BKEY_U64s + BKEY_EXTENT_VAL_U64s_MAX)
+
+/* Btree pointers don't carry around checksums: */
+#define BKEY_BTREE_PTR_VAL_U64s_MAX \
+ ((sizeof(struct bch_btree_ptr_v2) + \
+ sizeof(struct bch_extent_ptr) * BCH_REPLICAS_MAX) / sizeof(__u64))
+#define BKEY_BTREE_PTR_U64s_MAX \
+ (BKEY_U64s + BKEY_BTREE_PTR_VAL_U64s_MAX)
+
+struct bch_reservation {
+ struct bch_val v;
+
+ __le32 generation;
+ __u8 nr_replicas;
+ __u8 pad[3];
+} __packed __aligned(8);
+
+struct bch_inline_data {
+ struct bch_val v;
+ u8 data[];
+};
+
+#endif /* _BCACHEFS_EXTENTS_FORMAT_H */
diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h
index 05429c9631cd..b04750dbf870 100644
--- a/fs/bcachefs/eytzinger.h
+++ b/fs/bcachefs/eytzinger.h
@@ -156,7 +156,7 @@ static inline unsigned inorder_to_eytzinger1(unsigned i, unsigned size)
}
#define eytzinger1_for_each(_i, _size) \
- for ((_i) = eytzinger1_first((_size)); \
+ for (unsigned (_i) = eytzinger1_first((_size)); \
(_i) != 0; \
(_i) = eytzinger1_next((_i), (_size)))
@@ -227,7 +227,7 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size)
}
#define eytzinger0_for_each(_i, _size) \
- for ((_i) = eytzinger0_first((_size)); \
+ for (unsigned (_i) = eytzinger0_first((_size)); \
(_i) != -1; \
(_i) = eytzinger0_next((_i), (_size)))
@@ -261,11 +261,11 @@ static inline ssize_t eytzinger0_find_le(void *base, size_t nr, size_t size,
#define eytzinger0_find(base, nr, size, _cmp, search) \
({ \
- void *_base = (base); \
- void *_search = (search); \
- size_t _nr = (nr); \
- size_t _size = (size); \
- size_t _i = 0; \
+ void *_base = (base); \
+ const void *_search = (search); \
+ size_t _nr = (nr); \
+ size_t _size = (size); \
+ size_t _i = 0; \
int _res; \
\
while (_i < _nr && \
diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/fs-common.c
index 4496cf91a4c1..1c1ea0f0c692 100644
--- a/fs/bcachefs/fs-common.c
+++ b/fs/bcachefs/fs-common.c
@@ -166,10 +166,8 @@ int bch2_create_trans(struct btree_trans *trans,
if (ret)
goto err;
- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- new_inode->bi_dir = dir_u->bi_inum;
- new_inode->bi_dir_offset = dir_offset;
- }
+ new_inode->bi_dir = dir_u->bi_inum;
+ new_inode->bi_dir_offset = dir_offset;
}
inode_iter.flags &= ~BTREE_ITER_ALL_SNAPSHOTS;
@@ -228,10 +226,8 @@ int bch2_link_trans(struct btree_trans *trans,
if (ret)
goto err;
- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- inode_u->bi_dir = dir.inum;
- inode_u->bi_dir_offset = dir_offset;
- }
+ inode_u->bi_dir = dir.inum;
+ inode_u->bi_dir_offset = dir_offset;
ret = bch2_inode_write(trans, &dir_iter, dir_u) ?:
bch2_inode_write(trans, &inode_iter, inode_u);
@@ -414,21 +410,19 @@ int bch2_rename_trans(struct btree_trans *trans,
goto err;
}
- if (c->sb.version >= bcachefs_metadata_version_inode_backpointers) {
- src_inode_u->bi_dir = dst_dir_u->bi_inum;
- src_inode_u->bi_dir_offset = dst_offset;
+ src_inode_u->bi_dir = dst_dir_u->bi_inum;
+ src_inode_u->bi_dir_offset = dst_offset;
- if (mode == BCH_RENAME_EXCHANGE) {
- dst_inode_u->bi_dir = src_dir_u->bi_inum;
- dst_inode_u->bi_dir_offset = src_offset;
- }
+ if (mode == BCH_RENAME_EXCHANGE) {
+ dst_inode_u->bi_dir = src_dir_u->bi_inum;
+ dst_inode_u->bi_dir_offset = src_offset;
+ }
- if (mode == BCH_RENAME_OVERWRITE &&
- dst_inode_u->bi_dir == dst_dir_u->bi_inum &&
- dst_inode_u->bi_dir_offset == src_offset) {
- dst_inode_u->bi_dir = 0;
- dst_inode_u->bi_dir_offset = 0;
- }
+ if (mode == BCH_RENAME_OVERWRITE &&
+ dst_inode_u->bi_dir == dst_dir_u->bi_inum &&
+ dst_inode_u->bi_dir_offset == src_offset) {
+ dst_inode_u->bi_dir = 0;
+ dst_inode_u->bi_dir_offset = 0;
}
if (mode == BCH_RENAME_OVERWRITE) {
diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c
index 52f0e7acda3d..27710cdd5710 100644
--- a/fs/bcachefs/fs-io-buffered.c
+++ b/fs/bcachefs/fs-io-buffered.c
@@ -52,26 +52,20 @@ struct readpages_iter {
static int readpages_iter_init(struct readpages_iter *iter,
struct readahead_control *ractl)
{
- struct folio **fi;
- int ret;
-
- memset(iter, 0, sizeof(*iter));
+ struct folio *folio;
- iter->mapping = ractl->mapping;
+ *iter = (struct readpages_iter) { ractl->mapping };
- ret = bch2_filemap_get_contig_folios_d(iter->mapping,
- ractl->_index << PAGE_SHIFT,
- (ractl->_index + ractl->_nr_pages) << PAGE_SHIFT,
- 0, mapping_gfp_mask(iter->mapping),
- &iter->folios);
- if (ret)
- return ret;
+ while ((folio = __readahead_folio(ractl))) {
+ if (!bch2_folio_create(folio, GFP_KERNEL) ||
+ darray_push(&iter->folios, folio)) {
+ bch2_folio_release(folio);
+ ractl->_nr_pages += folio_nr_pages(folio);
+ ractl->_index -= folio_nr_pages(folio);
+ return iter->folios.nr ? 0 : -ENOMEM;
+ }
- darray_for_each(iter->folios, fi) {
- ractl->_nr_pages -= 1U << folio_order(*fi);
- __bch2_folio_create(*fi, __GFP_NOFAIL|GFP_KERNEL);
- folio_put(*fi);
- folio_put(*fi);
+ folio_put(folio);
}
return 0;
@@ -273,12 +267,12 @@ void bch2_readahead(struct readahead_control *ractl)
struct btree_trans *trans = bch2_trans_get(c);
struct folio *folio;
struct readpages_iter readpages_iter;
- int ret;
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
- ret = readpages_iter_init(&readpages_iter, ractl);
- BUG_ON(ret);
+ int ret = readpages_iter_init(&readpages_iter, ractl);
+ if (ret)
+ return;
bch2_pagecache_add_get(inode);
@@ -309,18 +303,6 @@ void bch2_readahead(struct readahead_control *ractl)
darray_exit(&readpages_iter.folios);
}
-static void __bchfs_readfolio(struct bch_fs *c, struct bch_read_bio *rbio,
- subvol_inum inum, struct folio *folio)
-{
- bch2_folio_create(folio, __GFP_NOFAIL);
-
- rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
- rbio->bio.bi_iter.bi_sector = folio_sector(folio);
- BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
-
- bch2_trans_run(c, (bchfs_read(trans, rbio, inum, NULL), 0));
-}
-
static void bch2_read_single_folio_end_io(struct bio *bio)
{
complete(bio->bi_private);
@@ -335,6 +317,9 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
int ret;
DECLARE_COMPLETION_ONSTACK(done);
+ if (!bch2_folio_create(folio, GFP_KERNEL))
+ return -ENOMEM;
+
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read),
@@ -342,7 +327,11 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping)
rbio->bio.bi_private = &done;
rbio->bio.bi_end_io = bch2_read_single_folio_end_io;
- __bchfs_readfolio(c, rbio, inode_inum(inode), folio);
+ rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC;
+ rbio->bio.bi_iter.bi_sector = folio_sector(folio);
+ BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0));
+
+ bch2_trans_run(c, (bchfs_read(trans, rbio, inode_inum(inode), NULL), 0));
wait_for_completion(&done);
ret = blk_status_to_errno(rbio->bio.bi_status);
@@ -638,7 +627,7 @@ do_io:
/* Check for writing past i_size: */
WARN_ONCE((bio_end_sector(&w->io->op.wbio.bio) << 9) >
round_up(i_size, block_bytes(c)) &&
- !test_bit(BCH_FS_EMERGENCY_RO, &c->flags),
+ !test_bit(BCH_FS_emergency_ro, &c->flags),
"writing past i_size: %llu > %llu (unrounded %llu)\n",
bio_end_sector(&w->io->op.wbio.bio) << 9,
round_up(i_size, block_bytes(c)),
@@ -826,7 +815,7 @@ static int __bch2_buffered_write(struct bch_inode_info *inode,
struct bch_fs *c = inode->v.i_sb->s_fs_info;
struct bch2_folio_reservation res;
folios fs;
- struct folio **fi, *f;
+ struct folio *f;
unsigned copied = 0, f_offset, f_copied;
u64 end = pos + len, f_pos, f_len;
loff_t last_folio_pos = inode->v.i_size;
diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c
index 84e20c3ada6c..33cb6da3a5ad 100644
--- a/fs/bcachefs/fs-io-direct.c
+++ b/fs/bcachefs/fs-io-direct.c
@@ -77,7 +77,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
bch2_inode_opts_get(&opts, c, &inode->ei_inode);
- if ((offset|iter->count) & (block_bytes(c) - 1))
+ /* bios must be 512 byte aligned: */
+ if ((offset|iter->count) & (SECTOR_SIZE - 1))
return -EINVAL;
ret = min_t(loff_t, iter->count,
@@ -87,6 +88,8 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter)
return ret;
shorten = iov_iter_count(iter) - round_up(ret, block_bytes(c));
+ if (shorten >= iter->count)
+ shorten = 0;
iter->count -= shorten;
bio = bio_alloc_bioset(NULL,
diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c
index ff664fd0d8ef..d359aa9b33b8 100644
--- a/fs/bcachefs/fs-io-pagecache.c
+++ b/fs/bcachefs/fs-io-pagecache.c
@@ -309,39 +309,49 @@ void bch2_mark_pagecache_unallocated(struct bch_inode_info *inode,
}
}
-void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
- u64 start, u64 end)
+int bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
+ u64 *start, u64 end,
+ bool nonblocking)
{
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- pgoff_t index = start >> PAGE_SECTORS_SHIFT;
+ pgoff_t index = *start >> PAGE_SECTORS_SHIFT;
pgoff_t end_index = (end - 1) >> PAGE_SECTORS_SHIFT;
struct folio_batch fbatch;
s64 i_sectors_delta = 0;
- unsigned i, j;
+ int ret = 0;
- if (end <= start)
- return;
+ if (end <= *start)
+ return 0;
folio_batch_init(&fbatch);
while (filemap_get_folios(inode->v.i_mapping,
&index, end_index, &fbatch)) {
- for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ for (unsigned i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
+
+ if (!nonblocking)
+ folio_lock(folio);
+ else if (!folio_trylock(folio)) {
+ folio_batch_release(&fbatch);
+ ret = -EAGAIN;
+ break;
+ }
+
u64 folio_start = folio_sector(folio);
u64 folio_end = folio_end_sector(folio);
- unsigned folio_offset = max(start, folio_start) - folio_start;
- unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
- struct bch_folio *s;
BUG_ON(end <= folio_start);
- folio_lock(folio);
- s = bch2_folio(folio);
+ *start = min(end, folio_end);
+ struct bch_folio *s = bch2_folio(folio);
if (s) {
+ unsigned folio_offset = max(*start, folio_start) - folio_start;
+ unsigned folio_len = min(end, folio_end) - folio_offset - folio_start;
+
spin_lock(&s->lock);
- for (j = folio_offset; j < folio_offset + folio_len; j++) {
+ for (unsigned j = folio_offset; j < folio_offset + folio_len; j++) {
i_sectors_delta -= s->s[j].state == SECTOR_dirty;
bch2_folio_sector_set(folio, s, j,
folio_sector_reserve(s->s[j].state));
@@ -356,6 +366,7 @@ void bch2_mark_pagecache_reserved(struct bch_inode_info *inode,
}
bch2_i_sectors_acct(c, inode, NULL, i_sectors_delta);
+ return ret;
}
static inline unsigned sectors_to_reserve(struct bch_folio_sector *s,
diff --git a/fs/bcachefs/fs-io-pagecache.h b/fs/bcachefs/fs-io-pagecache.h
index 27f712ae37a6..8cbaba6565b4 100644
--- a/fs/bcachefs/fs-io-pagecache.h
+++ b/fs/bcachefs/fs-io-pagecache.h
@@ -143,7 +143,7 @@ int bch2_folio_set(struct bch_fs *, subvol_inum, struct folio **, unsigned);
void bch2_bio_page_state_set(struct bio *, struct bkey_s_c);
void bch2_mark_pagecache_unallocated(struct bch_inode_info *, u64, u64);
-void bch2_mark_pagecache_reserved(struct bch_inode_info *, u64, u64);
+int bch2_mark_pagecache_reserved(struct bch_inode_info *, u64 *, u64, bool);
int bch2_get_folio_disk_reservation(struct bch_fs *,
struct bch_inode_info *,
diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c
index b0e8144ec550..8c70123b6a0c 100644
--- a/fs/bcachefs/fs-io.c
+++ b/fs/bcachefs/fs-io.c
@@ -79,7 +79,7 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c,
continue;
bio = container_of(bio_alloc_bioset(ca->disk_sb.bdev, 0,
- REQ_OP_FLUSH,
+ REQ_OP_WRITE|REQ_PREFLUSH,
GFP_KERNEL,
&c->nocow_flush_bioset),
struct nocow_flush, bio);
@@ -192,13 +192,17 @@ int bch2_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret, ret2, ret3;
+ int ret;
ret = file_write_and_wait_range(file, start, end);
- ret2 = sync_inode_metadata(&inode->v, 1);
- ret3 = bch2_flush_inode(c, inode);
-
- return bch2_err_class(ret ?: ret2 ?: ret3);
+ if (ret)
+ goto out;
+ ret = sync_inode_metadata(&inode->v, 1);
+ if (ret)
+ goto out;
+ ret = bch2_flush_inode(c, inode);
+out:
+ return bch2_err_class(ret);
}
/* truncate: */
@@ -671,8 +675,11 @@ static int __bchfs_fallocate(struct bch_inode_info *inode, int mode,
bch2_i_sectors_acct(c, inode, &quota_res, i_sectors_delta);
- drop_locks_do(trans,
- (bch2_mark_pagecache_reserved(inode, hole_start, iter.pos.offset), 0));
+ if (bch2_mark_pagecache_reserved(inode, &hole_start,
+ iter.pos.offset, true))
+ drop_locks_do(trans,
+ bch2_mark_pagecache_reserved(inode, &hole_start,
+ iter.pos.offset, false));
bkey_err:
bch2_quota_reservation_put(c, inode, &quota_res);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -861,7 +868,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
abs(pos_src - pos_dst) < len)
return -EINVAL;
- bch2_lock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+ lock_two_nondirectories(&src->v, &dst->v);
+ bch2_lock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
inode_dio_wait(&src->v);
inode_dio_wait(&dst->v);
@@ -914,7 +922,8 @@ loff_t bch2_remap_file_range(struct file *file_src, loff_t pos_src,
ret = bch2_flush_inode(c, dst);
err:
bch2_quota_reservation_put(c, dst, &quota_res);
- bch2_unlock_inodes(INODE_LOCK|INODE_PAGECACHE_BLOCK, src, dst);
+ bch2_unlock_inodes(INODE_PAGECACHE_BLOCK, src, dst);
+ unlock_two_nondirectories(&src->v, &dst->v);
return bch2_err_class(ret);
}
diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c
index 14d5cc6f90d7..3dc8630ff9fe 100644
--- a/fs/bcachefs/fs-ioctl.c
+++ b/fs/bcachefs/fs-ioctl.c
@@ -285,34 +285,26 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg)
bch_notice(c, "shutdown by ioctl type %u", flags);
- down_write(&c->vfs_sb->s_umount);
-
switch (flags) {
case FSOP_GOING_FLAGS_DEFAULT:
- ret = freeze_bdev(c->vfs_sb->s_bdev);
+ ret = bdev_freeze(c->vfs_sb->s_bdev);
if (ret)
- goto err;
-
+ break;
bch2_journal_flush(&c->journal);
- c->vfs_sb->s_flags |= SB_RDONLY;
bch2_fs_emergency_read_only(c);
- thaw_bdev(c->vfs_sb->s_bdev);
+ bdev_thaw(c->vfs_sb->s_bdev);
break;
-
case FSOP_GOING_FLAGS_LOGFLUSH:
bch2_journal_flush(&c->journal);
fallthrough;
-
case FSOP_GOING_FLAGS_NOLOGFLUSH:
- c->vfs_sb->s_flags |= SB_RDONLY;
bch2_fs_emergency_read_only(c);
break;
default:
ret = -EINVAL;
break;
}
-err:
- up_write(&c->vfs_sb->s_umount);
+
return ret;
}
@@ -345,11 +337,12 @@ static long __bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
if (arg.flags & BCH_SUBVOL_SNAPSHOT_RO)
create_flags |= BCH_CREATE_SNAPSHOT_RO;
- /* why do we need this lock? */
- down_read(&c->vfs_sb->s_umount);
-
- if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE)
+ if (arg.flags & BCH_SUBVOL_SNAPSHOT_CREATE) {
+ /* sync_inodes_sb enforce s_umount is locked */
+ down_read(&c->vfs_sb->s_umount);
sync_inodes_sb(c->vfs_sb);
+ up_read(&c->vfs_sb->s_umount);
+ }
retry:
if (arg.src_ptr) {
error = user_path_at(arg.dirfd,
@@ -433,8 +426,6 @@ err2:
goto retry;
}
err1:
- up_read(&c->vfs_sb->s_umount);
-
return error;
}
@@ -451,33 +442,36 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp,
static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp,
struct bch_ioctl_subvolume arg)
{
+ const char __user *name = (void __user *)(unsigned long)arg.dst_ptr;
struct path path;
struct inode *dir;
+ struct dentry *victim;
int ret = 0;
if (arg.flags)
return -EINVAL;
- ret = user_path_at(arg.dirfd,
- (const char __user *)(unsigned long)arg.dst_ptr,
- LOOKUP_FOLLOW, &path);
- if (ret)
- return ret;
+ victim = user_path_locked_at(arg.dirfd, name, &path);
+ if (IS_ERR(victim))
+ return PTR_ERR(victim);
- if (path.dentry->d_sb->s_fs_info != c) {
+ dir = d_inode(path.dentry);
+ if (victim->d_sb->s_fs_info != c) {
ret = -EXDEV;
goto err;
}
-
- dir = path.dentry->d_parent->d_inode;
-
- ret = __bch2_unlink(dir, path.dentry, true);
- if (ret)
+ if (!d_is_positive(victim)) {
+ ret = -ENOENT;
goto err;
-
- fsnotify_rmdir(dir, path.dentry);
- d_delete(path.dentry);
+ }
+ ret = __bch2_unlink(dir, victim, true);
+ if (!ret) {
+ fsnotify_rmdir(dir, victim);
+ d_delete(victim);
+ }
err:
+ inode_unlock(dir);
+ dput(victim);
path_put(&path);
return ret;
}
diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c
index 49da8db1d9e9..77ae65542db9 100644
--- a/fs/bcachefs/fs.c
+++ b/fs/bcachefs/fs.c
@@ -93,7 +93,7 @@ retry:
BTREE_ITER_INTENT) ?:
(set ? set(trans, inode, &inode_u, p) : 0) ?:
bch2_inode_write(trans, &iter, &inode_u) ?:
- bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
/*
* the btree node lock protects inode->ei_inode, not ei_update_lock;
@@ -435,7 +435,7 @@ static int bch2_link(struct dentry *old_dentry, struct inode *vdir,
bch2_subvol_is_ro(c, inode->ei_subvol) ?:
__bch2_link(c, inode, dir, dentry);
if (unlikely(ret))
- return ret;
+ return bch2_err_class(ret);
ihold(&inode->v);
d_instantiate(dentry, &inode->v);
@@ -455,7 +455,7 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry,
bch2_lock_inodes(INODE_UPDATE_LOCK, dir, inode);
ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_enospc,
bch2_unlink_trans(trans,
inode_inum(dir), &dir_u,
&inode_u, &dentry->d_name,
@@ -487,8 +487,9 @@ static int bch2_unlink(struct inode *vdir, struct dentry *dentry)
struct bch_inode_info *dir= to_bch_ei(vdir);
struct bch_fs *c = dir->v.i_sb->s_fs_info;
- return bch2_subvol_is_ro(c, dir->ei_subvol) ?:
+ int ret = bch2_subvol_is_ro(c, dir->ei_subvol) ?:
__bch2_unlink(vdir, dentry, false);
+ return bch2_err_class(ret);
}
static int bch2_symlink(struct mnt_idmap *idmap,
@@ -523,7 +524,7 @@ static int bch2_symlink(struct mnt_idmap *idmap,
return 0;
err:
iput(&inode->v);
- return ret;
+ return bch2_err_class(ret);
}
static int bch2_mkdir(struct mnt_idmap *idmap,
@@ -641,7 +642,7 @@ err:
src_inode,
dst_inode);
- return ret;
+ return bch2_err_class(ret);
}
static void bch2_setattr_copy(struct mnt_idmap *idmap,
@@ -729,7 +730,7 @@ retry:
ret = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
btree_err:
bch2_trans_iter_exit(trans, &inode_iter);
@@ -1012,15 +1013,13 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx)
{
struct bch_inode_info *inode = file_bch_inode(file);
struct bch_fs *c = inode->v.i_sb->s_fs_info;
- int ret;
if (!dir_emit_dots(file, ctx))
return 0;
- ret = bch2_readdir(c, inode_inum(inode), ctx);
- if (ret)
- bch_err_fn(c, ret);
+ int ret = bch2_readdir(c, inode_inum(inode), ctx);
+ bch_err_fn(c, ret);
return bch2_err_class(ret);
}
@@ -1131,7 +1130,7 @@ static const struct address_space_operations bch_address_space_operations = {
#ifdef CONFIG_MIGRATION
.migrate_folio = filemap_migrate_folio,
#endif
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
struct bcachefs_fid {
@@ -1500,7 +1499,7 @@ static void bch2_evict_inode(struct inode *vinode)
void bch2_evict_subvolume_inodes(struct bch_fs *c, snapshot_id_list *s)
{
- struct bch_inode_info *inode, **i;
+ struct bch_inode_info *inode;
DARRAY(struct bch_inode_info *) grabbed;
bool clean_pass = false, this_pass_clean;
@@ -1626,43 +1625,18 @@ static struct bch_fs *bch2_path_to_fs(const char *path)
return c ?: ERR_PTR(-ENOENT);
}
-static char **split_devs(const char *_dev_name, unsigned *nr)
-{
- char *dev_name = NULL, **devs = NULL, *s;
- size_t i = 0, nr_devs = 0;
-
- dev_name = kstrdup(_dev_name, GFP_KERNEL);
- if (!dev_name)
- return NULL;
-
- for (s = dev_name; s; s = strchr(s + 1, ':'))
- nr_devs++;
-
- devs = kcalloc(nr_devs + 1, sizeof(const char *), GFP_KERNEL);
- if (!devs) {
- kfree(dev_name);
- return NULL;
- }
-
- while ((s = strsep(&dev_name, ":")))
- devs[i++] = s;
-
- *nr = nr_devs;
- return devs;
-}
-
static int bch2_remount(struct super_block *sb, int *flags, char *data)
{
struct bch_fs *c = sb->s_fs_info;
struct bch_opts opts = bch2_opts_empty();
int ret;
- opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
-
ret = bch2_parse_mount_opts(c, &opts, data);
if (ret)
goto err;
+ opt_set(opts, read_only, (*flags & SB_RDONLY) != 0);
+
if (opts.read_only != c->opts.read_only) {
down_write(&c->state_lock);
@@ -1696,11 +1670,9 @@ err:
static int bch2_show_devname(struct seq_file *seq, struct dentry *root)
{
struct bch_fs *c = root->d_sb->s_fs_info;
- struct bch_dev *ca;
- unsigned i;
bool first = true;
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (!first)
seq_putc(seq, ':');
first = false;
@@ -1770,7 +1742,7 @@ static int bch2_unfreeze(struct super_block *sb)
struct bch_fs *c = sb->s_fs_info;
int ret;
- if (test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
+ if (test_bit(BCH_FS_emergency_ro, &c->flags))
return 0;
down_write(&c->state_lock);
@@ -1805,17 +1777,18 @@ static int bch2_noset_super(struct super_block *s, void *data)
return -EBUSY;
}
+typedef DARRAY(struct bch_fs *) darray_fs;
+
static int bch2_test_super(struct super_block *s, void *data)
{
struct bch_fs *c = s->s_fs_info;
- struct bch_fs **devs = data;
- unsigned i;
+ darray_fs *d = data;
if (!c)
return false;
- for (i = 0; devs[i]; i++)
- if (c != devs[i])
+ darray_for_each(*d, i)
+ if (c != *i)
return false;
return true;
}
@@ -1824,13 +1797,9 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data)
{
struct bch_fs *c;
- struct bch_dev *ca;
struct super_block *sb;
struct inode *vinode;
struct bch_opts opts = bch2_opts_empty();
- char **devs;
- struct bch_fs **devs_to_fs = NULL;
- unsigned i, nr_devs;
int ret;
opt_set(opts, read_only, (flags & SB_RDONLY) != 0);
@@ -1842,25 +1811,25 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
if (!dev_name || strlen(dev_name) == 0)
return ERR_PTR(-EINVAL);
- devs = split_devs(dev_name, &nr_devs);
- if (!devs)
- return ERR_PTR(-ENOMEM);
+ darray_str devs;
+ ret = bch2_split_devs(dev_name, &devs);
+ if (ret)
+ return ERR_PTR(ret);
- devs_to_fs = kcalloc(nr_devs + 1, sizeof(void *), GFP_KERNEL);
- if (!devs_to_fs) {
- sb = ERR_PTR(-ENOMEM);
- goto got_sb;
+ darray_fs devs_to_fs = {};
+ darray_for_each(devs, i) {
+ ret = darray_push(&devs_to_fs, bch2_path_to_fs(*i));
+ if (ret) {
+ sb = ERR_PTR(ret);
+ goto got_sb;
+ }
}
- for (i = 0; i < nr_devs; i++)
- devs_to_fs[i] = bch2_path_to_fs(devs[i]);
-
- sb = sget(fs_type, bch2_test_super, bch2_noset_super,
- flags|SB_NOSEC, devs_to_fs);
+ sb = sget(fs_type, bch2_test_super, bch2_noset_super, flags|SB_NOSEC, &devs_to_fs);
if (!IS_ERR(sb))
goto got_sb;
- c = bch2_fs_open(devs, nr_devs, opts);
+ c = bch2_fs_open(devs.data, devs.nr, opts);
if (IS_ERR(c)) {
sb = ERR_CAST(c);
goto got_sb;
@@ -1880,9 +1849,8 @@ static struct dentry *bch2_mount(struct file_system_type *fs_type,
if (IS_ERR(sb))
bch2_fs_stop(c);
got_sb:
- kfree(devs_to_fs);
- kfree(devs[0]);
- kfree(devs);
+ darray_exit(&devs_to_fs);
+ bch2_darray_str_exit(&devs);
if (IS_ERR(sb)) {
ret = PTR_ERR(sb);
@@ -1923,7 +1891,7 @@ got_sb:
sb->s_bdi->ra_pages = VM_READAHEAD_PAGES;
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
struct block_device *bdev = ca->disk_sb.bdev;
/* XXX: create an anonymous device for multi device filesystems */
@@ -1944,10 +1912,9 @@ got_sb:
vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM);
ret = PTR_ERR_OR_ZERO(vinode);
- if (ret) {
- bch_err_msg(c, ret, "mounting: error getting root inode");
+ bch_err_msg(c, ret, "mounting: error getting root inode");
+ if (ret)
goto err_put_super;
- }
sb->s_root = d_make_root(vinode);
if (!sb->s_root) {
diff --git a/fs/bcachefs/fs.h b/fs/bcachefs/fs.h
index 5edf1d4b9e6b..c3af7225ff69 100644
--- a/fs/bcachefs/fs.h
+++ b/fs/bcachefs/fs.h
@@ -77,9 +77,8 @@ static inline int ptrcmp(void *l, void *r)
}
enum bch_inode_lock_op {
- INODE_LOCK = (1U << 0),
- INODE_PAGECACHE_BLOCK = (1U << 1),
- INODE_UPDATE_LOCK = (1U << 2),
+ INODE_PAGECACHE_BLOCK = (1U << 0),
+ INODE_UPDATE_LOCK = (1U << 1),
};
#define bch2_lock_inodes(_locks, ...) \
@@ -91,8 +90,6 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_LOCK) \
- down_write_nested(&a[i]->v.i_rwsem, i); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_get(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
@@ -109,8 +106,6 @@ do { \
\
for (i = 1; i < ARRAY_SIZE(a); i++) \
if (a[i] != a[i - 1]) { \
- if ((_locks) & INODE_LOCK) \
- up_write(&a[i]->v.i_rwsem); \
if ((_locks) & INODE_PAGECACHE_BLOCK) \
bch2_pagecache_block_put(a[i]);\
if ((_locks) & INODE_UPDATE_LOCK) \
diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c
index e0c5cd119acc..6a760777bafb 100644
--- a/fs/bcachefs/fsck.c
+++ b/fs/bcachefs/fsck.c
@@ -20,8 +20,6 @@
#include <linux/bsearch.h>
#include <linux/dcache.h> /* struct qstr */
-#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
-
/*
* XXX: this is handling transaction restarts without returning
* -BCH_ERR_transaction_restart_nested, this is not how we do things anymore:
@@ -29,19 +27,16 @@
static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
u32 snapshot)
{
- struct btree_iter iter;
- struct bkey_s_c k;
u64 sectors = 0;
- int ret;
- for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
SPOS(inum, 0, snapshot),
POS(inum, U64_MAX),
- 0, k, ret)
+ 0, k, ({
if (bkey_extent_is_allocation(k.k))
sectors += k.k->size;
-
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
return ret ?: sectors;
}
@@ -49,45 +44,23 @@ static s64 bch2_count_inode_sectors(struct btree_trans *trans, u64 inum,
static s64 bch2_count_subdirs(struct btree_trans *trans, u64 inum,
u32 snapshot)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
u64 subdirs = 0;
- int ret;
-
- for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
- SPOS(inum, 0, snapshot),
- POS(inum, U64_MAX),
- 0, k, ret) {
- if (k.k->type != KEY_TYPE_dirent)
- continue;
- d = bkey_s_c_to_dirent(k);
- if (d.v->d_type == DT_DIR)
+ int ret = for_each_btree_key_upto(trans, iter, BTREE_ID_dirents,
+ SPOS(inum, 0, snapshot),
+ POS(inum, U64_MAX),
+ 0, k, ({
+ if (k.k->type == KEY_TYPE_dirent &&
+ bkey_s_c_to_dirent(k).v->d_type == DT_DIR)
subdirs++;
- }
- bch2_trans_iter_exit(trans, &iter);
+ 0;
+ }));
return ret ?: subdirs;
}
-static int __snapshot_lookup_subvol(struct btree_trans *trans, u32 snapshot,
- u32 *subvol)
-{
- struct bch_snapshot s;
- int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots,
- POS(0, snapshot), 0,
- snapshot, &s);
- if (!ret)
- *subvol = le32_to_cpu(s.subvol);
- else if (bch2_err_matches(ret, ENOENT))
- bch_err(trans->c, "snapshot %u not found", snapshot);
- return ret;
-
-}
-
-static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
- u32 *snapshot, u64 *inum)
+static int subvol_lookup(struct btree_trans *trans, u32 subvol,
+ u32 *snapshot, u64 *inum)
{
struct bch_subvolume s;
int ret;
@@ -99,12 +72,6 @@ static int __subvol_lookup(struct btree_trans *trans, u32 subvol,
return ret;
}
-static int subvol_lookup(struct btree_trans *trans, u32 subvol,
- u32 *snapshot, u64 *inum)
-{
- return lockrestart_do(trans, __subvol_lookup(trans, subvol, snapshot, inum));
-}
-
static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr,
struct bch_inode_unpacked *inode)
{
@@ -132,7 +99,7 @@ err:
return ret;
}
-static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
+static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
struct bch_inode_unpacked *inode,
u32 *snapshot)
{
@@ -152,29 +119,19 @@ static int __lookup_inode(struct btree_trans *trans, u64 inode_nr,
if (!ret)
*snapshot = iter.pos.snapshot;
err:
- bch_err_msg(trans->c, ret, "fetching inode %llu:%u", inode_nr, *snapshot);
bch2_trans_iter_exit(trans, &iter);
return ret;
}
-static int lookup_inode(struct btree_trans *trans, u64 inode_nr,
- struct bch_inode_unpacked *inode,
- u32 *snapshot)
-{
- return lockrestart_do(trans, __lookup_inode(trans, inode_nr, inode, snapshot));
-}
-
-static int __lookup_dirent(struct btree_trans *trans,
+static int lookup_dirent_in_snapshot(struct btree_trans *trans,
struct bch_hash_info hash_info,
subvol_inum dir, struct qstr *name,
- u64 *target, unsigned *type)
+ u64 *target, unsigned *type, u32 snapshot)
{
struct btree_iter iter;
struct bkey_s_c_dirent d;
- int ret;
-
- ret = bch2_hash_lookup(trans, &iter, bch2_dirent_hash_desc,
- &hash_info, dir, name, 0);
+ int ret = bch2_hash_lookup_in_snapshot(trans, &iter, bch2_dirent_hash_desc,
+ &hash_info, dir, name, 0, snapshot);
if (ret)
return ret;
@@ -207,12 +164,9 @@ static int fsck_write_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 snapshot)
{
- int ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- __write_inode(trans, inode, snapshot));
- if (ret)
- bch_err_fn(trans->c, ret);
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ __write_inode(trans, inode, snapshot));
+ bch_err_fn(trans->c, ret);
return ret;
}
@@ -242,35 +196,44 @@ err:
}
/* Get lost+found, create if it doesn't exist: */
-static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
+static int lookup_lostfound(struct btree_trans *trans, u32 snapshot,
struct bch_inode_unpacked *lostfound)
{
struct bch_fs *c = trans->c;
- struct bch_inode_unpacked root;
- struct bch_hash_info root_hash_info;
struct qstr lostfound_str = QSTR("lost+found");
- subvol_inum root_inum = { .subvol = subvol };
u64 inum = 0;
unsigned d_type = 0;
- u32 snapshot;
int ret;
- ret = __subvol_lookup(trans, subvol, &snapshot, &root_inum.inum);
+ struct bch_snapshot_tree st;
+ ret = bch2_snapshot_tree_lookup(trans,
+ bch2_snapshot_tree(c, snapshot), &st);
+ if (ret)
+ return ret;
+
+ subvol_inum root_inum = { .subvol = le32_to_cpu(st.master_subvol) };
+ u32 subvol_snapshot;
+
+ ret = subvol_lookup(trans, le32_to_cpu(st.master_subvol),
+ &subvol_snapshot, &root_inum.inum);
+ bch_err_msg(c, ret, "looking up root subvol");
if (ret)
return ret;
- ret = __lookup_inode(trans, root_inum.inum, &root, &snapshot);
+ struct bch_inode_unpacked root_inode;
+ struct bch_hash_info root_hash_info;
+ u32 root_inode_snapshot = snapshot;
+ ret = lookup_inode(trans, root_inum.inum, &root_inode, &root_inode_snapshot);
+ bch_err_msg(c, ret, "looking up root inode");
if (ret)
return ret;
- root_hash_info = bch2_hash_info_init(c, &root);
+ root_hash_info = bch2_hash_info_init(c, &root_inode);
- ret = __lookup_dirent(trans, root_hash_info, root_inum,
- &lostfound_str, &inum, &d_type);
- if (bch2_err_matches(ret, ENOENT)) {
- bch_notice(c, "creating lost+found");
+ ret = lookup_dirent_in_snapshot(trans, root_hash_info, root_inum,
+ &lostfound_str, &inum, &d_type, snapshot);
+ if (bch2_err_matches(ret, ENOENT))
goto create_lostfound;
- }
bch_err_fn(c, ret);
if (ret)
@@ -285,20 +248,53 @@ static int lookup_lostfound(struct btree_trans *trans, u32 subvol,
* The bch2_check_dirents pass has already run, dangling dirents
* shouldn't exist here:
*/
- return __lookup_inode(trans, inum, lostfound, &snapshot);
+ ret = lookup_inode(trans, inum, lostfound, &snapshot);
+ bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)",
+ inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot));
+ return ret;
create_lostfound:
+ /*
+ * XXX: we could have a nicer log message here if we had a nice way to
+ * walk backpointers to print a path
+ */
+ bch_notice(c, "creating lost+found in snapshot %u", le32_to_cpu(st.root_snapshot));
+
+ u64 now = bch2_current_time(c);
+ struct btree_iter lostfound_iter = { NULL };
+ u64 cpu = raw_smp_processor_id();
+
bch2_inode_init_early(c, lostfound);
+ bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode);
+ lostfound->bi_dir = root_inode.bi_inum;
+
+ root_inode.bi_nlink++;
+
+ ret = bch2_inode_create(trans, &lostfound_iter, lostfound, snapshot, cpu);
+ if (ret)
+ goto err;
- ret = bch2_create_trans(trans, root_inum, &root,
- lostfound, &lostfound_str,
- 0, 0, S_IFDIR|0700, 0, NULL, NULL,
- (subvol_inum) { }, 0);
+ bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot);
+ ret = bch2_btree_iter_traverse(&lostfound_iter);
+ if (ret)
+ goto err;
+
+ ret = bch2_dirent_create_snapshot(trans,
+ root_inode.bi_inum, snapshot, &root_hash_info,
+ mode_to_type(lostfound->bi_mode),
+ &lostfound_str,
+ lostfound->bi_inum,
+ &lostfound->bi_dir_offset,
+ BCH_HASH_SET_MUST_CREATE) ?:
+ bch2_inode_write_flags(trans, &lostfound_iter, lostfound,
+ BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
+err:
bch_err_msg(c, ret, "creating lost+found");
+ bch2_trans_iter_exit(trans, &lostfound_iter);
return ret;
}
-static int __reattach_inode(struct btree_trans *trans,
+static int reattach_inode(struct btree_trans *trans,
struct bch_inode_unpacked *inode,
u32 inode_snapshot)
{
@@ -307,14 +303,9 @@ static int __reattach_inode(struct btree_trans *trans,
char name_buf[20];
struct qstr name;
u64 dir_offset = 0;
- u32 subvol;
int ret;
- ret = __snapshot_lookup_subvol(trans, inode_snapshot, &subvol);
- if (ret)
- return ret;
-
- ret = lookup_lostfound(trans, subvol, &lostfound);
+ ret = lookup_lostfound(trans, inode_snapshot, &lostfound);
if (ret)
return ret;
@@ -331,15 +322,12 @@ static int __reattach_inode(struct btree_trans *trans,
snprintf(name_buf, sizeof(name_buf), "%llu", inode->bi_inum);
name = (struct qstr) QSTR(name_buf);
- ret = bch2_dirent_create(trans,
- (subvol_inum) {
- .subvol = subvol,
- .inum = lostfound.bi_inum,
- },
- &dir_hash,
- inode_d_type(inode),
- &name, inode->bi_inum, &dir_offset,
- BCH_HASH_SET_MUST_CREATE);
+ ret = bch2_dirent_create_snapshot(trans,
+ lostfound.bi_inum, inode_snapshot,
+ &dir_hash,
+ inode_d_type(inode),
+ &name, inode->bi_inum, &dir_offset,
+ BCH_HASH_SET_MUST_CREATE);
if (ret)
return ret;
@@ -349,18 +337,6 @@ static int __reattach_inode(struct btree_trans *trans,
return __write_inode(trans, inode, inode_snapshot);
}
-static int reattach_inode(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 inode_snapshot)
-{
- int ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL,
- __reattach_inode(trans, inode, inode_snapshot));
- bch_err_msg(trans->c, ret, "reattaching inode %llu", inode->bi_inum);
- return ret;
-}
-
static int remove_backpointer(struct btree_trans *trans,
struct bch_inode_unpacked *inode)
{
@@ -405,7 +381,7 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s
};
int ret = 0;
- darray_for_each(s->ids, i) {
+ __darray_for_each(s->ids, i) {
if (i->id == id)
return 0;
if (i->id > id)
@@ -422,7 +398,7 @@ static int snapshots_seen_add_inorder(struct bch_fs *c, struct snapshots_seen *s
static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
enum btree_id btree_id, struct bpos pos)
{
- struct snapshots_seen_entry *i, n = {
+ struct snapshots_seen_entry n = {
.id = pos.snapshot,
.equiv = bch2_snapshot_equiv(c, pos.snapshot),
};
@@ -448,7 +424,7 @@ static int snapshots_seen_update(struct bch_fs *c, struct snapshots_seen *s,
bch2_btree_id_str(btree_id),
pos.inode, pos.offset,
i->id, n.id, n.equiv);
- set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_delete_dead_snapshots);
}
}
@@ -593,14 +569,13 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- u32 restart_count = trans->restart_count;
int ret;
w->recalculate_sums = false;
w->inodes.nr = 0;
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, inum),
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
if (k.k->p.offset != inum)
break;
@@ -613,8 +588,7 @@ static int get_inodes_all_snapshots(struct btree_trans *trans,
return ret;
w->first_this_inode = true;
-
- return trans_was_restarted(trans, restart_count);
+ return 0;
}
static struct inode_walker_entry *
@@ -625,7 +599,7 @@ lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w,
snapshot = bch2_snapshot_equiv(c, snapshot);
- darray_for_each(w->inodes, i)
+ __darray_for_each(w->inodes, i)
if (bch2_snapshot_is_ancestor(c, snapshot, i->snapshot))
goto found;
@@ -667,11 +641,8 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
} else if (bkey_cmp(w->last_pos, pos)) {
- struct inode_walker_entry *i;
-
darray_for_each(w->inodes, i)
i->seen_this_pos = false;
-
}
w->last_pos = pos;
@@ -756,9 +727,7 @@ static int hash_redo_key(struct btree_trans *trans,
k.k->p.snapshot, tmp,
BCH_HASH_SET_MUST_CREATE,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
- bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW);
+ bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
static int hash_check_key(struct btree_trans *trans,
@@ -826,6 +795,18 @@ fsck_err:
goto out;
}
+static int check_inode_deleted_list(struct btree_trans *trans, struct bpos p)
+{
+ struct btree_iter iter;
+ struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_deleted_inodes, p, 0);
+ int ret = bkey_err(k);
+ if (ret)
+ return ret;
+
+ bch2_trans_iter_exit(trans, &iter);
+ return k.k->type == KEY_TYPE_set;
+}
+
static int check_inode(struct btree_trans *trans,
struct btree_iter *iter,
struct bkey_s_c k,
@@ -867,7 +848,7 @@ static int check_inode(struct btree_trans *trans,
c, inode_snapshot_mismatch,
"inodes in different snapshots don't match")) {
bch_err(c, "repair not implemented yet");
- return -EINVAL;
+ return -BCH_ERR_fsck_repair_unimplemented;
}
if ((u.bi_flags & (BCH_INODE_i_size_dirty|BCH_INODE_unlinked)) &&
@@ -890,14 +871,22 @@ static int check_inode(struct btree_trans *trans,
return 0;
}
+ if (u.bi_flags & BCH_INODE_unlinked) {
+ ret = check_inode_deleted_list(trans, k.k->p);
+ if (ret < 0)
+ return ret;
+
+ fsck_err_on(ret, c, unlinked_inode_not_on_deleted_list,
+ "inode %llu:%u unlinked, but not on deleted list",
+ u.bi_inum, k.k->p.snapshot);
+ ret = 0;
+ }
+
if (u.bi_flags & BCH_INODE_unlinked &&
(!c->sb.clean ||
fsck_err(c, inode_unlinked_but_clean,
"filesystem marked clean, but inode %llu unlinked",
u.bi_inum))) {
- bch2_trans_unlock(trans);
- bch2_fs_lazy_rw(c);
-
ret = bch2_inode_rm_snapshot(trans, u.bi_inum, iter->pos.snapshot);
bch_err_msg(c, ret, "in fsck deleting inode");
return ret;
@@ -910,9 +899,6 @@ static int check_inode(struct btree_trans *trans,
u.bi_inum))) {
bch_verbose(c, "truncating inode %llu", u.bi_inum);
- bch2_trans_unlock(trans);
- bch2_fs_lazy_rw(c);
-
/*
* XXX: need to truncate partial blocks too here - or ideally
* just switch units to bytes and that issue goes away
@@ -976,27 +962,22 @@ fsck_err:
return ret;
}
-noinline_for_stack
int bch2_check_inodes(struct bch_fs *c)
{
bool full = c->opts.fsck;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
struct bch_inode_unpacked prev = { 0 };
struct snapshots_seen s;
- struct bkey_s_c k;
- int ret;
snapshots_seen_init(&s);
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
- POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_inode(trans, &iter, k, &prev, &s, full));
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
+ POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_inode(trans, &iter, k, &prev, &s, full)));
snapshots_seen_exit(&s);
- bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
}
@@ -1023,29 +1004,9 @@ static bool dirent_points_to_inode(struct bkey_s_c_dirent d,
: le64_to_cpu(d.v->d_inum) == inode->bi_inum;
}
-static int inode_backpointer_exists(struct btree_trans *trans,
- struct bch_inode_unpacked *inode,
- u32 snapshot)
-{
- struct btree_iter iter;
- struct bkey_s_c_dirent d;
- int ret;
-
- d = dirent_get_by_pos(trans, &iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset, snapshot));
- ret = bkey_err(d);
- if (ret)
- return bch2_err_matches(ret, ENOENT) ? 0 : ret;
-
- ret = dirent_points_to_inode(d, inode);
- bch2_trans_iter_exit(trans, &iter);
- return ret;
-}
-
static int check_i_sectors(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1094,11 +1055,8 @@ struct extent_ends {
static void extent_ends_reset(struct extent_ends *extent_ends)
{
- struct extent_end *i;
-
darray_for_each(extent_ends->e, i)
snapshots_seen_exit(&i->seen);
-
extent_ends->e.nr = 0;
}
@@ -1130,7 +1088,7 @@ static int extent_ends_at(struct bch_fs *c,
if (!n.seen.ids.data)
return -BCH_ERR_ENOMEM_fsck_extent_ends_at;
- darray_for_each(extent_ends->e, i) {
+ __darray_for_each(extent_ends->e, i) {
if (i->snapshot == k.k->p.snapshot) {
snapshots_seen_exit(&i->seen);
*i = n;
@@ -1220,13 +1178,12 @@ static int overlapping_extents_found(struct btree_trans *trans,
swap(k1, k2);
}
- trans->extra_journal_res += bch2_bkey_sectors_compressed(k2);
+ trans->extra_disk_res += bch2_bkey_sectors_compressed(k2);
ret = bch2_trans_update_extent_overwrite(trans, old_iter,
BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE,
k1, k2) ?:
- bch2_trans_commit(trans, &res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc);
bch2_disk_reservation_put(c, &res);
if (ret)
@@ -1270,7 +1227,6 @@ static int check_overlapping_extents(struct btree_trans *trans,
bool *fixed)
{
struct bch_fs *c = trans->c;
- struct extent_end *i;
int ret = 0;
/* transaction restart, running again */
@@ -1451,32 +1407,28 @@ int bch2_check_extents(struct bch_fs *c)
{
struct inode_walker w = inode_walker_init();
struct snapshots_seen s;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
struct extent_ends extent_ends;
struct disk_reservation res = { 0 };
- int ret = 0;
snapshots_seen_init(&s);
extent_ends_init(&extent_ends);
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
- bch2_disk_reservation_put(c, &res);
- check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
- check_extent_overbig(trans, &iter, k);
- })) ?:
- check_i_sectors(trans, &w);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_extents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ &res, NULL,
+ BCH_TRANS_COMMIT_no_enospc, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent(trans, &iter, k, &w, &s, &extent_ends) ?:
+ check_extent_overbig(trans, &iter, k);
+ })) ?:
+ check_i_sectors(trans, &w));
bch2_disk_reservation_put(c, &res);
extent_ends_exit(&extent_ends);
inode_walker_exit(&w);
snapshots_seen_exit(&s);
- bch2_trans_put(trans);
bch_err_fn(c, ret);
return ret;
@@ -1484,24 +1436,19 @@ int bch2_check_extents(struct bch_fs *c)
int bch2_check_indirect_extents(struct bch_fs *c)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
struct disk_reservation res = { 0 };
- int ret = 0;
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
- POS_MIN,
- BTREE_ITER_PREFETCH, k,
- &res, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL, ({
- bch2_disk_reservation_put(c, &res);
- check_extent_overbig(trans, &iter, k);
- }));
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_reflink,
+ POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ &res, NULL,
+ BCH_TRANS_COMMIT_no_enospc, ({
+ bch2_disk_reservation_put(c, &res);
+ check_extent_overbig(trans, &iter, k);
+ })));
bch2_disk_reservation_put(c, &res);
- bch2_trans_put(trans);
-
bch_err_fn(c, ret);
return ret;
}
@@ -1509,7 +1456,6 @@ int bch2_check_indirect_extents(struct bch_fs *c)
static int check_subdir_count(struct btree_trans *trans, struct inode_walker *w)
{
struct bch_fs *c = trans->c;
- struct inode_walker_entry *i;
u32 restart_count = trans->restart_count;
int ret = 0;
s64 count2;
@@ -1553,8 +1499,8 @@ static int check_dirent_target(struct btree_trans *trans,
{
struct bch_fs *c = trans->c;
struct bkey_i_dirent *n;
- bool backpointer_exists = true;
struct printbuf buf = PRINTBUF;
+ struct btree_iter bp_iter = { NULL };
int ret = 0;
if (!target->bi_dir &&
@@ -1568,25 +1514,37 @@ static int check_dirent_target(struct btree_trans *trans,
}
if (!inode_points_to_dirent(target, d)) {
- ret = inode_backpointer_exists(trans, target, d.k->p.snapshot);
- if (ret < 0)
+ struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter,
+ SPOS(target->bi_dir, target->bi_dir_offset, target_snapshot));
+ ret = bkey_err(bp_dirent);
+ if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
- backpointer_exists = ret;
+ bool backpointer_exists = !ret;
ret = 0;
+ bch2_bkey_val_to_text(&buf, c, d.s_c);
+ prt_newline(&buf);
+ if (backpointer_exists)
+ bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c);
+
if (fsck_err_on(S_ISDIR(target->bi_mode) && backpointer_exists,
c, inode_dir_multiple_links,
- "directory %llu with multiple links",
- target->bi_inum)) {
+ "directory %llu:%u with multiple links\n%s",
+ target->bi_inum, target_snapshot, buf.buf)) {
ret = __remove_dirent(trans, d.k->p);
goto out;
}
+ /*
+ * hardlinked file with nlink 0:
+ * We're just adjusting nlink here so check_nlinks() will pick
+ * it up, it ignores inodes with nlink 0
+ */
if (fsck_err_on(backpointer_exists && !target->bi_nlink,
c, inode_multiple_links_but_nlink_0,
- "inode %llu type %s has multiple links but i_nlink 0",
- target->bi_inum, bch2_d_types[d.v->d_type])) {
+ "inode %llu:%u type %s has multiple links but i_nlink 0\n%s",
+ target->bi_inum, target_snapshot, bch2_d_types[d.v->d_type], buf.buf)) {
target->bi_nlink++;
target->bi_flags &= ~BCH_INODE_unlinked;
@@ -1636,13 +1594,12 @@ static int check_dirent_target(struct btree_trans *trans,
d = dirent_i_to_s_c(n);
}
- if (d.v->d_type == DT_SUBVOL &&
- target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol) &&
- (c->sb.version < bcachefs_metadata_version_subvol_dirent ||
- fsck_err(c, dirent_d_parent_subvol_wrong,
- "dirent has wrong d_parent_subvol field: got %u, should be %u",
- le32_to_cpu(d.v->d_parent_subvol),
- target->bi_parent_subvol))) {
+ if (fsck_err_on(d.v->d_type == DT_SUBVOL &&
+ target->bi_parent_subvol != le32_to_cpu(d.v->d_parent_subvol),
+ c, dirent_d_parent_subvol_wrong,
+ "dirent has wrong d_parent_subvol field: got %u, should be %u",
+ le32_to_cpu(d.v->d_parent_subvol),
+ target->bi_parent_subvol)) {
n = bch2_trans_kmalloc(trans, bkey_bytes(d.k));
ret = PTR_ERR_OR_ZERO(n);
if (ret)
@@ -1660,6 +1617,7 @@ static int check_dirent_target(struct btree_trans *trans,
out:
err:
fsck_err:
+ bch2_trans_iter_exit(trans, &bp_iter);
printbuf_exit(&buf);
bch_err_fn(c, ret);
return ret;
@@ -1701,7 +1659,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}
- BUG_ON(!iter->path->should_be_locked);
+ BUG_ON(!btree_iter_path(trans, iter)->should_be_locked);
i = walk_inode(trans, dir, equiv, k.k->type == KEY_TYPE_whiteout);
ret = PTR_ERR_OR_ZERO(i);
@@ -1754,7 +1712,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
u32 target_snapshot;
u64 target_inum;
- ret = __subvol_lookup(trans, target_subvol,
+ ret = subvol_lookup(trans, target_subvol,
&target_snapshot, &target_inum);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
@@ -1766,7 +1724,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter,
goto err;
}
- ret = __lookup_inode(trans, target_inum,
+ ret = lookup_inode(trans, target_inum,
&subvol_root, &target_snapshot);
if (ret && !bch2_err_matches(ret, ENOENT))
goto err;
@@ -1842,22 +1800,18 @@ int bch2_check_dirents(struct bch_fs *c)
struct inode_walker target = inode_walker_init();
struct snapshots_seen s;
struct bch_hash_info hash_info;
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
snapshots_seen_init(&s);
- ret = for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
- POS(BCACHEFS_ROOT_INO, 0),
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
- k,
- NULL, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s));
+ int ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_dirents,
+ POS(BCACHEFS_ROOT_INO, 0),
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
+ k,
+ NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc,
+ check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)));
- bch2_trans_put(trans);
snapshots_seen_exit(&s);
inode_walker_exit(&dir);
inode_walker_exit(&target);
@@ -1908,8 +1862,6 @@ int bch2_check_xattrs(struct bch_fs *c)
{
struct inode_walker inode = inode_walker_init();
struct bch_hash_info hash_info;
- struct btree_iter iter;
- struct bkey_s_c k;
int ret = 0;
ret = bch2_trans_run(c,
@@ -1918,7 +1870,7 @@ int bch2_check_xattrs(struct bch_fs *c)
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS,
k,
NULL, NULL,
- BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ BCH_TRANS_COMMIT_no_enospc,
check_xattr(trans, &iter, k, &hash_info, &inode)));
bch_err_fn(c, ret);
return ret;
@@ -1932,7 +1884,7 @@ static int check_root_trans(struct btree_trans *trans)
u64 inum;
int ret;
- ret = __subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
+ ret = subvol_lookup(trans, BCACHEFS_ROOT_SUBVOL, &snapshot, &inum);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
@@ -1948,18 +1900,13 @@ static int check_root_trans(struct btree_trans *trans)
root_subvol.v.flags = 0;
root_subvol.v.snapshot = cpu_to_le32(snapshot);
root_subvol.v.inode = cpu_to_le64(inum);
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- bch2_btree_insert_trans(trans, BTREE_ID_subvolumes,
- &root_subvol.k_i, 0));
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_subvolumes, &root_subvol.k_i, 0);
bch_err_msg(c, ret, "writing root subvol");
if (ret)
goto err;
-
}
- ret = __lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
+ ret = lookup_inode(trans, BCACHEFS_ROOT_INO, &root_inode, &snapshot);
if (ret && !bch2_err_matches(ret, ENOENT))
return ret;
@@ -1983,11 +1930,7 @@ fsck_err:
/* Get root directory, create if it doesn't exist: */
int bch2_check_root(struct bch_fs *c)
{
- int ret;
-
- ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_root_trans(trans));
bch_err_fn(c, ret);
return ret;
@@ -2002,13 +1945,10 @@ typedef DARRAY(struct pathbuf_entry) pathbuf;
static bool path_is_dup(pathbuf *p, u64 inum, u32 snapshot)
{
- struct pathbuf_entry *i;
-
darray_for_each(*p, i)
if (i->inum == inum &&
i->snapshot == snapshot)
return true;
-
return false;
}
@@ -2057,10 +1997,10 @@ static int check_path(struct btree_trans *trans,
break;
}
- ret = lockrestart_do(trans,
- PTR_ERR_OR_ZERO((d = dirent_get_by_pos(trans, &dirent_iter,
- SPOS(inode->bi_dir, inode->bi_dir_offset,
- parent_snapshot))).k));
+ d = dirent_get_by_pos(trans, &dirent_iter,
+ SPOS(inode->bi_dir, inode->bi_dir_offset,
+ parent_snapshot));
+ ret = bkey_err(d.s_c);
if (ret && !bch2_err_matches(ret, ENOENT))
break;
@@ -2097,13 +2037,12 @@ static int check_path(struct btree_trans *trans,
ret = lookup_inode(trans, inode->bi_dir, inode, &snapshot);
if (ret) {
/* Should have been caught in dirents pass */
- bch_err(c, "error looking up parent directory: %i", ret);
+ if (!bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err(c, "error looking up parent directory: %i", ret);
break;
}
if (path_is_dup(p, inode->bi_inum, snapshot)) {
- struct pathbuf_entry *i;
-
/* XXX print path */
bch_err(c, "directory structure loop");
@@ -2111,20 +2050,19 @@ static int check_path(struct btree_trans *trans,
pr_err("%llu:%u", i->inum, i->snapshot);
pr_err("%llu:%u", inode->bi_inum, snapshot);
- if (!fsck_err(c, dir_loop,
- "directory structure loop"))
+ if (!fsck_err(c, dir_loop, "directory structure loop"))
return 0;
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- remove_backpointer(trans, inode));
- if (ret) {
- bch_err(c, "error removing dirent: %i", ret);
+ ret = remove_backpointer(trans, inode);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err_msg(c, ret, "removing dirent");
+ if (ret)
break;
- }
ret = reattach_inode(trans, inode, snapshot);
+ if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
+ bch_err_msg(c, ret, "reattaching inode %llu", inode->bi_inum);
+ break;
}
}
fsck_err:
@@ -2139,37 +2077,28 @@ fsck_err:
*/
int bch2_check_directory_structure(struct bch_fs *c)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
struct bch_inode_unpacked u;
pathbuf path = { 0, };
int ret;
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- if (!bkey_is_inode(k.k))
- continue;
+ ret = bch2_trans_run(c,
+ for_each_btree_key_commit(trans, iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ if (!bkey_is_inode(k.k))
+ continue;
- ret = bch2_inode_unpack(k, &u);
- if (ret) {
- /* Should have been caught earlier in fsck: */
- bch_err(c, "error unpacking inode %llu: %i", k.k->p.offset, ret);
- break;
- }
+ BUG_ON(bch2_inode_unpack(k, &u));
- if (u.bi_flags & BCH_INODE_unlinked)
- continue;
+ if (u.bi_flags & BCH_INODE_unlinked)
+ continue;
- ret = check_path(trans, &path, &u, iter.pos.snapshot);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
+ check_path(trans, &path, &u, iter.pos.snapshot);
+ })));
darray_exit(&path);
+
bch_err_fn(c, ret);
return ret;
}
@@ -2255,47 +2184,39 @@ static int check_nlinks_find_hardlinks(struct bch_fs *c,
struct nlink_table *t,
u64 start, u64 *end)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bch_inode_unpacked u;
- int ret = 0;
-
- for_each_btree_key(trans, iter, BTREE_ID_inodes,
- POS(0, start),
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- if (!bkey_is_inode(k.k))
- continue;
-
- /* Should never fail, checked by bch2_inode_invalid: */
- BUG_ON(bch2_inode_unpack(k, &u));
-
- /*
- * Backpointer and directory structure checks are sufficient for
- * directories, since they can't have hardlinks:
- */
- if (S_ISDIR(u.bi_mode))
- continue;
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_inodes,
+ POS(0, start),
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ if (!bkey_is_inode(k.k))
+ continue;
- if (!u.bi_nlink)
- continue;
+ /* Should never fail, checked by bch2_inode_invalid: */
+ struct bch_inode_unpacked u;
+ BUG_ON(bch2_inode_unpack(k, &u));
- ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
- if (ret) {
- *end = k.k->p.offset;
- ret = 0;
- break;
- }
+ /*
+ * Backpointer and directory structure checks are sufficient for
+ * directories, since they can't have hardlinks:
+ */
+ if (S_ISDIR(u.bi_mode))
+ continue;
- }
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
+ if (!u.bi_nlink)
+ continue;
- if (ret)
- bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+ ret = add_nlink(c, t, k.k->p.offset, k.k->p.snapshot);
+ if (ret) {
+ *end = k.k->p.offset;
+ ret = 0;
+ break;
+ }
+ 0;
+ })));
+ bch_err_fn(c, ret);
return ret;
}
@@ -2303,42 +2224,34 @@ noinline_for_stack
static int check_nlinks_walk_dirents(struct bch_fs *c, struct nlink_table *links,
u64 range_start, u64 range_end)
{
- struct btree_trans *trans = bch2_trans_get(c);
struct snapshots_seen s;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_dirent d;
- int ret;
snapshots_seen_init(&s);
- for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
- BTREE_ITER_INTENT|
- BTREE_ITER_PREFETCH|
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
- if (ret)
- break;
-
- switch (k.k->type) {
- case KEY_TYPE_dirent:
- d = bkey_s_c_to_dirent(k);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_dirents, POS_MIN,
+ BTREE_ITER_INTENT|
+ BTREE_ITER_PREFETCH|
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ ret = snapshots_seen_update(c, &s, iter.btree_id, k.k->p);
+ if (ret)
+ break;
- if (d.v->d_type != DT_DIR &&
- d.v->d_type != DT_SUBVOL)
- inc_link(c, &s, links, range_start, range_end,
- le64_to_cpu(d.v->d_inum),
- bch2_snapshot_equiv(c, d.k->p.snapshot));
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
+ if (k.k->type == KEY_TYPE_dirent) {
+ struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- if (ret)
- bch_err(c, "error in fsck: btree error %i while walking dirents", ret);
+ if (d.v->d_type != DT_DIR &&
+ d.v->d_type != DT_SUBVOL)
+ inc_link(c, &s, links, range_start, range_end,
+ le64_to_cpu(d.v->d_inum),
+ bch2_snapshot_equiv(c, d.k->p.snapshot));
+ }
+ 0;
+ })));
- bch2_trans_put(trans);
snapshots_seen_exit(&s);
+
+ bch_err_fn(c, ret);
return ret;
}
@@ -2389,19 +2302,16 @@ static int check_nlinks_update_hardlinks(struct bch_fs *c,
struct nlink_table *links,
u64 range_start, u64 range_end)
{
- struct btree_iter iter;
- struct bkey_s_c k;
size_t idx = 0;
- int ret = 0;
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_inodes,
POS(0, range_start),
BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_nlinks_update_inode(trans, &iter, k, links, &idx, range_end)));
if (ret < 0) {
- bch_err(c, "error in fsck: btree error %i while walking inodes", ret);
+ bch_err(c, "error in fsck walking inodes: %s", bch2_err_str(ret));
return ret;
}
@@ -2447,7 +2357,6 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
{
struct bkey_s_c_reflink_p p;
struct bkey_i_reflink_p *u;
- int ret;
if (k.k->type != KEY_TYPE_reflink_p)
return 0;
@@ -2458,7 +2367,7 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
return 0;
u = bch2_trans_kmalloc(trans, sizeof(*u));
- ret = PTR_ERR_OR_ZERO(u);
+ int ret = PTR_ERR_OR_ZERO(u);
if (ret)
return ret;
@@ -2471,19 +2380,15 @@ static int fix_reflink_p_key(struct btree_trans *trans, struct btree_iter *iter,
int bch2_fix_reflink_p(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix)
return 0;
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_extents, POS_MIN,
BTREE_ITER_INTENT|BTREE_ITER_PREFETCH|
BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
fix_reflink_p_key(trans, &iter, k)));
bch_err_fn(c, ret);
return ret;
diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c
index 9309cfeecd8d..086f0090b03a 100644
--- a/fs/bcachefs/inode.c
+++ b/fs/bcachefs/inode.c
@@ -506,22 +506,33 @@ fsck_err:
static void __bch2_inode_unpacked_to_text(struct printbuf *out,
struct bch_inode_unpacked *inode)
{
- prt_printf(out, "mode=%o ", inode->bi_mode);
+ printbuf_indent_add(out, 2);
+ prt_printf(out, "mode=%o", inode->bi_mode);
+ prt_newline(out);
prt_str(out, "flags=");
prt_bitflags(out, bch2_inode_flag_strs, inode->bi_flags & ((1U << 20) - 1));
prt_printf(out, " (%x)", inode->bi_flags);
+ prt_newline(out);
- prt_printf(out, " journal_seq=%llu bi_size=%llu bi_sectors=%llu bi_version=%llu",
- inode->bi_journal_seq,
- inode->bi_size,
- inode->bi_sectors,
- inode->bi_version);
+ prt_printf(out, "journal_seq=%llu", inode->bi_journal_seq);
+ prt_newline(out);
+
+ prt_printf(out, "bi_size=%llu", inode->bi_size);
+ prt_newline(out);
+
+ prt_printf(out, "bi_sectors=%llu", inode->bi_sectors);
+ prt_newline(out);
+
+ prt_newline(out);
+ prt_printf(out, "bi_version=%llu", inode->bi_version);
#define x(_name, _bits) \
- prt_printf(out, " "#_name "=%llu", (u64) inode->_name);
+ prt_printf(out, #_name "=%llu", (u64) inode->_name); \
+ prt_newline(out);
BCH_INODE_FIELDS_v3()
#undef x
+ printbuf_indent_sub(out, 2);
}
void bch2_inode_unpacked_to_text(struct printbuf *out, struct bch_inode_unpacked *inode)
@@ -561,64 +572,46 @@ static inline bool bkey_is_deleted_inode(struct bkey_s_c k)
return bkey_inode_flags(k) & BCH_INODE_unlinked;
}
-int bch2_trans_mark_inode(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old,
- struct bkey_i *new,
- unsigned flags)
+int bch2_trigger_inode(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_s new,
+ unsigned flags)
{
- int nr = bkey_is_inode(&new->k) - bkey_is_inode(old.k);
- bool old_deleted = bkey_is_deleted_inode(old);
- bool new_deleted = bkey_is_deleted_inode(bkey_i_to_s_c(new));
+ s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k);
- if (nr) {
- int ret = bch2_replicas_deltas_realloc(trans, 0);
- struct replicas_delta_list *d = trans->fs_usage_deltas;
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ if (nr) {
+ int ret = bch2_replicas_deltas_realloc(trans, 0);
+ if (ret)
+ return ret;
- if (ret)
- return ret;
+ trans->fs_usage_deltas->nr_inodes += nr;
+ }
- d->nr_inodes += nr;
+ bool old_deleted = bkey_is_deleted_inode(old);
+ bool new_deleted = bkey_is_deleted_inode(new.s_c);
+ if (old_deleted != new_deleted) {
+ int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new.k->p, new_deleted);
+ if (ret)
+ return ret;
+ }
}
- if (old_deleted != new_deleted) {
- int ret = bch2_btree_bit_mod(trans, BTREE_ID_deleted_inodes, new->k.p, new_deleted);
- if (ret)
- return ret;
- }
+ if ((flags & BTREE_TRIGGER_ATOMIC) && (flags & BTREE_TRIGGER_INSERT)) {
+ BUG_ON(!trans->journal_res.seq);
- return 0;
-}
-
-int bch2_mark_inode(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_s_c new,
- unsigned flags)
-{
- struct bch_fs *c = trans->c;
- struct bch_fs_usage *fs_usage;
- u64 journal_seq = trans->journal_res.seq;
-
- if (flags & BTREE_TRIGGER_INSERT) {
- struct bch_inode_v3 *v = (struct bch_inode_v3 *) new.v;
-
- BUG_ON(!journal_seq);
- BUG_ON(new.k->type != KEY_TYPE_inode_v3);
-
- v->bi_journal_seq = cpu_to_le64(journal_seq);
+ bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq);
}
if (flags & BTREE_TRIGGER_GC) {
- percpu_down_read(&c->mark_lock);
- preempt_disable();
+ struct bch_fs *c = trans->c;
- fs_usage = fs_usage_ptr(c, journal_seq, flags & BTREE_TRIGGER_GC);
- fs_usage->nr_inodes += bkey_is_inode(new.k);
- fs_usage->nr_inodes -= bkey_is_inode(old.k);
-
- preempt_enable();
+ percpu_down_read(&c->mark_lock);
+ this_cpu_add(c->usage_gc->b.nr_inodes, nr);
percpu_up_read(&c->mark_lock);
}
+
return 0;
}
@@ -831,7 +824,7 @@ static int bch2_inode_delete_keys(struct btree_trans *trans,
ret = bch2_trans_update(trans, &iter, &delete, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
break;
@@ -894,7 +887,7 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -1058,7 +1051,7 @@ retry:
ret = bch2_trans_update(trans, &iter, &delete.k_i, 0) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
err:
bch2_trans_iter_exit(trans, &iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -1155,51 +1148,48 @@ delete:
int bch2_delete_dead_inodes(struct bch_fs *c)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
bool need_another_pass;
int ret;
again:
need_another_pass = false;
- ret = bch2_btree_write_buffer_flush_sync(trans);
- if (ret)
- goto err;
-
/*
* Weird transaction restart handling here because on successful delete,
* bch2_inode_rm_snapshot() will return a nested transaction restart,
* but we can't retry because the btree write buffer won't have been
* flushed and we'd spin:
*/
- for_each_btree_key(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
- BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
- ret = commit_do(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL|
- BTREE_INSERT_LAZY_RW,
- may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass));
- if (ret < 0)
- break;
-
- if (ret) {
- if (!test_bit(BCH_FS_RW, &c->flags)) {
- bch2_trans_unlock(trans);
- bch2_fs_lazy_rw(c);
- }
-
+ ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
+ ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass);
+ if (ret > 0) {
bch_verbose(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot);
ret = bch2_inode_rm_snapshot(trans, k.k->p.offset, k.k->p.snapshot);
- if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
- break;
+ /*
+ * We don't want to loop here: a transaction restart
+ * error here means we handled a transaction restart and
+ * we're actually done, but if we loop we'll retry the
+ * same key because the write buffer hasn't been flushed
+ * yet
+ */
+ if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
+ ret = 0;
+ continue;
+ }
}
- }
- bch2_trans_iter_exit(trans, &iter);
- if (!ret && need_another_pass)
+ ret;
+ }));
+
+ if (!ret && need_another_pass) {
+ ret = bch2_btree_write_buffer_flush_sync(trans);
+ if (ret)
+ goto err;
goto again;
+ }
err:
bch2_trans_put(trans);
-
return ret;
}
diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h
index 88818a332b1e..b63f312581cf 100644
--- a/fs/bcachefs/inode.h
+++ b/fs/bcachefs/inode.h
@@ -17,32 +17,27 @@ int bch2_inode_v3_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_inode_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
-int bch2_trans_mark_inode(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *, unsigned);
-int bch2_mark_inode(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
+int bch2_trigger_inode(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_inode ((struct bkey_ops) { \
.key_invalid = bch2_inode_invalid, \
.val_to_text = bch2_inode_to_text, \
- .trans_trigger = bch2_trans_mark_inode, \
- .atomic_trigger = bch2_mark_inode, \
+ .trigger = bch2_trigger_inode, \
.min_val_size = 16, \
})
#define bch2_bkey_ops_inode_v2 ((struct bkey_ops) { \
.key_invalid = bch2_inode_v2_invalid, \
.val_to_text = bch2_inode_to_text, \
- .trans_trigger = bch2_trans_mark_inode, \
- .atomic_trigger = bch2_mark_inode, \
+ .trigger = bch2_trigger_inode, \
.min_val_size = 32, \
})
#define bch2_bkey_ops_inode_v3 ((struct bkey_ops) { \
.key_invalid = bch2_inode_v3_invalid, \
.val_to_text = bch2_inode_to_text, \
- .trans_trigger = bch2_trans_mark_inode, \
- .atomic_trigger = bch2_mark_inode, \
+ .trigger = bch2_trigger_inode, \
.min_val_size = 48, \
})
diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h
new file mode 100644
index 000000000000..83d107331edf
--- /dev/null
+++ b/fs/bcachefs/inode_format.h
@@ -0,0 +1,166 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_INODE_FORMAT_H
+#define _BCACHEFS_INODE_FORMAT_H
+
+#define BLOCKDEV_INODE_MAX 4096
+#define BCACHEFS_ROOT_INO 4096
+
+struct bch_inode {
+ struct bch_val v;
+
+ __le64 bi_hash_seed;
+ __le32 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v2 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le16 bi_mode;
+ __u8 fields[];
+} __packed __aligned(8);
+
+struct bch_inode_v3 {
+ struct bch_val v;
+
+ __le64 bi_journal_seq;
+ __le64 bi_hash_seed;
+ __le64 bi_flags;
+ __le64 bi_sectors;
+ __le64 bi_size;
+ __le64 bi_version;
+ __u8 fields[];
+} __packed __aligned(8);
+
+#define INODEv3_FIELDS_START_INITIAL 6
+#define INODEv3_FIELDS_START_CUR (offsetof(struct bch_inode_v3, fields) / sizeof(__u64))
+
+struct bch_inode_generation {
+ struct bch_val v;
+
+ __le32 bi_generation;
+ __le32 pad;
+} __packed __aligned(8);
+
+/*
+ * bi_subvol and bi_parent_subvol are only set for subvolume roots:
+ */
+
+#define BCH_INODE_FIELDS_v2() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_size, 64) \
+ x(bi_sectors, 64) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32)
+
+#define BCH_INODE_FIELDS_v3() \
+ x(bi_atime, 96) \
+ x(bi_ctime, 96) \
+ x(bi_mtime, 96) \
+ x(bi_otime, 96) \
+ x(bi_uid, 32) \
+ x(bi_gid, 32) \
+ x(bi_nlink, 32) \
+ x(bi_generation, 32) \
+ x(bi_dev, 32) \
+ x(bi_data_checksum, 8) \
+ x(bi_compression, 8) \
+ x(bi_project, 32) \
+ x(bi_background_compression, 8) \
+ x(bi_data_replicas, 8) \
+ x(bi_promote_target, 16) \
+ x(bi_foreground_target, 16) \
+ x(bi_background_target, 16) \
+ x(bi_erasure_code, 16) \
+ x(bi_fields_set, 16) \
+ x(bi_dir, 64) \
+ x(bi_dir_offset, 64) \
+ x(bi_subvol, 32) \
+ x(bi_parent_subvol, 32) \
+ x(bi_nocow, 8)
+
+/* subset of BCH_INODE_FIELDS */
+#define BCH_INODE_OPTS() \
+ x(data_checksum, 8) \
+ x(compression, 8) \
+ x(project, 32) \
+ x(background_compression, 8) \
+ x(data_replicas, 8) \
+ x(promote_target, 16) \
+ x(foreground_target, 16) \
+ x(background_target, 16) \
+ x(erasure_code, 16) \
+ x(nocow, 8)
+
+enum inode_opt_id {
+#define x(name, ...) \
+ Inode_opt_##name,
+ BCH_INODE_OPTS()
+#undef x
+ Inode_opt_nr,
+};
+
+#define BCH_INODE_FLAGS() \
+ x(sync, 0) \
+ x(immutable, 1) \
+ x(append, 2) \
+ x(nodump, 3) \
+ x(noatime, 4) \
+ x(i_size_dirty, 5) \
+ x(i_sectors_dirty, 6) \
+ x(unlinked, 7) \
+ x(backptr_untrusted, 8)
+
+/* bits 20+ reserved for packed fields below: */
+
+enum bch_inode_flags {
+#define x(t, n) BCH_INODE_##t = 1U << n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+enum __bch_inode_flags {
+#define x(t, n) __BCH_INODE_##t = n,
+ BCH_INODE_FLAGS()
+#undef x
+};
+
+LE32_BITMASK(INODE_STR_HASH, struct bch_inode, bi_flags, 20, 24);
+LE32_BITMASK(INODE_NR_FIELDS, struct bch_inode, bi_flags, 24, 31);
+LE32_BITMASK(INODE_NEW_VARINT, struct bch_inode, bi_flags, 31, 32);
+
+LE64_BITMASK(INODEv2_STR_HASH, struct bch_inode_v2, bi_flags, 20, 24);
+LE64_BITMASK(INODEv2_NR_FIELDS, struct bch_inode_v2, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_STR_HASH, struct bch_inode_v3, bi_flags, 20, 24);
+LE64_BITMASK(INODEv3_NR_FIELDS, struct bch_inode_v3, bi_flags, 24, 31);
+
+LE64_BITMASK(INODEv3_FIELDS_START,
+ struct bch_inode_v3, bi_flags, 31, 36);
+LE64_BITMASK(INODEv3_MODE, struct bch_inode_v3, bi_flags, 36, 52);
+
+#endif /* _BCACHEFS_INODE_FORMAT_H */
diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c
index bebc11444ef5..1baf78594cca 100644
--- a/fs/bcachefs/io_misc.c
+++ b/fs/bcachefs/io_misc.c
@@ -34,8 +34,7 @@ int bch2_extent_fallocate(struct btree_trans *trans,
struct open_buckets open_buckets = { 0 };
struct bkey_s_c k;
struct bkey_buf old, new;
- unsigned sectors_allocated = 0;
- bool have_reservation = false;
+ unsigned sectors_allocated = 0, new_replicas;
bool unwritten = opts.nocow &&
c->sb.version >= bcachefs_metadata_version_unwritten_extents;
int ret;
@@ -50,28 +49,20 @@ int bch2_extent_fallocate(struct btree_trans *trans,
return ret;
sectors = min_t(u64, sectors, k.k->p.offset - iter->pos.offset);
+ new_replicas = max(0, (int) opts.data_replicas -
+ (int) bch2_bkey_nr_ptrs_fully_allocated(k));
- if (!have_reservation) {
- unsigned new_replicas =
- max(0, (int) opts.data_replicas -
- (int) bch2_bkey_nr_ptrs_fully_allocated(k));
- /*
- * Get a disk reservation before (in the nocow case) calling
- * into the allocator:
- */
- ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
- if (unlikely(ret))
- goto err;
-
- bch2_bkey_buf_reassemble(&old, c, k);
- }
+ /*
+ * Get a disk reservation before (in the nocow case) calling
+ * into the allocator:
+ */
+ ret = bch2_disk_reservation_get(c, &disk_res, sectors, new_replicas, 0);
+ if (unlikely(ret))
+ goto err_noprint;
- if (have_reservation) {
- if (!bch2_extents_match(k, bkey_i_to_s_c(old.k)))
- goto err;
+ bch2_bkey_buf_reassemble(&old, c, k);
- bch2_key_resize(&new.k->k, sectors);
- } else if (!unwritten) {
+ if (!unwritten) {
struct bkey_i_reservation *reservation;
bch2_bkey_buf_realloc(&new, c, sizeof(*reservation) / sizeof(u64));
@@ -83,7 +74,6 @@ int bch2_extent_fallocate(struct btree_trans *trans,
struct bkey_i_extent *e;
struct bch_devs_list devs_have;
struct write_point *wp;
- struct bch_extent_ptr *ptr;
devs_have.nr = 0;
@@ -118,14 +108,17 @@ int bch2_extent_fallocate(struct btree_trans *trans,
ptr->unwritten = true;
}
- have_reservation = true;
-
ret = bch2_extent_update(trans, inum, iter, new.k, &disk_res,
0, i_sectors_delta, true);
err:
if (!ret && sectors_allocated)
bch2_increment_clock(c, sectors_allocated, WRITE);
-
+ if (should_print_err(ret))
+ bch_err_inum_offset_ratelimited(c,
+ inum.inum,
+ iter->pos.offset << 9,
+ "%s(): error: %s", __func__, bch2_err_str(ret));
+err_noprint:
bch2_open_buckets_put(c, &open_buckets);
bch2_disk_reservation_put(c, &disk_res);
bch2_bkey_buf_exit(&new, c);
@@ -256,7 +249,7 @@ static int __bch2_resume_logged_op_truncate(struct btree_trans *trans,
u64 new_i_size = le64_to_cpu(op->v.new_i_size);
int ret;
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
truncate_set_isize(trans, inum, new_i_size));
if (ret)
goto err;
@@ -378,7 +371,7 @@ case LOGGED_OP_FINSERT_start:
op->v.state = LOGGED_OP_FINSERT_shift_extents;
if (insert) {
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, src_offset, len) ?:
bch2_logged_op_update(trans, &op->k_i));
if (ret)
@@ -390,7 +383,7 @@ case LOGGED_OP_FINSERT_start:
if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart))
goto err;
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_logged_op_update(trans, &op->k_i));
}
@@ -449,13 +442,11 @@ case LOGGED_OP_FINSERT_shift_extents:
op->v.pos = cpu_to_le64(insert ? bkey_start_offset(&delete.k) : delete.k.p.offset);
- ret = bch2_bkey_set_needs_rebalance(c, copy,
- opts.background_target,
- opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, copy, &opts) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, &delete, 0) ?:
bch2_btree_insert_trans(trans, BTREE_ID_extents, copy, 0) ?:
bch2_logged_op_update(trans, &op->k_i) ?:
- bch2_trans_commit(trans, &disk_res, NULL, BTREE_INSERT_NOFAIL);
+ bch2_trans_commit(trans, &disk_res, NULL, BCH_TRANS_COMMIT_no_enospc);
btree_err:
bch2_disk_reservation_put(c, &disk_res);
@@ -470,12 +461,12 @@ btree_err:
op->v.state = LOGGED_OP_FINSERT_finish;
if (!insert) {
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, src_offset, shift) ?:
bch2_logged_op_update(trans, &op->k_i));
} else {
/* We need an inode update to update bi_journal_seq for fsync: */
- ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
adjust_i_size(trans, inum, 0, 0) ?:
bch2_logged_op_update(trans, &op->k_i));
}
diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c
index 36763865facd..3c574d8873a1 100644
--- a/fs/bcachefs/io_read.c
+++ b/fs/bcachefs/io_read.c
@@ -80,7 +80,7 @@ struct promote_op {
struct bpos pos;
struct data_update write;
- struct bio_vec bi_inline_vecs[0]; /* must be last */
+ struct bio_vec bi_inline_vecs[]; /* must be last */
};
static const struct rhashtable_params bch_promote_params = {
@@ -172,11 +172,13 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
int ret;
if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote))
- return NULL;
+ return ERR_PTR(-BCH_ERR_nopromote_no_writes);
- op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_NOFS);
- if (!op)
+ op = kzalloc(sizeof(*op) + sizeof(struct bio_vec) * pages, GFP_KERNEL);
+ if (!op) {
+ ret = -BCH_ERR_nopromote_enomem;
goto err;
+ }
op->start_time = local_clock();
op->pos = pos;
@@ -187,24 +189,29 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
*/
*rbio = kzalloc(sizeof(struct bch_read_bio) +
sizeof(struct bio_vec) * pages,
- GFP_NOFS);
- if (!*rbio)
+ GFP_KERNEL);
+ if (!*rbio) {
+ ret = -BCH_ERR_nopromote_enomem;
goto err;
+ }
rbio_init(&(*rbio)->bio, opts);
bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0);
- if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9,
- GFP_NOFS))
+ if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) {
+ ret = -BCH_ERR_nopromote_enomem;
goto err;
+ }
(*rbio)->bounce = true;
(*rbio)->split = true;
(*rbio)->kmalloc = true;
if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash,
- bch_promote_params))
+ bch_promote_params)) {
+ ret = -BCH_ERR_nopromote_in_flight;
goto err;
+ }
bio = &op->write.op.wbio.bio;
bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0);
@@ -223,9 +230,8 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans,
* -BCH_ERR_ENOSPC_disk_reservation:
*/
if (ret) {
- ret = rhashtable_remove_fast(&c->promote_table, &op->hash,
- bch_promote_params);
- BUG_ON(ret);
+ BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash,
+ bch_promote_params));
goto err;
}
@@ -239,7 +245,7 @@ err:
*rbio = NULL;
kfree(op);
bch2_write_ref_put(c, BCH_WRITE_REF_promote);
- return NULL;
+ return ERR_PTR(ret);
}
noinline
@@ -274,10 +280,9 @@ static struct promote_op *promote_alloc(struct btree_trans *trans,
? BTREE_ID_reflink
: BTREE_ID_extents,
k, pos, pick, opts, sectors, rbio);
- if (!promote) {
- ret = -BCH_ERR_nopromote_enomem;
+ ret = PTR_ERR_OR_ZERO(promote);
+ if (ret)
goto nopromote;
- }
*bounce = true;
*read_full = promote_full;
@@ -526,7 +531,7 @@ out:
static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio)
{
- bch2_trans_do(rbio->c, NULL, NULL, BTREE_INSERT_NOFAIL,
+ bch2_trans_do(rbio->c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_rbio_narrow_crcs(trans, rbio));
}
@@ -637,12 +642,17 @@ csum_err:
goto out;
}
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+ prt_str(&buf, "data ");
+ bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum);
+
bch_err_inum_offset_ratelimited(ca,
rbio->read_pos.inode,
rbio->read_pos.offset << 9,
- "data checksum error: expected %0llx:%0llx got %0llx:%0llx (type %s)",
- rbio->pick.crc.csum.hi, rbio->pick.crc.csum.lo,
- csum.hi, csum.lo, bch2_csum_types[crc.csum_type]);
+ "data %s", buf.buf);
+ printbuf_exit(&buf);
+
bch2_io_error(ca, BCH_MEMBER_ERROR_checksum);
bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR);
goto out;
diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c
index 8c8cb1541ac9..2c098ac017b3 100644
--- a/fs/bcachefs/io_write.c
+++ b/fs/bcachefs/io_write.c
@@ -316,8 +316,8 @@ int bch2_extent_update(struct btree_trans *trans,
i_sectors_delta) ?:
bch2_trans_update(trans, iter, k, 0) ?:
bch2_trans_commit(trans, disk_res, NULL,
- BTREE_INSERT_NOCHECK_RW|
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_check_rw|
+ BCH_TRANS_COMMIT_no_enospc);
if (unlikely(ret))
return ret;
@@ -362,9 +362,7 @@ static int bch2_write_index_default(struct bch_write_op *op)
bkey_start_pos(&sk.k->k),
BTREE_ITER_SLOTS|BTREE_ITER_INTENT);
- ret = bch2_bkey_set_needs_rebalance(c, sk.k,
- op->opts.background_target,
- op->opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, sk.k, &op->opts) ?:
bch2_extent_update(trans, inum, &iter, sk.k,
&op->res,
op->new_i_size, &op->i_sectors_delta,
@@ -396,17 +394,14 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c,
bool nocow)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
- const struct bch_extent_ptr *ptr;
struct bch_write_bio *n;
- struct bch_dev *ca;
BUG_ON(c->opts.nochanges);
bkey_for_each_ptr(ptrs, ptr) {
- BUG_ON(ptr->dev >= BCH_SB_MEMBERS_MAX ||
- !c->devs[ptr->dev]);
+ BUG_ON(!bch2_dev_exists2(c, ptr->dev));
- ca = bch_dev_bkey_exists(c, ptr->dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, ptr->dev);
if (to_entry(ptr + 1) < ptrs.end) {
n = to_wbio(bio_alloc_clone(NULL, &wbio->bio,
@@ -1109,16 +1104,14 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op,
static inline void bch2_nocow_write_unlock(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
- const struct bch_extent_ptr *ptr;
- struct bkey_i *k;
for_each_keylist_key(&op->insert_keys, k) {
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k));
bkey_for_each_ptr(ptrs, ptr)
bch2_bucket_nocow_unlock(&c->nocow_locks,
- PTR_BUCKET_POS(c, ptr),
- BUCKET_NOCOW_LOCK_UPDATE);
+ PTR_BUCKET_POS(c, ptr),
+ BUCKET_NOCOW_LOCK_UPDATE);
}
}
@@ -1128,25 +1121,20 @@ static int bch2_nocow_write_convert_one_unwritten(struct btree_trans *trans,
struct bkey_s_c k,
u64 new_i_size)
{
- struct bkey_i *new;
- struct bkey_ptrs ptrs;
- struct bch_extent_ptr *ptr;
- int ret;
-
if (!bch2_extents_match(bkey_i_to_s_c(orig), k)) {
/* trace this */
return 0;
}
- new = bch2_bkey_make_mut_noupdate(trans, k);
- ret = PTR_ERR_OR_ZERO(new);
+ struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
+ int ret = PTR_ERR_OR_ZERO(new);
if (ret)
return ret;
bch2_cut_front(bkey_start_pos(&orig->k), new);
bch2_cut_back(orig->k.p, new);
- ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
+ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new));
bkey_for_each_ptr(ptrs, ptr)
ptr->unwritten = 0;
@@ -1167,16 +1155,12 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op)
{
struct bch_fs *c = op->c;
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_i *orig;
- struct bkey_s_c k;
- int ret;
for_each_keylist_key(&op->insert_keys, orig) {
- ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
+ int ret = for_each_btree_key_upto_commit(trans, iter, BTREE_ID_extents,
bkey_start_pos(&orig->k), orig->k.p,
BTREE_ITER_INTENT, k,
- NULL, NULL, BTREE_INSERT_NOFAIL, ({
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({
bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size);
}));
@@ -1228,10 +1212,7 @@ static void bch2_nocow_write(struct bch_write_op *op)
struct btree_trans *trans;
struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_ptrs_c ptrs;
- const struct bch_extent_ptr *ptr;
DARRAY_PREALLOCATED(struct bucket_to_lock, 3) buckets;
- struct bucket_to_lock *i;
u32 snapshot;
struct bucket_to_lock *stale_at;
int ret;
@@ -1273,7 +1254,7 @@ retry:
break;
/* Get iorefs before dropping btree locks: */
- ptrs = bch2_bkey_ptrs_c(k);
+ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
bkey_for_each_ptr(ptrs, ptr) {
struct bpos b = PTR_BUCKET_POS(c, ptr);
struct nocow_lock_bucket *l =
@@ -1464,6 +1445,11 @@ err:
op->flags |= BCH_WRITE_DONE;
if (ret < 0) {
+ if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT))
+ bch_err_inum_offset_ratelimited(c,
+ op->pos.inode,
+ op->pos.offset << 9,
+ "%s(): error: %s", __func__, bch2_err_str(ret));
op->error = ret;
break;
}
@@ -1578,6 +1564,7 @@ CLOSURE_CALLBACK(bch2_write)
BUG_ON(!op->write_point.v);
BUG_ON(bkey_eq(op->pos, POS_MAX));
+ op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas);
op->start_time = local_clock();
bch2_keylist_init(&op->insert_keys, op->inline_keys);
wbio_init(bio)->put_bio = false;
diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c
index 8cf238be6213..bc890776eb57 100644
--- a/fs/bcachefs/journal.c
+++ b/fs/bcachefs/journal.c
@@ -10,6 +10,7 @@
#include "bkey_methods.h"
#include "btree_gc.h"
#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "error.h"
#include "journal.h"
@@ -26,6 +27,47 @@ static const char * const bch2_journal_errors[] = {
NULL
};
+static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u64 seq)
+{
+ union journal_res_state s = READ_ONCE(j->reservations);
+ unsigned i = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *buf = j->buf + i;
+
+ prt_printf(out, "seq:");
+ prt_tab(out);
+ prt_printf(out, "%llu", seq);
+ prt_newline(out);
+ printbuf_indent_add(out, 2);
+
+ prt_printf(out, "refcount:");
+ prt_tab(out);
+ prt_printf(out, "%u", journal_state_count(s, i));
+ prt_newline(out);
+
+ prt_printf(out, "size:");
+ prt_tab(out);
+ prt_human_readable_u64(out, vstruct_bytes(buf->data));
+ prt_newline(out);
+
+ prt_printf(out, "expires");
+ prt_tab(out);
+ prt_printf(out, "%li jiffies", buf->expires - jiffies);
+ prt_newline(out);
+
+ printbuf_indent_sub(out, 2);
+}
+
+static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j)
+{
+ if (!out->nr_tabstops)
+ printbuf_tabstop_push(out, 24);
+
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= journal_cur_seq(j);
+ seq++)
+ bch2_journal_buf_to_text(out, j, seq);
+}
+
static inline bool journal_seq_unwritten(struct journal *j, u64 seq)
{
return seq > j->seq_ondisk;
@@ -155,7 +197,7 @@ void bch2_journal_buf_put_final(struct journal *j, u64 seq, bool write)
* We don't close a journal_buf until the next journal_buf is finished writing,
* and can be opened again - this also initializes the next journal_buf:
*/
-static void __journal_entry_close(struct journal *j, unsigned closed_val)
+static void __journal_entry_close(struct journal *j, unsigned closed_val, bool trace)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct journal_buf *buf = journal_cur_buf(j);
@@ -184,6 +226,18 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
/* Close out old buffer: */
buf->data->u64s = cpu_to_le32(old.cur_entry_offset);
+ if (trace_journal_entry_close_enabled() && trace) {
+ struct printbuf pbuf = PRINTBUF;
+ pbuf.atomic++;
+
+ prt_str(&pbuf, "entry size: ");
+ prt_human_readable_u64(&pbuf, vstruct_bytes(buf->data));
+ prt_newline(&pbuf);
+ bch2_prt_task_backtrace(&pbuf, current, 1, GFP_NOWAIT);
+ trace_journal_entry_close(c, pbuf.buf);
+ printbuf_exit(&pbuf);
+ }
+
sectors = vstruct_blocks_plus(buf->data, c->block_bits,
buf->u64s_reserved) << c->block_bits;
BUG_ON(sectors > buf->sectors);
@@ -222,7 +276,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val)
void bch2_journal_halt(struct journal *j)
{
spin_lock(&j->lock);
- __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true);
if (!j->err_seq)
j->err_seq = journal_cur_seq(j);
journal_wake(j);
@@ -236,7 +290,7 @@ static bool journal_entry_want_write(struct journal *j)
/* Don't close it yet if we already have a write in flight: */
if (ret)
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
else if (nr_unwritten_journal_entries(j)) {
struct journal_buf *buf = journal_cur_buf(j);
@@ -330,6 +384,7 @@ static int journal_entry_open(struct journal *j)
buf->must_flush = false;
buf->separate_flush = false;
buf->flush_time = 0;
+ buf->need_flush_to_write_buffer = true;
memset(buf->data, 0, sizeof(*buf->data));
buf->data->seq = cpu_to_le64(journal_cur_seq(j));
@@ -363,11 +418,6 @@ static int journal_entry_open(struct journal *j)
} while ((v = atomic64_cmpxchg(&j->reservations.counter,
old.v, new.v)) != old.v);
- if (j->res_get_blocked_start)
- bch2_time_stats_update(j->blocked_time,
- j->res_get_blocked_start);
- j->res_get_blocked_start = 0;
-
mod_delayed_work(c->io_complete_wq,
&j->write_work,
msecs_to_jiffies(c->opts.journal_flush_delay));
@@ -407,7 +457,7 @@ static void journal_write_work(struct work_struct *work)
if (delta > 0)
mod_delayed_work(c->io_complete_wq, &j->write_work, delta);
else
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
unlock:
spin_unlock(&j->lock);
}
@@ -464,18 +514,23 @@ retry:
buf->buf_size < JOURNAL_ENTRY_SIZE_MAX)
j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1);
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false);
ret = journal_entry_open(j);
- if (ret == JOURNAL_ERR_max_in_flight)
- trace_and_count(c, journal_entry_full, c);
-unlock:
- if ((ret && ret != JOURNAL_ERR_insufficient_devices) &&
- !j->res_get_blocked_start) {
- j->res_get_blocked_start = local_clock() ?: 1;
- trace_and_count(c, journal_full, c);
- }
+ if (ret == JOURNAL_ERR_max_in_flight) {
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+ &j->max_in_flight_start, true);
+ if (trace_journal_entry_full_enabled()) {
+ struct printbuf buf = PRINTBUF;
+ buf.atomic++;
+ bch2_journal_bufs_to_text(&buf, j);
+ trace_journal_entry_full(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+ count_event(c, journal_entry_full);
+ }
+unlock:
can_discard = j->can_discard;
spin_unlock(&j->lock);
@@ -553,7 +608,7 @@ void bch2_journal_entry_res_resize(struct journal *j,
/*
* Not enough room in current journal entry, have to flush it:
*/
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
} else {
journal_cur_buf(j)->u64s_reserved += d;
}
@@ -610,7 +665,7 @@ recheck_need_open:
struct journal_res res = { 0 };
if (journal_entry_is_open(j))
- __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL);
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
spin_unlock(&j->lock);
@@ -774,6 +829,48 @@ void bch2_journal_block(struct journal *j)
journal_quiesce(j);
}
+static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+{
+ struct journal_buf *ret = NULL;
+
+ mutex_lock(&j->buf_lock);
+ spin_lock(&j->lock);
+ max_seq = min(max_seq, journal_cur_seq(j));
+
+ for (u64 seq = journal_last_unwritten_seq(j);
+ seq <= max_seq;
+ seq++) {
+ unsigned idx = seq & JOURNAL_BUF_MASK;
+ struct journal_buf *buf = j->buf + idx;
+
+ if (buf->need_flush_to_write_buffer) {
+ if (seq == journal_cur_seq(j))
+ __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, true);
+
+ union journal_res_state s;
+ s.v = atomic64_read_acquire(&j->reservations.counter);
+
+ ret = journal_state_count(s, idx)
+ ? ERR_PTR(-EAGAIN)
+ : buf;
+ break;
+ }
+ }
+
+ spin_unlock(&j->lock);
+ if (IS_ERR_OR_NULL(ret))
+ mutex_unlock(&j->buf_lock);
+ return ret;
+}
+
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq)
+{
+ struct journal_buf *ret;
+
+ wait_event(j->wait, (ret = __bch2_next_write_buffer_flush_journal_buf(j, max_seq)) != ERR_PTR(-EAGAIN));
+ return ret;
+}
+
/* allocate journal on a device: */
static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr,
@@ -955,8 +1052,7 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca,
break;
}
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
unlock:
up_write(&c->state_lock);
return ret;
@@ -986,17 +1082,13 @@ int bch2_dev_journal_alloc(struct bch_dev *ca)
ret = __bch2_set_nr_journal_buckets(ca, nr, true, NULL);
err:
- if (ret)
- bch_err_fn(ca, ret);
+ bch_err_fn(ca, ret);
return ret;
}
int bch2_fs_journal_alloc(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (ca->journal.nr)
continue;
@@ -1225,6 +1317,7 @@ int bch2_fs_journal_init(struct journal *j)
static struct lock_class_key res_key;
unsigned i;
+ mutex_init(&j->buf_lock);
spin_lock_init(&j->lock);
spin_lock_init(&j->err_lock);
init_waitqueue_head(&j->wait);
@@ -1260,10 +1353,8 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
union journal_res_state s;
- struct bch_dev *ca;
unsigned long now = jiffies;
- u64 seq;
- unsigned i;
+ u64 nr_writes = j->nr_flush_writes + j->nr_noflush_writes;
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 24);
@@ -1275,20 +1366,23 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
prt_printf(out, "dirty journal entries:\t%llu/%llu\n", fifo_used(&j->pin), j->pin.size);
prt_printf(out, "seq:\t\t\t%llu\n", journal_cur_seq(j));
prt_printf(out, "seq_ondisk:\t\t%llu\n", j->seq_ondisk);
- prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
+ prt_printf(out, "last_seq:\t\t%llu\n", journal_last_seq(j));
prt_printf(out, "last_seq_ondisk:\t%llu\n", j->last_seq_ondisk);
- prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
- prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
- prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
+ prt_printf(out, "flushed_seq_ondisk:\t%llu\n", j->flushed_seq_ondisk);
+ prt_printf(out, "watermark:\t\t%s\n", bch2_watermarks[j->watermark]);
+ prt_printf(out, "each entry reserved:\t%u\n", j->entry_u64s_reserved);
prt_printf(out, "nr flush writes:\t%llu\n", j->nr_flush_writes);
- prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
- prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
+ prt_printf(out, "nr noflush writes:\t%llu\n", j->nr_noflush_writes);
+ prt_printf(out, "average write size:\t");
+ prt_human_readable_u64(out, nr_writes ? div64_u64(j->entry_bytes_written, nr_writes) : 0);
+ prt_newline(out);
+ prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim);
prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim);
prt_printf(out, "reclaim kicked:\t\t%u\n", j->reclaim_kicked);
- prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
+ prt_printf(out, "reclaim runs in:\t%u ms\n", time_after(j->next_reclaim, now)
? jiffies_to_msecs(j->next_reclaim - jiffies) : 0);
- prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
- prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
+ prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors);
+ prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]);
prt_printf(out, "current entry:\t\t");
switch (s.cur_entry_offset) {
@@ -1304,35 +1398,9 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
}
prt_newline(out);
-
- for (seq = journal_cur_seq(j);
- seq >= journal_last_unwritten_seq(j);
- --seq) {
- i = seq & JOURNAL_BUF_MASK;
-
- prt_printf(out, "unwritten entry:");
- prt_tab(out);
- prt_printf(out, "%llu", seq);
- prt_newline(out);
- printbuf_indent_add(out, 2);
-
- prt_printf(out, "refcount:");
- prt_tab(out);
- prt_printf(out, "%u", journal_state_count(s, i));
- prt_newline(out);
-
- prt_printf(out, "sectors:");
- prt_tab(out);
- prt_printf(out, "%u", j->buf[i].sectors);
- prt_newline(out);
-
- prt_printf(out, "expires");
- prt_tab(out);
- prt_printf(out, "%li jiffies", j->buf[i].expires - jiffies);
- prt_newline(out);
-
- printbuf_indent_sub(out, 2);
- }
+ prt_printf(out, "unwritten entries:");
+ prt_newline(out);
+ bch2_journal_bufs_to_text(out, j);
prt_printf(out,
"replay done:\t\t%i\n",
@@ -1352,8 +1420,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
j->space[journal_space_total].next_entry,
j->space[journal_space_total].total);
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_journal]) {
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
if (!test_bit(ca->dev_idx, c->rw_devs[BCH_DATA_journal].d))
@@ -1362,7 +1429,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
if (!ja->nr)
continue;
- prt_printf(out, "dev %u:\n", i);
+ prt_printf(out, "dev %u:\n", ca->dev_idx);
prt_printf(out, "\tnr\t\t%u\n", ja->nr);
prt_printf(out, "\tbucket size\t%u\n", ca->mi.bucket_size);
prt_printf(out, "\tavailable\t%u:%u\n", bch2_journal_dev_buckets_available(j, ja, journal_space_discarded), ja->sectors_free);
diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h
index 2f768e11aec9..4544ce24bb8a 100644
--- a/fs/bcachefs/journal.h
+++ b/fs/bcachefs/journal.h
@@ -119,7 +119,6 @@ static inline void journal_wake(struct journal *j)
{
wake_up(&j->wait);
closure_wake_up(&j->async_wait);
- closure_wake_up(&j->preres_wait);
}
static inline struct journal_buf *journal_cur_buf(struct journal *j)
@@ -239,8 +238,6 @@ bch2_journal_add_entry(struct journal *j, struct journal_res *res,
static inline bool journal_entry_empty(struct jset *j)
{
- struct jset_entry *i;
-
if (j->seq != j->last_seq)
return false;
@@ -426,6 +423,7 @@ static inline void bch2_journal_set_replay_done(struct journal *j)
void bch2_journal_unblock(struct journal *);
void bch2_journal_block(struct journal *);
+struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, u64 max_seq);
void __bch2_journal_debug_to_text(struct printbuf *, struct journal *);
void bch2_journal_debug_to_text(struct printbuf *, struct journal *);
diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c
index 3eb6c3f62a81..47805193f18c 100644
--- a/fs/bcachefs/journal_io.c
+++ b/fs/bcachefs/journal_io.c
@@ -4,6 +4,7 @@
#include "alloc_foreground.h"
#include "btree_io.h"
#include "btree_update_interior.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "checksum.h"
#include "disk_groups.h"
@@ -26,11 +27,15 @@ static struct nonce journal_nonce(const struct jset *jset)
}};
}
-static bool jset_csum_good(struct bch_fs *c, struct jset *j)
+static bool jset_csum_good(struct bch_fs *c, struct jset *j, struct bch_csum *csum)
{
- return bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j)) &&
- !bch2_crc_cmp(j->csum,
- csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j));
+ if (!bch2_checksum_type_valid(c, JSET_CSUM_TYPE(j))) {
+ *csum = (struct bch_csum) {};
+ return false;
+ }
+
+ *csum = csum_vstruct(c, JSET_CSUM_TYPE(j), journal_nonce(j), j);
+ return !bch2_crc_cmp(j->csum, *csum);
}
static inline u32 journal_entry_radix_idx(struct bch_fs *c, u64 seq)
@@ -678,17 +683,12 @@ static void journal_entry_dev_usage_to_text(struct printbuf *out, struct bch_fs
prt_printf(out, "dev=%u", le32_to_cpu(u->dev));
for (i = 0; i < nr_types; i++) {
- if (i < BCH_DATA_NR)
- prt_printf(out, " %s", bch2_data_types[i]);
- else
- prt_printf(out, " (unknown data type %u)", i);
+ bch2_prt_data_type(out, i);
prt_printf(out, ": buckets=%llu sectors=%llu fragmented=%llu",
le64_to_cpu(u->d[i].buckets),
le64_to_cpu(u->d[i].sectors),
le64_to_cpu(u->d[i].fragmented));
}
-
- prt_printf(out, " buckets_ec: %llu", le64_to_cpu(u->buckets_ec));
}
static int journal_entry_log_validate(struct bch_fs *c,
@@ -725,6 +725,22 @@ static void journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs
journal_entry_btree_keys_to_text(out, c, entry);
}
+static int journal_entry_write_buffer_keys_validate(struct bch_fs *c,
+ struct jset *jset,
+ struct jset_entry *entry,
+ unsigned version, int big_endian,
+ enum bkey_invalid_flags flags)
+{
+ return journal_entry_btree_keys_validate(c, jset, entry,
+ version, big_endian, READ);
+}
+
+static void journal_entry_write_buffer_keys_to_text(struct printbuf *out, struct bch_fs *c,
+ struct jset_entry *entry)
+{
+ journal_entry_btree_keys_to_text(out, c, entry);
+}
+
struct jset_entry_ops {
int (*validate)(struct bch_fs *, struct jset *,
struct jset_entry *, unsigned, int,
@@ -768,7 +784,6 @@ void bch2_journal_entry_to_text(struct printbuf *out, struct bch_fs *c,
static int jset_validate_entries(struct bch_fs *c, struct jset *jset,
enum bkey_invalid_flags flags)
{
- struct jset_entry *entry;
unsigned version = le32_to_cpu(jset->version);
int ret = 0;
@@ -920,6 +935,7 @@ static int journal_read_bucket(struct bch_dev *ca,
u64 offset = bucket_to_sector(ca, ja->buckets[bucket]),
end = offset + ca->mi.bucket_size;
bool saw_bad = false, csum_good;
+ struct printbuf err = PRINTBUF;
int ret = 0;
pr_debug("reading %u", bucket);
@@ -952,7 +968,7 @@ reread:
* found on a different device, and missing or
* no journal entries will be handled later
*/
- return 0;
+ goto out;
}
j = buf->data;
@@ -969,12 +985,12 @@ reread:
ret = journal_read_buf_realloc(buf,
vstruct_bytes(j));
if (ret)
- return ret;
+ goto err;
}
goto reread;
case JOURNAL_ENTRY_NONE:
if (!saw_bad)
- return 0;
+ goto out;
/*
* On checksum error we don't really trust the size
* field of the journal entry we read, so try reading
@@ -983,7 +999,7 @@ reread:
sectors = block_sectors(c);
goto next_block;
default:
- return ret;
+ goto err;
}
/*
@@ -993,20 +1009,28 @@ reread:
* bucket:
*/
if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket])
- return 0;
+ goto out;
ja->bucket_seq[bucket] = le64_to_cpu(j->seq);
- csum_good = jset_csum_good(c, j);
+ enum bch_csum_type csum_type = JSET_CSUM_TYPE(j);
+ struct bch_csum csum;
+ csum_good = jset_csum_good(c, j, &csum);
+
if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum,
- "journal checksum error"))
+ "%s",
+ (printbuf_reset(&err),
+ prt_str(&err, "journal "),
+ bch2_csum_err_msg(&err, csum_type, j->csum, csum),
+ err.buf)))
saw_bad = true;
ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j),
j->encrypted_start,
vstruct_end(j) - (void *) j->encrypted_start);
bch2_fs_fatal_err_on(ret, c,
- "error decrypting journal entry: %i", ret);
+ "error decrypting journal entry: %s",
+ bch2_err_str(ret));
mutex_lock(&jlist->lock);
ret = journal_entry_add(c, ca, (struct journal_ptr) {
@@ -1025,7 +1049,7 @@ reread:
case JOURNAL_ENTRY_ADD_OUT_OF_RANGE:
break;
default:
- return ret;
+ goto err;
}
next_block:
pr_debug("next");
@@ -1034,7 +1058,11 @@ next_block:
j = ((void *) j) + (sectors << 9);
}
- return 0;
+out:
+ ret = 0;
+err:
+ printbuf_exit(&err);
+ return ret;
}
static CLOSURE_CALLBACK(bch2_journal_read_device)
@@ -1156,8 +1184,6 @@ int bch2_journal_read(struct bch_fs *c,
struct journal_list jlist;
struct journal_replay *i, **_i, *prev = NULL;
struct genradix_iter radix_iter;
- struct bch_dev *ca;
- unsigned iter;
struct printbuf buf = PRINTBUF;
bool degraded = false, last_write_torn = false;
u64 seq;
@@ -1168,7 +1194,7 @@ int bch2_journal_read(struct bch_fs *c,
jlist.last_seq = 0;
jlist.ret = 0;
- for_each_member_device(ca, c, iter) {
+ for_each_member_device(c, ca) {
if (!c->opts.fsck &&
!(bch2_dev_has_data(c, ca) & (1 << BCH_DATA_journal)))
continue;
@@ -1334,7 +1360,7 @@ int bch2_journal_read(struct bch_fs *c,
continue;
for (ptr = 0; ptr < i->nr_ptrs; ptr++) {
- ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
+ struct bch_dev *ca = bch_dev_bkey_exists(c, i->ptrs[ptr].dev);
if (!i->ptrs[ptr].csum_good)
bch_err_dev_offset(ca, i->ptrs[ptr].sector,
@@ -1452,6 +1478,8 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w)
c->opts.foreground_target;
unsigned i, replicas = 0, replicas_want =
READ_ONCE(c->opts.metadata_replicas);
+ unsigned replicas_need = min_t(unsigned, replicas_want,
+ READ_ONCE(c->opts.metadata_replicas_required));
rcu_read_lock();
retry:
@@ -1500,11 +1528,13 @@ done:
BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX);
- return replicas >= c->opts.metadata_replicas_required ? 0 : -EROFS;
+ return replicas >= replicas_need ? 0 : -EROFS;
}
static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
{
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+
/* we aren't holding j->lock: */
unsigned new_size = READ_ONCE(j->buf_size_want);
void *new_buf;
@@ -1512,6 +1542,11 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf)
if (buf->buf_size >= new_size)
return;
+ size_t btree_write_buffer_size = new_size / 64;
+
+ if (bch2_btree_write_buffer_resize(c, btree_write_buffer_size))
+ return;
+
new_buf = kvpmalloc(new_size, GFP_NOFS|__GFP_NOWARN);
if (!new_buf)
return;
@@ -1604,6 +1639,9 @@ static CLOSURE_CALLBACK(journal_write_done)
bch2_journal_reclaim_fast(j);
bch2_journal_space_available(j);
+ track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight],
+ &j->max_in_flight_start, false);
+
closure_wake_up(&w->wait);
journal_wake(j);
@@ -1656,7 +1694,6 @@ static CLOSURE_CALLBACK(do_journal_write)
struct bch_fs *c = container_of(j, struct bch_fs, journal);
struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
- struct bch_extent_ptr *ptr;
struct bio *bio;
unsigned sectors = vstruct_sectors(w->data, c->block_bits);
@@ -1700,11 +1737,13 @@ static CLOSURE_CALLBACK(do_journal_write)
static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct jset_entry *start, *end, *i, *next, *prev = NULL;
+ struct jset_entry *start, *end;
struct jset *jset = w->data;
+ struct journal_keys_to_wb wb = { NULL };
unsigned sectors, bytes, u64s;
- bool validate_before_checksum = false;
unsigned long btree_roots_have = 0;
+ bool validate_before_checksum = false;
+ u64 seq = le64_to_cpu(jset->seq);
int ret;
/*
@@ -1715,7 +1754,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
* If we wanted to be really fancy here, we could sort all the keys in
* the jset and drop keys that were overwritten - probably not worth it:
*/
- vstruct_for_each_safe(jset, i, next) {
+ vstruct_for_each(jset, i) {
unsigned u64s = le16_to_cpu(i->u64s);
/* Empty entry: */
@@ -1732,40 +1771,40 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
* to c->btree_roots we have to get any missing btree roots and
* add them to this journal entry:
*/
- if (i->type == BCH_JSET_ENTRY_btree_root) {
+ switch (i->type) {
+ case BCH_JSET_ENTRY_btree_root:
bch2_journal_entry_to_btree_root(c, i);
__set_bit(i->btree_id, &btree_roots_have);
+ break;
+ case BCH_JSET_ENTRY_write_buffer_keys:
+ EBUG_ON(!w->need_flush_to_write_buffer);
+
+ if (!wb.wb)
+ bch2_journal_keys_to_write_buffer_start(c, &wb, seq);
+
+ struct bkey_i *k;
+ jset_entry_for_each_key(i, k) {
+ ret = bch2_journal_key_to_wb(c, &wb, i->btree_id, k);
+ if (ret) {
+ bch2_fs_fatal_error(c, "-ENOMEM flushing journal keys to btree write buffer");
+ bch2_journal_keys_to_write_buffer_end(c, &wb);
+ return ret;
+ }
+ }
+ i->type = BCH_JSET_ENTRY_btree_keys;
+ break;
}
-
- /* Can we merge with previous entry? */
- if (prev &&
- i->btree_id == prev->btree_id &&
- i->level == prev->level &&
- i->type == prev->type &&
- i->type == BCH_JSET_ENTRY_btree_keys &&
- le16_to_cpu(prev->u64s) + u64s <= U16_MAX) {
- memmove_u64s_down(vstruct_next(prev),
- i->_data,
- u64s);
- le16_add_cpu(&prev->u64s, u64s);
- continue;
- }
-
- /* Couldn't merge, move i into new position (after prev): */
- prev = prev ? vstruct_next(prev) : jset->start;
- if (i != prev)
- memmove_u64s_down(prev, i, jset_u64s(u64s));
}
- prev = prev ? vstruct_next(prev) : jset->start;
- jset->u64s = cpu_to_le32((u64 *) prev - jset->_data);
+ if (wb.wb)
+ bch2_journal_keys_to_write_buffer_end(c, &wb);
+ w->need_flush_to_write_buffer = false;
start = end = vstruct_last(jset);
end = bch2_btree_roots_to_journal_entries(c, end, btree_roots_have);
- bch2_journal_super_entries_add_common(c, &end,
- le64_to_cpu(jset->seq));
+ bch2_journal_super_entries_add_common(c, &end, seq);
u64s = (u64 *) end - (u64 *) start;
BUG_ON(u64s > j->entry_u64s_reserved);
@@ -1788,7 +1827,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w)
SET_JSET_CSUM_TYPE(jset, bch2_meta_checksum_type(c));
if (!JSET_NO_FLUSH(jset) && journal_entry_empty(jset))
- j->last_empty_seq = le64_to_cpu(jset->seq);
+ j->last_empty_seq = seq;
if (bch2_csum_type_is_encryption(JSET_CSUM_TYPE(jset)))
validate_before_checksum = true;
@@ -1847,7 +1886,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf *
(!w->must_flush &&
(jiffies - j->last_flush_write) < msecs_to_jiffies(c->opts.journal_flush_delay) &&
test_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags))) {
- w->noflush = true;
+ w->noflush = true;
SET_JSET_NO_FLUSH(w->data, true);
w->data->last_seq = 0;
w->last_seq = 0;
@@ -1866,12 +1905,11 @@ CLOSURE_CALLBACK(bch2_journal_write)
{
closure_type(j, struct journal, io);
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
struct journal_buf *w = journal_last_unwritten_buf(j);
struct bch_replicas_padded replicas;
struct bio *bio;
struct printbuf journal_debug_buf = PRINTBUF;
- unsigned i, nr_rw_members = 0;
+ unsigned nr_rw_members = 0;
int ret;
BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb));
@@ -1884,12 +1922,16 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (ret)
goto err;
+ mutex_lock(&j->buf_lock);
journal_buf_realloc(j, w);
ret = bch2_journal_write_prep(j, w);
+ mutex_unlock(&j->buf_lock);
if (ret)
goto err;
+ j->entry_bytes_written += vstruct_bytes(w->data);
+
while (1) {
spin_lock(&j->lock);
ret = journal_write_alloc(j, w);
@@ -1927,7 +1969,7 @@ CLOSURE_CALLBACK(bch2_journal_write)
if (c->opts.nochanges)
goto no_io;
- for_each_rw_member(ca, c, i)
+ for_each_rw_member(c, ca)
nr_rw_members++;
if (nr_rw_members > 1)
@@ -1944,11 +1986,12 @@ CLOSURE_CALLBACK(bch2_journal_write)
goto err;
if (!JSET_NO_FLUSH(w->data) && w->separate_flush) {
- for_each_rw_member(ca, c, i) {
+ for_each_rw_member(c, ca) {
percpu_ref_get(&ca->io_ref);
bio = ca->journal.bio;
- bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
+ bio_reset(bio, ca->disk_sb.bdev,
+ REQ_OP_WRITE|REQ_PREFLUSH);
bio->bi_end_io = journal_write_endio;
bio->bi_private = ca;
closure_bio_submit(bio, cl);
diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c
index ec712104addb..c33dca641575 100644
--- a/fs/bcachefs/journal_reclaim.c
+++ b/fs/bcachefs/journal_reclaim.c
@@ -3,6 +3,7 @@
#include "bcachefs.h"
#include "btree_key_cache.h"
#include "btree_update.h"
+#include "btree_write_buffer.h"
#include "buckets.h"
#include "errcode.h"
#include "error.h"
@@ -50,17 +51,24 @@ unsigned bch2_journal_dev_buckets_available(struct journal *j,
return available;
}
-static inline void journal_set_watermark(struct journal *j, bool low_on_space)
+void bch2_journal_set_watermark(struct journal *j)
{
- unsigned watermark = BCH_WATERMARK_stripe;
-
- if (low_on_space)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
- if (fifo_free(&j->pin) < j->pin.size / 4)
- watermark = max_t(unsigned, watermark, BCH_WATERMARK_reclaim);
-
- if (watermark == j->watermark)
- return;
+ struct bch_fs *c = container_of(j, struct bch_fs, journal);
+ bool low_on_space = j->space[journal_space_clean].total * 4 <=
+ j->space[journal_space_total].total;
+ bool low_on_pin = fifo_free(&j->pin) < j->pin.size / 4;
+ bool low_on_wb = bch2_btree_write_buffer_must_wait(c);
+ unsigned watermark = low_on_space || low_on_pin || low_on_wb
+ ? BCH_WATERMARK_reclaim
+ : BCH_WATERMARK_stripe;
+
+ if (track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_space],
+ &j->low_on_space_start, low_on_space) ||
+ track_event_change(&c->times[BCH_TIME_blocked_journal_low_on_pin],
+ &j->low_on_pin_start, low_on_pin) ||
+ track_event_change(&c->times[BCH_TIME_blocked_write_buffer_full],
+ &j->write_buffer_full_start, low_on_wb))
+ trace_and_count(c, journal_full, c);
swap(watermark, j->watermark);
if (watermark > j->watermark)
@@ -128,15 +136,13 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
enum journal_space_from from)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- unsigned i, pos, nr_devs = 0;
+ unsigned pos, nr_devs = 0;
struct journal_space space, dev_space[BCH_SB_MEMBERS_MAX];
BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space));
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_journal]) {
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
if (!ca->journal.nr)
continue;
@@ -165,19 +171,17 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne
void bch2_journal_space_available(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
unsigned clean, clean_ondisk, total;
unsigned max_entry_size = min(j->buf[0].buf_size >> 9,
j->buf[1].buf_size >> 9);
- unsigned i, nr_online = 0, nr_devs_want;
+ unsigned nr_online = 0, nr_devs_want;
bool can_discard = false;
int ret = 0;
lockdep_assert_held(&j->lock);
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i,
- &c->rw_devs[BCH_DATA_journal]) {
+ for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) {
struct journal_device *ja = &ca->journal;
if (!ja->nr)
@@ -201,14 +205,14 @@ void bch2_journal_space_available(struct journal *j)
j->can_discard = can_discard;
- if (nr_online < c->opts.metadata_replicas_required) {
+ if (nr_online < metadata_replicas_required(c)) {
ret = JOURNAL_ERR_insufficient_devices;
goto out;
}
nr_devs_want = min_t(unsigned, nr_online, c->opts.metadata_replicas);
- for (i = 0; i < journal_space_nr; i++)
+ for (unsigned i = 0; i < journal_space_nr; i++)
j->space[i] = __journal_space_available(j, nr_devs_want, i);
clean_ondisk = j->space[journal_space_clean_ondisk].total;
@@ -226,7 +230,7 @@ void bch2_journal_space_available(struct journal *j)
else
clear_bit(JOURNAL_MAY_SKIP_FLUSH, &j->flags);
- journal_set_watermark(j, clean * 4 <= total);
+ bch2_journal_set_watermark(j);
out:
j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0;
j->cur_entry_error = ret;
@@ -255,12 +259,10 @@ static bool should_discard_bucket(struct journal *j, struct journal_device *ja)
void bch2_journal_do_discards(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
- unsigned iter;
mutex_lock(&j->discard_lock);
- for_each_rw_member(ca, c, iter) {
+ for_each_rw_member(c, ca) {
struct journal_device *ja = &ca->journal;
while (should_discard_bucket(j, ja)) {
@@ -299,6 +301,7 @@ void bch2_journal_reclaim_fast(struct journal *j)
* all btree nodes got written out
*/
while (!fifo_empty(&j->pin) &&
+ j->pin.front <= j->seq_ondisk &&
!atomic_read(&fifo_peek_front(&j->pin).count)) {
j->pin.front++;
popped = true;
@@ -367,15 +370,36 @@ static enum journal_pin_type journal_pin_type(journal_pin_flush_fn fn)
return JOURNAL_PIN_other;
}
-void bch2_journal_pin_set(struct journal *j, u64 seq,
+static inline void bch2_journal_pin_set_locked(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
- journal_pin_flush_fn flush_fn)
+ journal_pin_flush_fn flush_fn,
+ enum journal_pin_type type)
+{
+ struct journal_entry_pin_list *pin_list = journal_seq_pin(j, seq);
+
+ /*
+ * flush_fn is how we identify journal pins in debugfs, so must always
+ * exist, even if it doesn't do anything:
+ */
+ BUG_ON(!flush_fn);
+
+ atomic_inc(&pin_list->count);
+ pin->seq = seq;
+ pin->flush = flush_fn;
+ list_add(&pin->list, &pin_list->list[type]);
+}
+
+void bch2_journal_pin_copy(struct journal *j,
+ struct journal_entry_pin *dst,
+ struct journal_entry_pin *src,
+ journal_pin_flush_fn flush_fn)
{
- struct journal_entry_pin_list *pin_list;
bool reclaim;
spin_lock(&j->lock);
+ u64 seq = READ_ONCE(src->seq);
+
if (seq < journal_last_seq(j)) {
/*
* bch2_journal_pin_copy() raced with bch2_journal_pin_drop() on
@@ -387,18 +411,34 @@ void bch2_journal_pin_set(struct journal *j, u64 seq,
return;
}
- pin_list = journal_seq_pin(j, seq);
+ reclaim = __journal_pin_drop(j, dst);
- reclaim = __journal_pin_drop(j, pin);
+ bch2_journal_pin_set_locked(j, seq, dst, flush_fn, journal_pin_type(flush_fn));
- atomic_inc(&pin_list->count);
- pin->seq = seq;
- pin->flush = flush_fn;
+ if (reclaim)
+ bch2_journal_reclaim_fast(j);
+ spin_unlock(&j->lock);
- if (flush_fn)
- list_add(&pin->list, &pin_list->list[journal_pin_type(flush_fn)]);
- else
- list_add(&pin->list, &pin_list->flushed);
+ /*
+ * If the journal is currently full, we might want to call flush_fn
+ * immediately:
+ */
+ journal_wake(j);
+}
+
+void bch2_journal_pin_set(struct journal *j, u64 seq,
+ struct journal_entry_pin *pin,
+ journal_pin_flush_fn flush_fn)
+{
+ bool reclaim;
+
+ spin_lock(&j->lock);
+
+ BUG_ON(seq < journal_last_seq(j));
+
+ reclaim = __journal_pin_drop(j, pin);
+
+ bch2_journal_pin_set_locked(j, seq, pin, flush_fn, journal_pin_type(flush_fn));
if (reclaim)
bch2_journal_reclaim_fast(j);
@@ -537,13 +577,11 @@ static size_t journal_flush_pins(struct journal *j,
static u64 journal_seq_to_flush(struct journal *j)
{
struct bch_fs *c = container_of(j, struct bch_fs, journal);
- struct bch_dev *ca;
u64 seq_to_flush = 0;
- unsigned iter;
spin_lock(&j->lock);
- for_each_rw_member(ca, c, iter) {
+ for_each_rw_member(c, ca) {
struct journal_device *ja = &ca->journal;
unsigned nr_buckets, bucket_to_flush;
@@ -747,10 +785,9 @@ int bch2_journal_reclaim_start(struct journal *j)
p = kthread_create(bch2_journal_reclaim_thread, j,
"bch-reclaim/%s", c->name);
ret = PTR_ERR_OR_ZERO(p);
- if (ret) {
- bch_err_msg(c, ret, "creating journal reclaim thread");
+ bch_err_msg(c, ret, "creating journal reclaim thread");
+ if (ret)
return ret;
- }
get_task_struct(p);
j->reclaim_thread = p;
@@ -796,6 +833,7 @@ static int journal_flush_done(struct journal *j, u64 seq_to_flush,
bool bch2_journal_flush_pins(struct journal *j, u64 seq_to_flush)
{
+ /* time_stats this */
bool did_work = false;
if (!test_bit(JOURNAL_STARTED, &j->flags))
@@ -854,9 +892,11 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx)
journal_seq_pin(j, seq)->devs);
seq++;
- spin_unlock(&j->lock);
- ret = bch2_mark_replicas(c, &replicas.e);
- spin_lock(&j->lock);
+ if (replicas.e.nr_devs) {
+ spin_unlock(&j->lock);
+ ret = bch2_mark_replicas(c, &replicas.e);
+ spin_lock(&j->lock);
+ }
}
spin_unlock(&j->lock);
err:
diff --git a/fs/bcachefs/journal_reclaim.h b/fs/bcachefs/journal_reclaim.h
index 494d1a6eddb0..ec84c3345281 100644
--- a/fs/bcachefs/journal_reclaim.h
+++ b/fs/bcachefs/journal_reclaim.h
@@ -16,6 +16,7 @@ static inline void journal_reclaim_kick(struct journal *j)
unsigned bch2_journal_dev_buckets_available(struct journal *,
struct journal_device *,
enum journal_space_from);
+void bch2_journal_set_watermark(struct journal *);
void bch2_journal_space_available(struct journal *);
static inline bool journal_pin_active(struct journal_entry_pin *pin)
@@ -47,17 +48,10 @@ static inline void bch2_journal_pin_add(struct journal *j, u64 seq,
bch2_journal_pin_set(j, seq, pin, flush_fn);
}
-static inline void bch2_journal_pin_copy(struct journal *j,
- struct journal_entry_pin *dst,
- struct journal_entry_pin *src,
- journal_pin_flush_fn flush_fn)
-{
- /* Guard against racing with journal_pin_drop(src): */
- u64 seq = READ_ONCE(src->seq);
-
- if (seq)
- bch2_journal_pin_add(j, seq, dst, flush_fn);
-}
+void bch2_journal_pin_copy(struct journal *,
+ struct journal_entry_pin *,
+ struct journal_entry_pin *,
+ journal_pin_flush_fn);
static inline void bch2_journal_pin_update(struct journal *j, u64 seq,
struct journal_entry_pin *pin,
diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c
index f9d9aa95bf3a..0200e299cfbb 100644
--- a/fs/bcachefs/journal_seq_blacklist.c
+++ b/fs/bcachefs/journal_seq_blacklist.c
@@ -267,7 +267,7 @@ retry:
while (!(ret = PTR_ERR_OR_ZERO(b)) &&
b &&
- !test_bit(BCH_FS_STOPPING, &c->flags))
+ !test_bit(BCH_FS_stopping, &c->flags))
b = bch2_btree_iter_next_node(&iter);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h
index a756b69582e3..38817c7a0851 100644
--- a/fs/bcachefs/journal_types.h
+++ b/fs/bcachefs/journal_types.h
@@ -36,6 +36,7 @@ struct journal_buf {
bool noflush; /* write has already been kicked off, and was noflush */
bool must_flush; /* something wants a flush */
bool separate_flush;
+ bool need_flush_to_write_buffer;
};
/*
@@ -182,6 +183,12 @@ struct journal {
darray_u64 early_journal_entries;
/*
+ * Protects journal_buf->data, when accessing without a jorunal
+ * reservation: for synchronization between the btree write buffer code
+ * and the journal write path:
+ */
+ struct mutex buf_lock;
+ /*
* Two journal entries -- one is currently open for new entries, the
* other is possibly being written out.
*/
@@ -195,7 +202,6 @@ struct journal {
/* Used when waiting because the journal was full */
wait_queue_head_t wait;
struct closure_waitlist async_wait;
- struct closure_waitlist preres_wait;
struct closure io;
struct delayed_work write_work;
@@ -262,15 +268,19 @@ struct journal {
unsigned long last_flush_write;
- u64 res_get_blocked_start;
u64 write_start_time;
u64 nr_flush_writes;
u64 nr_noflush_writes;
+ u64 entry_bytes_written;
+
+ u64 low_on_space_start;
+ u64 low_on_pin_start;
+ u64 max_in_flight_start;
+ u64 write_buffer_full_start;
struct bch2_time_stats *flush_write_time;
struct bch2_time_stats *noflush_write_time;
- struct bch2_time_stats *blocked_time;
struct bch2_time_stats *flush_seq_time;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c
index 5699cd4873c8..1b828bddd11b 100644
--- a/fs/bcachefs/keylist.c
+++ b/fs/bcachefs/keylist.c
@@ -43,8 +43,6 @@ void bch2_keylist_pop_front(struct keylist *l)
#ifdef CONFIG_BCACHEFS_DEBUG
void bch2_verify_keylist_sorted(struct keylist *l)
{
- struct bkey_i *k;
-
for_each_keylist_key(l, k)
BUG_ON(bkey_next(k) != l->top &&
bpos_ge(k->k.p, bkey_next(k)->k.p));
diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h
index fe759c7031e0..e687e0e9aede 100644
--- a/fs/bcachefs/keylist.h
+++ b/fs/bcachefs/keylist.h
@@ -50,18 +50,16 @@ static inline struct bkey_i *bch2_keylist_front(struct keylist *l)
}
#define for_each_keylist_key(_keylist, _k) \
- for (_k = (_keylist)->keys; \
+ for (struct bkey_i *_k = (_keylist)->keys; \
_k != (_keylist)->top; \
_k = bkey_next(_k))
static inline u64 keylist_sectors(struct keylist *keys)
{
- struct bkey_i *k;
u64 ret = 0;
for_each_keylist_key(keys, k)
ret += k->k.size;
-
return ret;
}
diff --git a/fs/bcachefs/logged_ops.c b/fs/bcachefs/logged_ops.c
index 8640f7dee0de..ad598105c587 100644
--- a/fs/bcachefs/logged_ops.c
+++ b/fs/bcachefs/logged_ops.c
@@ -54,16 +54,12 @@ static int resume_logged_op(struct btree_trans *trans, struct btree_iter *iter,
int bch2_resume_logged_ops(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_trans_run(c,
- for_each_btree_key2(trans, iter,
- BTREE_ID_logged_ops, POS_MIN, BTREE_ITER_PREFETCH, k,
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter,
+ BTREE_ID_logged_ops, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
resume_logged_op(trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -85,13 +81,13 @@ static int __bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
int bch2_logged_op_start(struct btree_trans *trans, struct bkey_i *k)
{
- return commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ return commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_logged_op_start(trans, k));
}
void bch2_logged_op_finish(struct btree_trans *trans, struct bkey_i *k)
{
- int ret = commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ int ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_btree_delete(trans, BTREE_ID_logged_ops, k->k.p, 0));
/*
* This needs to be a fatal error because we've left an unfinished
diff --git a/fs/bcachefs/logged_ops_format.h b/fs/bcachefs/logged_ops_format.h
new file mode 100644
index 000000000000..6a4bf7129dba
--- /dev/null
+++ b/fs/bcachefs/logged_ops_format.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_LOGGED_OPS_FORMAT_H
+#define _BCACHEFS_LOGGED_OPS_FORMAT_H
+
+struct bch_logged_op_truncate {
+ struct bch_val v;
+ __le32 subvol;
+ __le32 pad;
+ __le64 inum;
+ __le64 new_i_size;
+};
+
+enum logged_op_finsert_state {
+ LOGGED_OP_FINSERT_start,
+ LOGGED_OP_FINSERT_shift_extents,
+ LOGGED_OP_FINSERT_finish,
+};
+
+struct bch_logged_op_finsert {
+ struct bch_val v;
+ __u8 state;
+ __u8 pad[3];
+ __le32 subvol;
+ __le64 inum;
+ __le64 dst_offset;
+ __le64 src_offset;
+ __le64 pos;
+};
+
+#endif /* _BCACHEFS_LOGGED_OPS_FORMAT_H */
diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c
index a5cc0ed195d6..7a4ca5a28b3e 100644
--- a/fs/bcachefs/lru.c
+++ b/fs/bcachefs/lru.c
@@ -147,18 +147,13 @@ fsck_err:
int bch2_check_lrus(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
struct bpos last_flushed_pos = POS_MIN;
- int ret = 0;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_lru, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
bch2_check_lru_key(trans, &iter, k, &last_flushed_pos)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/mean_and_variance.c b/fs/bcachefs/mean_and_variance.c
index 1f0801e2e565..bf0ef668fd38 100644
--- a/fs/bcachefs/mean_and_variance.c
+++ b/fs/bcachefs/mean_and_variance.c
@@ -62,6 +62,7 @@ EXPORT_SYMBOL_GPL(u128_div);
/**
* mean_and_variance_get_mean() - get mean from @s
+ * @s: mean and variance number of samples and their sums
*/
s64 mean_and_variance_get_mean(struct mean_and_variance s)
{
@@ -71,6 +72,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_mean);
/**
* mean_and_variance_get_variance() - get variance from @s1
+ * @s1: mean and variance number of samples and sums
*
* see linked pdf equation 12.
*/
@@ -89,6 +91,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_variance);
/**
* mean_and_variance_get_stddev() - get standard deviation from @s
+ * @s: mean and variance number of samples and their sums
*/
u32 mean_and_variance_get_stddev(struct mean_and_variance s)
{
@@ -98,8 +101,8 @@ EXPORT_SYMBOL_GPL(mean_and_variance_get_stddev);
/**
* mean_and_variance_weighted_update() - exponentially weighted variant of mean_and_variance_update()
- * @s1: ..
- * @s2: ..
+ * @s: mean and variance number of samples and their sums
+ * @x: new value to include in the &mean_and_variance_weighted
*
* see linked pdf: function derived from equations 140-143 where alpha = 2^w.
* values are stored bitshifted for performance and added precision.
@@ -129,6 +132,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_update);
/**
* mean_and_variance_weighted_get_mean() - get mean from @s
+ * @s: mean and variance number of samples and their sums
*/
s64 mean_and_variance_weighted_get_mean(struct mean_and_variance_weighted s)
{
@@ -138,6 +142,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_mean);
/**
* mean_and_variance_weighted_get_variance() -- get variance from @s
+ * @s: mean and variance number of samples and their sums
*/
u64 mean_and_variance_weighted_get_variance(struct mean_and_variance_weighted s)
{
@@ -148,6 +153,7 @@ EXPORT_SYMBOL_GPL(mean_and_variance_weighted_get_variance);
/**
* mean_and_variance_weighted_get_stddev() - get standard deviation from @s
+ * @s: mean and variance number of samples and their sums
*/
u32 mean_and_variance_weighted_get_stddev(struct mean_and_variance_weighted s)
{
diff --git a/fs/bcachefs/mean_and_variance.h b/fs/bcachefs/mean_and_variance.h
index 647505010b39..64df11ab422b 100644
--- a/fs/bcachefs/mean_and_variance.h
+++ b/fs/bcachefs/mean_and_variance.h
@@ -12,9 +12,12 @@
/*
* u128_u: u128 user mode, because not all architectures support a real int128
* type
+ *
+ * We don't use this version in userspace, because in userspace we link with
+ * Rust and rustc has issues with u128.
*/
-#ifdef __SIZEOF_INT128__
+#if defined(__SIZEOF_INT128__) && defined(__KERNEL__) && !defined(CONFIG_PARISC)
typedef struct {
unsigned __int128 v;
diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c
index e3a51f6d6c9b..5623cee3ef86 100644
--- a/fs/bcachefs/migrate.c
+++ b/fs/bcachefs/migrate.c
@@ -79,8 +79,6 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans,
static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
{
struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter;
- struct bkey_s_c k;
enum btree_id id;
int ret = 0;
@@ -90,7 +88,7 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags)
ret = for_each_btree_key_commit(trans, iter, id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags));
if (ret)
break;
@@ -145,10 +143,9 @@ retry:
continue;
}
- if (ret) {
- bch_err_msg(c, ret, "updating btree node key");
+ bch_err_msg(c, ret, "updating btree node key");
+ if (ret)
break;
- }
next:
bch2_btree_iter_next_node(&iter);
}
diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c
index 54830ee0ed88..bf68ea49447b 100644
--- a/fs/bcachefs/move.c
+++ b/fs/bcachefs/move.c
@@ -6,9 +6,11 @@
#include "backpointers.h"
#include "bkey_buf.h"
#include "btree_gc.h"
+#include "btree_io.h"
#include "btree_update.h"
#include "btree_update_interior.h"
#include "btree_write_buffer.h"
+#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "errcode.h"
@@ -27,12 +29,53 @@
#include <linux/ioprio.h>
#include <linux/kthread.h>
-static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k)
+const char * const bch2_data_ops_strs[] = {
+#define x(t, n, ...) [n] = #t,
+ BCH_DATA_OPS()
+#undef x
+ NULL
+};
+
+static void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ printbuf_tabstop_push(out, 20);
+ prt_str(out, "rewrite ptrs:");
+ prt_tab(out);
+ bch2_prt_u64_base2(out, data_opts->rewrite_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "kill ptrs: ");
+ prt_tab(out);
+ bch2_prt_u64_base2(out, data_opts->kill_ptrs);
+ prt_newline(out);
+
+ prt_str(out, "target: ");
+ prt_tab(out);
+ bch2_target_to_text(out, c, data_opts->target);
+ prt_newline(out);
+
+ prt_str(out, "compression: ");
+ prt_tab(out);
+ bch2_compression_opt_to_text(out, background_compression(*io_opts));
+ prt_newline(out);
+
+ prt_str(out, "extra replicas: ");
+ prt_tab(out);
+ prt_u64(out, data_opts->extra_replicas);
+}
+
+static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
{
if (trace_move_extent_enabled()) {
struct printbuf buf = PRINTBUF;
bch2_bkey_val_to_text(&buf, c, k);
+ prt_newline(&buf);
+ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts);
trace_move_extent(c, buf.buf);
printbuf_exit(&buf);
}
@@ -63,7 +106,7 @@ struct moving_io {
struct data_update write;
/* Must be last since it is variable size */
- struct bio_vec bi_inline_vecs[0];
+ struct bio_vec bi_inline_vecs[];
};
static void move_free(struct moving_io *io)
@@ -104,6 +147,15 @@ static void move_write(struct moving_io *io)
return;
}
+ if (trace_move_extent_write_enabled()) {
+ struct bch_fs *c = io->write.op.c;
+ struct printbuf buf = PRINTBUF;
+
+ bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k));
+ trace_move_extent_write(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
closure_get(&io->write.ctxt->cl);
atomic_add(io->write_sectors, &io->write.ctxt->write_sectors);
atomic_inc(&io->write.ctxt->write_ios);
@@ -152,7 +204,7 @@ void bch2_move_ctxt_wait_for_io(struct moving_context *ctxt)
atomic_read(&ctxt->write_sectors) != sectors_pending);
}
-static void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
+void bch2_moving_ctxt_flush_all(struct moving_context *ctxt)
{
move_ctxt_wait_event(ctxt, list_empty(&ctxt->reads));
bch2_trans_unlock_long(ctxt->trans);
@@ -211,7 +263,7 @@ void bch2_move_stats_exit(struct bch_move_stats *stats, struct bch_fs *c)
trace_move_data(c, stats);
}
-void bch2_move_stats_init(struct bch_move_stats *stats, char *name)
+void bch2_move_stats_init(struct bch_move_stats *stats, const char *name)
{
memset(stats, 0, sizeof(*stats));
stats->data_type = BCH_DATA_user;
@@ -234,9 +286,10 @@ int bch2_move_extent(struct moving_context *ctxt,
unsigned sectors = k.k->size, pages;
int ret = -ENOMEM;
+ trace_move_extent2(c, k, &io_opts, &data_opts);
+
if (ctxt->stats)
ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos);
- trace_move_extent2(c, k);
bch2_data_update_opts_normalize(k, &data_opts);
@@ -342,7 +395,8 @@ err:
bch2_err_matches(ret, BCH_ERR_transaction_restart))
return ret;
- this_cpu_inc(c->counters[BCH_COUNTER_move_extent_start_fail]);
+ count_event(c, move_extent_start_fail);
+
if (trace_move_extent_start_fail_enabled()) {
struct printbuf buf = PRINTBUF;
@@ -364,13 +418,10 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
int ret = 0;
if (io_opts->cur_inum != extent_k.k->p.inode) {
- struct btree_iter iter;
- struct bkey_s_c k;
-
io_opts->d.nr = 0;
- for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
- BTREE_ITER_ALL_SNAPSHOTS, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_inodes, POS(0, extent_k.k->p.inode),
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
if (k.k->p.offset != extent_k.k->p.inode)
break;
@@ -383,11 +434,8 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
struct snapshot_io_opts_entry e = { .snapshot = k.k->p.snapshot };
bch2_inode_opts_get(&e.io_opts, trans->c, &inode);
- ret = darray_push(&io_opts->d, e);
- if (ret)
- break;
- }
- bch2_trans_iter_exit(trans, &iter);
+ darray_push(&io_opts->d, e);
+ }));
io_opts->cur_inum = extent_k.k->p.inode;
}
@@ -395,12 +443,10 @@ struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans,
if (ret)
return ERR_PTR(ret);
- if (extent_k.k->p.snapshot) {
- struct snapshot_io_opts_entry *i;
+ if (extent_k.k->p.snapshot)
darray_for_each(io_opts->d, i)
if (bch2_snapshot_is_ancestor(c, extent_k.k->p.snapshot, i->snapshot))
return &i->io_opts;
- }
return &io_opts->fs_io_opts;
}
@@ -628,7 +674,7 @@ int bch2_move_data(struct bch_fs *c,
return ret;
}
-int __bch2_evacuate_bucket(struct moving_context *ctxt,
+int bch2_evacuate_bucket(struct moving_context *ctxt,
struct move_bucket_in_flight *bucket_in_flight,
struct bpos bucket, int gen,
struct data_update_opts _data_opts)
@@ -664,21 +710,19 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
bkey_err(k = bch2_btree_iter_peek_slot(&iter)));
bch2_trans_iter_exit(trans, &iter);
- if (ret) {
- bch_err_msg(c, ret, "looking up alloc key");
+ bch_err_msg(c, ret, "looking up alloc key");
+ if (ret)
goto err;
- }
a = bch2_alloc_to_v4(k, &a_convert);
- dirty_sectors = a->dirty_sectors;
+ dirty_sectors = bch2_bucket_sectors_dirty(*a);
bucket_size = bch_dev_bkey_exists(c, bucket.inode)->mi.bucket_size;
fragmentation = a->fragmentation_lru;
- ret = bch2_btree_write_buffer_flush(trans);
- if (ret) {
- bch_err_msg(c, ret, "flushing btree write buffer");
+ ret = bch2_btree_write_buffer_tryflush(trans);
+ bch_err_msg(c, ret, "flushing btree write buffer");
+ if (ret)
goto err;
- }
while (!(ret = bch2_move_ratelimit(ctxt))) {
if (is_kthread && kthread_should_stop())
@@ -697,9 +741,6 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
break;
if (!bp.level) {
- const struct bch_extent_ptr *ptr;
- unsigned i = 0;
-
k = bch2_backpointer_get_key(trans, &iter, bp_pos, bp, 0);
ret = bkey_err(k);
if (bch2_err_matches(ret, BCH_ERR_transaction_restart))
@@ -722,6 +763,7 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
data_opts.target = io_opts.background_target;
data_opts.rewrite_ptrs = 0;
+ unsigned i = 0;
bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
if (ptr->dev == bucket.inode) {
data_opts.rewrite_ptrs |= 1U << i;
@@ -763,6 +805,8 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
if (!b)
goto next;
+ unsigned sectors = btree_ptr_sectors_written(&b->key);
+
ret = bch2_btree_node_rewrite(trans, &iter, b, 0);
bch2_trans_iter_exit(trans, &iter);
@@ -772,11 +816,10 @@ int __bch2_evacuate_bucket(struct moving_context *ctxt,
goto err;
if (ctxt->rate)
- bch2_ratelimit_increment(ctxt->rate,
- c->opts.btree_node_size >> 9);
+ bch2_ratelimit_increment(ctxt->rate, sectors);
if (ctxt->stats) {
- atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_seen);
- atomic64_add(c->opts.btree_node_size >> 9, &ctxt->stats->sectors_moved);
+ atomic64_add(sectors, &ctxt->stats->sectors_seen);
+ atomic64_add(sectors, &ctxt->stats->sectors_moved);
}
}
next:
@@ -789,31 +832,13 @@ err:
return ret;
}
-int bch2_evacuate_bucket(struct bch_fs *c,
- struct bpos bucket, int gen,
- struct data_update_opts data_opts,
- struct bch_ratelimit *rate,
- struct bch_move_stats *stats,
- struct write_point_specifier wp,
- bool wait_on_copygc)
-{
- struct moving_context ctxt;
- int ret;
-
- bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc);
- ret = __bch2_evacuate_bucket(&ctxt, NULL, bucket, gen, data_opts);
- bch2_moving_ctxt_exit(&ctxt);
-
- return ret;
-}
-
typedef bool (*move_btree_pred)(struct bch_fs *, void *,
struct btree *, struct bch_io_opts *,
struct data_update_opts *);
static int bch2_move_btree(struct bch_fs *c,
- enum btree_id start_btree_id, struct bpos start_pos,
- enum btree_id end_btree_id, struct bpos end_pos,
+ struct bbpos start,
+ struct bbpos end,
move_btree_pred pred, void *arg,
struct bch_move_stats *stats)
{
@@ -823,7 +848,7 @@ static int bch2_move_btree(struct bch_fs *c,
struct btree_trans *trans;
struct btree_iter iter;
struct btree *b;
- enum btree_id id;
+ enum btree_id btree;
struct data_update_opts data_opts;
int ret = 0;
@@ -834,15 +859,15 @@ static int bch2_move_btree(struct bch_fs *c,
stats->data_type = BCH_DATA_btree;
- for (id = start_btree_id;
- id <= min_t(unsigned, end_btree_id, btree_id_nr_alive(c) - 1);
- id++) {
- stats->pos = BBPOS(id, POS_MIN);
+ for (btree = start.btree;
+ btree <= min_t(unsigned, end.btree, btree_id_nr_alive(c) - 1);
+ btree ++) {
+ stats->pos = BBPOS(btree, POS_MIN);
- if (!bch2_btree_id_root(c, id)->b)
+ if (!bch2_btree_id_root(c, btree)->b)
continue;
- bch2_trans_node_iter_init(trans, &iter, id, POS_MIN, 0, 0,
+ bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, 0,
BTREE_ITER_PREFETCH);
retry:
ret = 0;
@@ -852,8 +877,8 @@ retry:
if (kthread && kthread_should_stop())
break;
- if ((cmp_int(id, end_btree_id) ?:
- bpos_cmp(b->key.k.p, end_pos)) > 0)
+ if ((cmp_int(btree, end.btree) ?:
+ bpos_cmp(b->key.k.p, end.pos)) > 0)
break;
stats->pos = BBPOS(iter.btree_id, iter.pos);
@@ -910,7 +935,6 @@ static bool migrate_pred(struct bch_fs *c, void *arg,
struct data_update_opts *data_opts)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
- const struct bch_extent_ptr *ptr;
struct bch_ioctl_data *op = arg;
unsigned i = 0;
@@ -990,8 +1014,8 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
int ret;
ret = bch2_move_btree(c,
- 0, POS_MIN,
- BTREE_ID_NR, SPOS_MAX,
+ BBPOS_MIN,
+ BBPOS_MAX,
rewrite_old_nodes_pred, c, stats);
if (!ret) {
mutex_lock(&c->sb_lock);
@@ -1006,79 +1030,109 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats)
return ret;
}
+static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg,
+ struct bkey_s_c k,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ unsigned durability = bch2_bkey_durability(c, k);
+ unsigned replicas = bkey_is_btree_ptr(k.k)
+ ? c->opts.metadata_replicas
+ : io_opts->data_replicas;
+ const union bch_extent_entry *entry;
+ struct extent_ptr_decoded p;
+ unsigned i = 0;
+
+ bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) {
+ unsigned d = bch2_extent_ptr_durability(c, &p);
+
+ if (d && durability - d >= replicas) {
+ data_opts->kill_ptrs |= BIT(i);
+ durability -= d;
+ }
+
+ i++;
+ }
+
+ return data_opts->kill_ptrs != 0;
+}
+
+static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg,
+ struct btree *b,
+ struct bch_io_opts *io_opts,
+ struct data_update_opts *data_opts)
+{
+ return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts);
+}
+
int bch2_data_job(struct bch_fs *c,
struct bch_move_stats *stats,
struct bch_ioctl_data op)
{
+ struct bbpos start = BBPOS(op.start_btree, op.start_pos);
+ struct bbpos end = BBPOS(op.end_btree, op.end_pos);
int ret = 0;
+ if (op.op >= BCH_DATA_OP_NR)
+ return -EINVAL;
+
+ bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]);
+
switch (op.op) {
- case BCH_DATA_OP_REREPLICATE:
- bch2_move_stats_init(stats, "rereplicate");
+ case BCH_DATA_OP_rereplicate:
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, -1);
-
- ret = bch2_move_btree(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ ret = bch2_move_btree(c, start, end,
rereplicate_btree_pred, c, stats) ?: ret;
- ret = bch2_replicas_gc2(c) ?: ret;
-
- ret = bch2_move_data(c,
- (struct bbpos) { op.start_btree, op.start_pos },
- (struct bbpos) { op.end_btree, op.end_pos },
+ ret = bch2_move_data(c, start, end,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
rereplicate_pred, c) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
-
- bch2_move_stats_exit(stats, c);
break;
- case BCH_DATA_OP_MIGRATE:
+ case BCH_DATA_OP_migrate:
if (op.migrate.dev >= c->sb.nr_devices)
return -EINVAL;
- bch2_move_stats_init(stats, "migrate");
stats->data_type = BCH_DATA_journal;
ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev);
-
- ret = bch2_move_btree(c,
- op.start_btree, op.start_pos,
- op.end_btree, op.end_pos,
+ ret = bch2_move_btree(c, start, end,
migrate_btree_pred, &op, stats) ?: ret;
- ret = bch2_replicas_gc2(c) ?: ret;
-
- ret = bch2_move_data(c,
- (struct bbpos) { op.start_btree, op.start_pos },
- (struct bbpos) { op.end_btree, op.end_pos },
+ ret = bch2_move_data(c, start, end,
NULL,
stats,
writepoint_hashed((unsigned long) current),
true,
migrate_pred, &op) ?: ret;
ret = bch2_replicas_gc2(c) ?: ret;
-
- bch2_move_stats_exit(stats, c);
break;
- case BCH_DATA_OP_REWRITE_OLD_NODES:
- bch2_move_stats_init(stats, "rewrite_old_nodes");
+ case BCH_DATA_OP_rewrite_old_nodes:
ret = bch2_scan_old_btree_nodes(c, stats);
- bch2_move_stats_exit(stats, c);
+ break;
+ case BCH_DATA_OP_drop_extra_replicas:
+ ret = bch2_move_btree(c, start, end,
+ drop_extra_replicas_btree_pred, c, stats) ?: ret;
+ ret = bch2_move_data(c, start, end, NULL, stats,
+ writepoint_hashed((unsigned long) current),
+ true,
+ drop_extra_replicas_pred, c) ?: ret;
+ ret = bch2_replicas_gc2(c) ?: ret;
break;
default:
ret = -EINVAL;
}
+ bch2_move_stats_exit(stats, c);
return ret;
}
void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats)
{
- prt_printf(out, "%s: data type=%s pos=",
- stats->name,
- bch2_data_types[stats->data_type]);
+ prt_printf(out, "%s: data type==", stats->name);
+ bch2_prt_data_type(out, stats->data_type);
+ prt_str(out, " pos=");
bch2_bbpos_to_text(out, stats->pos);
prt_newline(out);
printbuf_indent_add(out, 2);
diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h
index 0906aa2d1de2..9baf3093a678 100644
--- a/fs/bcachefs/move.h
+++ b/fs/bcachefs/move.h
@@ -75,12 +75,15 @@ do { \
typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c,
struct bch_io_opts *, struct data_update_opts *);
+extern const char * const bch2_data_ops_strs[];
+
void bch2_moving_ctxt_exit(struct moving_context *);
void bch2_moving_ctxt_init(struct moving_context *, struct bch_fs *,
struct bch_ratelimit *, struct bch_move_stats *,
struct write_point_specifier, bool);
struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *);
void bch2_moving_ctxt_do_pending_writes(struct moving_context *);
+void bch2_moving_ctxt_flush_all(struct moving_context *);
void bch2_move_ctxt_wait_for_io(struct moving_context *);
int bch2_move_ratelimit(struct moving_context *);
@@ -133,23 +136,17 @@ int bch2_move_data(struct bch_fs *,
bool,
move_pred_fn, void *);
-int __bch2_evacuate_bucket(struct moving_context *,
+int bch2_evacuate_bucket(struct moving_context *,
struct move_bucket_in_flight *,
struct bpos, int,
struct data_update_opts);
-int bch2_evacuate_bucket(struct bch_fs *, struct bpos, int,
- struct data_update_opts,
- struct bch_ratelimit *,
- struct bch_move_stats *,
- struct write_point_specifier,
- bool);
int bch2_data_job(struct bch_fs *,
struct bch_move_stats *,
struct bch_ioctl_data);
void bch2_move_stats_to_text(struct printbuf *, struct bch_move_stats *);
void bch2_move_stats_exit(struct bch_move_stats *, struct bch_fs *);
-void bch2_move_stats_init(struct bch_move_stats *, char *);
+void bch2_move_stats_init(struct bch_move_stats *, const char *);
void bch2_fs_moving_ctxts_to_text(struct printbuf *, struct bch_fs *);
diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c
index a84e79f79e5e..69e06a84dad4 100644
--- a/fs/bcachefs/movinggc.c
+++ b/fs/bcachefs/movinggc.c
@@ -91,7 +91,7 @@ static int bch2_bucket_is_movable(struct btree_trans *trans,
a = bch2_alloc_to_v4(k, &_a);
b->k.gen = a->gen;
- b->sectors = a->dirty_sectors;
+ b->sectors = bch2_bucket_sectors_dirty(*a);
ret = data_type_movable(a->data_type) &&
a->fragmentation_lru &&
@@ -145,20 +145,21 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
{
struct btree_trans *trans = ctxt->trans;
struct bch_fs *c = trans->c;
- struct btree_iter iter;
- struct bkey_s_c k;
size_t nr_to_get = max_t(size_t, 16U, buckets_in_flight->nr / 4);
size_t saw = 0, in_flight = 0, not_movable = 0, sectors = 0;
int ret;
move_buckets_wait(ctxt, buckets_in_flight, false);
- ret = bch2_btree_write_buffer_flush(trans);
- if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_flush()",
+ ret = bch2_btree_write_buffer_tryflush(trans);
+ if (bch2_err_matches(ret, EROFS))
+ return ret;
+
+ if (bch2_fs_fatal_err_on(ret, c, "%s: error %s from bch2_btree_write_buffer_tryflush()",
__func__, bch2_err_str(ret)))
return ret;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_lru,
+ ret = for_each_btree_key_upto(trans, iter, BTREE_ID_lru,
lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0),
lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX),
0, k, ({
@@ -167,15 +168,23 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt,
saw++;
- if (!bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p)))
+ ret2 = bch2_bucket_is_movable(trans, &b, lru_pos_time(k.k->p));
+ if (ret2 < 0)
+ goto err;
+
+ if (!ret2)
not_movable++;
else if (bucket_in_flight(buckets_in_flight, b.k))
in_flight++;
else {
- ret2 = darray_push(buckets, b) ?: buckets->nr >= nr_to_get;
- if (ret2 >= 0)
- sectors += b.sectors;
+ ret2 = darray_push(buckets, b);
+ if (ret2)
+ goto err;
+ sectors += b.sectors;
}
+
+ ret2 = buckets->nr >= nr_to_get;
+err:
ret2;
}));
@@ -198,7 +207,6 @@ static int bch2_copygc(struct moving_context *ctxt,
};
move_buckets buckets = { 0 };
struct move_bucket_in_flight *f;
- struct move_bucket *i;
u64 moved = atomic64_read(&ctxt->stats->sectors_moved);
int ret = 0;
@@ -221,7 +229,7 @@ static int bch2_copygc(struct moving_context *ctxt,
break;
}
- ret = __bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
+ ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket,
f->bucket.k.gen, data_opts);
if (ret)
goto err;
@@ -259,19 +267,16 @@ err:
*/
unsigned long bch2_copygc_wait_amount(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned dev_idx;
s64 wait = S64_MAX, fragmented_allowed, fragmented;
- unsigned i;
- for_each_rw_member(ca, c, dev_idx) {
+ for_each_rw_member(c, ca) {
struct bch_dev_usage usage = bch2_dev_usage_read(ca);
fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) *
ca->mi.bucket_size) >> 1);
fragmented = 0;
- for (i = 0; i < BCH_DATA_NR; i++)
+ for (unsigned i = 0; i < BCH_DATA_NR; i++)
if (data_type_movable(i))
fragmented += usage.d[i].fragmented;
@@ -313,9 +318,9 @@ static int bch2_copygc_thread(void *arg)
if (!buckets)
return -ENOMEM;
ret = rhashtable_init(&buckets->table, &bch_move_bucket_params);
+ bch_err_msg(c, ret, "allocating copygc buckets in flight");
if (ret) {
kfree(buckets);
- bch_err_msg(c, ret, "allocating copygc buckets in flight");
return ret;
}
@@ -334,7 +339,8 @@ static int bch2_copygc_thread(void *arg)
if (!c->copy_gc_enabled) {
move_buckets_wait(&ctxt, buckets, true);
- kthread_wait_freezable(c->copy_gc_enabled);
+ kthread_wait_freezable(c->copy_gc_enabled ||
+ kthread_should_stop());
}
if (unlikely(freezing(current))) {
@@ -411,10 +417,9 @@ int bch2_copygc_start(struct bch_fs *c)
t = kthread_create(bch2_copygc_thread, c, "bch-copygc/%s", c->name);
ret = PTR_ERR_OR_ZERO(t);
- if (ret) {
- bch_err_msg(c, ret, "creating copygc thread");
+ bch_err_msg(c, ret, "creating copygc thread");
+ if (ret)
return ret;
- }
get_task_struct(t);
diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c
index 8dd4046cca41..b1ed0b9a20d3 100644
--- a/fs/bcachefs/opts.c
+++ b/fs/bcachefs/opts.c
@@ -52,7 +52,7 @@ const char * const bch2_csum_opts[] = {
NULL
};
-const char * const bch2_compression_types[] = {
+const char * const __bch2_compression_types[] = {
BCH_COMPRESSION_TYPES()
NULL
};
@@ -72,7 +72,7 @@ const char * const bch2_str_hash_opts[] = {
NULL
};
-const char * const bch2_data_types[] = {
+const char * const __bch2_data_types[] = {
BCH_DATA_TYPES()
NULL
};
@@ -279,14 +279,14 @@ int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err)
if (err)
prt_printf(err, "%s: not a multiple of 512",
opt->attr.name);
- return -EINVAL;
+ return -BCH_ERR_opt_parse_error;
}
if ((opt->flags & OPT_MUST_BE_POW_2) && !is_power_of_2(v)) {
if (err)
prt_printf(err, "%s: must be a power of two",
opt->attr.name);
- return -EINVAL;
+ return -BCH_ERR_opt_parse_error;
}
if (opt->fn.validate)
diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h
index 8526f177450a..9a4b7faa3765 100644
--- a/fs/bcachefs/opts.h
+++ b/fs/bcachefs/opts.h
@@ -18,11 +18,11 @@ extern const char * const bch2_sb_compat[];
extern const char * const __bch2_btree_ids[];
extern const char * const bch2_csum_types[];
extern const char * const bch2_csum_opts[];
-extern const char * const bch2_compression_types[];
+extern const char * const __bch2_compression_types[];
extern const char * const bch2_compression_opts[];
extern const char * const bch2_str_hash_types[];
extern const char * const bch2_str_hash_opts[];
-extern const char * const bch2_data_types[];
+extern const char * const __bch2_data_types[];
extern const char * const bch2_member_states[];
extern const char * const bch2_jset_entry_types[];
extern const char * const bch2_fs_usage_types[];
@@ -233,11 +233,6 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, true, \
NULL, "Stash pointer to in memory btree node in btree ptr")\
- x(btree_write_buffer_size, u32, \
- OPT_FS|OPT_MOUNT, \
- OPT_UINT(16, (1U << 20) - 1), \
- BCH2_NO_SB_OPT, 1U << 13, \
- NULL, "Number of btree write buffer entries") \
x(gc_reserve_percent, u8, \
OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \
OPT_UINT(5, 21), \
@@ -394,7 +389,7 @@ enum fsck_err_opts {
BCH2_NO_SB_OPT, BCH_SB_SECTOR, \
"offset", "Sector offset of superblock") \
x(read_only, u8, \
- OPT_FS, \
+ OPT_FS|OPT_MOUNT, \
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, NULL) \
@@ -419,6 +414,11 @@ enum fsck_err_opts {
OPT_BOOL(), \
BCH2_NO_SB_OPT, false, \
NULL, "Allocate the buckets_nouse bitmap") \
+ x(stdio, u64, \
+ 0, \
+ OPT_UINT(0, S64_MAX), \
+ BCH2_NO_SB_OPT, false, \
+ NULL, "Pointer to a struct stdio_redirect") \
x(project, u8, \
OPT_INODE, \
OPT_BOOL(), \
@@ -458,7 +458,13 @@ enum fsck_err_opts {
OPT_UINT(0, BCH_REPLICAS_MAX), \
BCH2_NO_SB_OPT, 1, \
"n", "Data written to this device will be considered\n"\
- "to have already been replicated n times")
+ "to have already been replicated n times") \
+ x(btree_node_prefetch, u8, \
+ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \
+ OPT_BOOL(), \
+ BCH2_NO_SB_OPT, true, \
+ NULL, "BTREE_ITER_PREFETCH casuse btree nodes to be\n"\
+ " prefetched sequentially")
struct bch_opts {
#define x(_name, _bits, ...) unsigned _name##_defined:1;
@@ -558,6 +564,11 @@ struct bch_io_opts {
#undef x
};
+static inline unsigned background_compression(struct bch_io_opts opts)
+{
+ return opts.background_compression ?: opts.compression;
+}
+
struct bch_io_opts bch2_opts_to_inode_opts(struct bch_opts);
bool bch2_opt_is_inode_opt(enum bch_opt_id);
diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c
index accf246c3233..b27d22925929 100644
--- a/fs/bcachefs/printbuf.c
+++ b/fs/bcachefs/printbuf.c
@@ -56,6 +56,7 @@ void bch2_prt_vprintf(struct printbuf *out, const char *fmt, va_list args)
va_copy(args2, args);
len = vsnprintf(out->buf + out->pos, printbuf_remaining(out), fmt, args2);
+ va_end(args2);
} while (len + 1 >= printbuf_remaining(out) &&
!bch2_printbuf_make_room(out, len + 1));
diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c
index a54647c36b85..e68b34eab90a 100644
--- a/fs/bcachefs/quota.c
+++ b/fs/bcachefs/quota.c
@@ -599,14 +599,9 @@ advance:
int bch2_fs_quota_read(struct bch_fs *c)
{
- struct bch_sb_field_quota *sb_quota;
- struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
mutex_lock(&c->sb_lock);
- sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
+ struct bch_sb_field_quota *sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb);
if (!sb_quota) {
mutex_unlock(&c->sb_lock);
return -BCH_ERR_ENOSPC_sb_quota;
@@ -615,19 +610,14 @@ int bch2_fs_quota_read(struct bch_fs *c)
bch2_sb_quota_read(c);
mutex_unlock(&c->sb_lock);
- trans = bch2_trans_get(c);
-
- ret = for_each_btree_key2(trans, iter, BTREE_ID_quotas,
- POS_MIN, BTREE_ITER_PREFETCH, k,
- __bch2_quota_set(c, k, NULL)) ?:
- for_each_btree_key2(trans, iter, BTREE_ID_inodes,
- POS_MIN, BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- bch2_fs_quota_read_inode(trans, &iter, k));
-
- bch2_trans_put(trans);
-
- if (ret)
- bch_err_fn(c, ret);
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_quotas, POS_MIN,
+ BTREE_ITER_PREFETCH, k,
+ __bch2_quota_set(c, k, NULL)) ?:
+ for_each_btree_key(trans, iter, BTREE_ID_inodes, POS_MIN,
+ BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
+ bch2_fs_quota_read_inode(trans, &iter, k)));
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/quota_format.h b/fs/bcachefs/quota_format.h
new file mode 100644
index 000000000000..dc34347ef6c7
--- /dev/null
+++ b/fs/bcachefs/quota_format.h
@@ -0,0 +1,47 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_QUOTA_FORMAT_H
+#define _BCACHEFS_QUOTA_FORMAT_H
+
+/* KEY_TYPE_quota: */
+
+enum quota_types {
+ QTYP_USR = 0,
+ QTYP_GRP = 1,
+ QTYP_PRJ = 2,
+ QTYP_NR = 3,
+};
+
+enum quota_counters {
+ Q_SPC = 0,
+ Q_INO = 1,
+ Q_COUNTERS = 2,
+};
+
+struct bch_quota_counter {
+ __le64 hardlimit;
+ __le64 softlimit;
+};
+
+struct bch_quota {
+ struct bch_val v;
+ struct bch_quota_counter c[Q_COUNTERS];
+} __packed __aligned(8);
+
+/* BCH_SB_FIELD_quota: */
+
+struct bch_sb_quota_counter {
+ __le32 timelimit;
+ __le32 warnlimit;
+};
+
+struct bch_sb_quota_type {
+ __le64 flags;
+ struct bch_sb_quota_counter c[Q_COUNTERS];
+};
+
+struct bch_sb_field_quota {
+ struct bch_sb_field field;
+ struct bch_sb_quota_type q[QTYP_NR];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_QUOTA_FORMAT_H */
diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c
index 3319190b8d9c..22d1017aa49b 100644
--- a/fs/bcachefs/rebalance.c
+++ b/fs/bcachefs/rebalance.c
@@ -69,7 +69,7 @@ err:
int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum)
{
- int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_NOFAIL|BTREE_INSERT_LAZY_RW,
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc|BCH_TRANS_COMMIT_lazy_rw,
__bch2_set_rebalance_needs_scan(trans, inum));
rebalance_wakeup(c);
return ret;
@@ -125,7 +125,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans,
extent_entry_drop(bkey_i_to_s(n),
(void *) bch2_bkey_rebalance_opts(bkey_i_to_s_c(n)));
- return bch2_trans_commit(trans, NULL, NULL, BTREE_INSERT_NOFAIL);
+ return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc);
}
static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
@@ -171,6 +171,20 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans,
return bkey_s_c_null;
}
+ if (trace_rebalance_extent_enabled()) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "target=");
+ bch2_target_to_text(&buf, c, r->target);
+ prt_str(&buf, " compression=");
+ bch2_compression_opt_to_text(&buf, r->compression);
+ prt_str(&buf, " ");
+ bch2_bkey_val_to_text(&buf, c, k);
+
+ trace_rebalance_extent(c, buf.buf);
+ printbuf_exit(&buf);
+ }
+
return k;
}
@@ -239,13 +253,12 @@ static bool rebalance_pred(struct bch_fs *c, void *arg,
if (k.k->p.inode) {
target = io_opts->background_target;
- compression = io_opts->background_compression ?: io_opts->compression;
+ compression = background_compression(*io_opts);
} else {
const struct bch_extent_rebalance *r = bch2_bkey_rebalance_opts(k);
target = r ? r->target : io_opts->background_target;
- compression = r ? r->compression :
- (io_opts->background_compression ?: io_opts->compression);
+ compression = r ? r->compression : background_compression(*io_opts);
}
data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, k, target, compression);
@@ -273,7 +286,7 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie)
r->state = BCH_REBALANCE_scanning;
ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?:
- commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_clear_rebalance_needs_scan(trans, inum, cookie));
bch2_move_stats_exit(&r->scan_stats, trans->c);
@@ -317,8 +330,16 @@ static int do_rebalance(struct moving_context *ctxt)
BTREE_ID_rebalance_work, POS_MIN,
BTREE_ITER_ALL_SNAPSHOTS);
- while (!bch2_move_ratelimit(ctxt) &&
- !kthread_wait_freezable(r->enabled)) {
+ while (!bch2_move_ratelimit(ctxt)) {
+ if (!r->enabled) {
+ bch2_moving_ctxt_flush_all(ctxt);
+ kthread_wait_freezable(r->enabled ||
+ kthread_should_stop());
+ }
+
+ if (kthread_should_stop())
+ break;
+
bch2_trans_begin(trans);
ret = bkey_err(k = next_rebalance_entry(trans, &rebalance_work_iter));
@@ -348,6 +369,7 @@ static int do_rebalance(struct moving_context *ctxt)
!kthread_should_stop() &&
!atomic64_read(&r->work_stats.sectors_seen) &&
!atomic64_read(&r->scan_stats.sectors_seen)) {
+ bch2_moving_ctxt_flush_all(ctxt);
bch2_trans_unlock_long(trans);
rebalance_wait(c);
}
@@ -362,7 +384,6 @@ static int bch2_rebalance_thread(void *arg)
struct bch_fs *c = arg;
struct bch_fs_rebalance *r = &c->rebalance;
struct moving_context ctxt;
- int ret;
set_freezable();
@@ -370,8 +391,7 @@ static int bch2_rebalance_thread(void *arg)
writepoint_ptr(&c->rebalance_write_point),
true);
- while (!kthread_should_stop() &&
- !(ret = do_rebalance(&ctxt)))
+ while (!kthread_should_stop() && !do_rebalance(&ctxt))
;
bch2_moving_ctxt_exit(&ctxt);
@@ -447,10 +467,9 @@ int bch2_rebalance_start(struct bch_fs *c)
p = kthread_create(bch2_rebalance_thread, c, "bch-rebalance/%s", c->name);
ret = PTR_ERR_OR_ZERO(p);
- if (ret) {
- bch_err_msg(c, ret, "creating rebalance thread");
+ bch_err_msg(c, ret, "creating rebalance thread");
+ if (ret)
return ret;
- }
get_task_struct(p);
rcu_assign_pointer(c->rebalance.thread, p);
diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c
index 5cf7d0532002..21e13bb4335b 100644
--- a/fs/bcachefs/recovery.c
+++ b/fs/bcachefs/recovery.c
@@ -99,6 +99,11 @@ static int bch2_journal_replay_key(struct btree_trans *trans,
unsigned update_flags = BTREE_TRIGGER_NORUN;
int ret;
+ if (k->overwritten)
+ return 0;
+
+ trans->journal_res.seq = k->journal_seq;
+
/*
* BTREE_UPDATE_KEY_CACHE_RECLAIM disables key cache lookup/update to
* keep the key cache coherent with the underlying btree. Nothing
@@ -140,27 +145,13 @@ static int journal_sort_seq_cmp(const void *_l, const void *_r)
static int bch2_journal_replay(struct bch_fs *c)
{
struct journal_keys *keys = &c->journal_keys;
- struct journal_key **keys_sorted, *k;
+ DARRAY(struct journal_key *) keys_sorted = { 0 };
struct journal *j = &c->journal;
u64 start_seq = c->journal_replay_seq_start;
u64 end_seq = c->journal_replay_seq_start;
- size_t i;
+ struct btree_trans *trans = bch2_trans_get(c);
int ret = 0;
- move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
- keys->gap = keys->nr;
-
- keys_sorted = kvmalloc_array(keys->nr, sizeof(*keys_sorted), GFP_KERNEL);
- if (!keys_sorted)
- return -BCH_ERR_ENOMEM_journal_replay;
-
- for (i = 0; i < keys->nr; i++)
- keys_sorted[i] = &keys->d[i];
-
- sort(keys_sorted, keys->nr,
- sizeof(keys_sorted[0]),
- journal_sort_seq_cmp, NULL);
-
if (keys->nr) {
ret = bch2_journal_log_msg(c, "Starting journal replay (%zu keys in entries %llu-%llu)",
keys->nr, start_seq, end_seq);
@@ -170,27 +161,67 @@ static int bch2_journal_replay(struct bch_fs *c)
BUG_ON(!atomic_read(&keys->ref));
- for (i = 0; i < keys->nr; i++) {
- k = keys_sorted[i];
+ /*
+ * First, attempt to replay keys in sorted order. This is more
+ * efficient - better locality of btree access - but some might fail if
+ * that would cause a journal deadlock.
+ */
+ for (size_t i = 0; i < keys->nr; i++) {
+ cond_resched();
+
+ struct journal_key *k = keys->d + i;
+
+ /* Skip fastpath if we're low on space in the journal */
+ ret = c->journal.watermark ? -1 :
+ commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ BCH_TRANS_COMMIT_journal_reclaim|
+ (!k->allocated ? BCH_TRANS_COMMIT_no_journal_res : 0),
+ bch2_journal_replay_key(trans, k));
+ BUG_ON(!ret && !k->overwritten);
+ if (ret) {
+ ret = darray_push(&keys_sorted, k);
+ if (ret)
+ goto err;
+ }
+ }
+ /*
+ * Now, replay any remaining keys in the order in which they appear in
+ * the journal, unpinning those journal entries as we go:
+ */
+ sort(keys_sorted.data, keys_sorted.nr,
+ sizeof(keys_sorted.data[0]),
+ journal_sort_seq_cmp, NULL);
+
+ darray_for_each(keys_sorted, kp) {
cond_resched();
+ struct journal_key *k = *kp;
+
replay_now_at(j, k->journal_seq);
- ret = bch2_trans_do(c, NULL, NULL,
- BTREE_INSERT_LAZY_RW|
- BTREE_INSERT_NOFAIL|
- (!k->allocated
- ? BTREE_INSERT_JOURNAL_REPLAY|BCH_WATERMARK_reclaim
- : 0),
+ ret = commit_do(trans, NULL, NULL,
+ BCH_TRANS_COMMIT_no_enospc|
+ (!k->allocated
+ ? BCH_TRANS_COMMIT_no_journal_res|BCH_WATERMARK_reclaim
+ : 0),
bch2_journal_replay_key(trans, k));
- if (ret) {
- bch_err(c, "journal replay: error while replaying key at btree %s level %u: %s",
- bch2_btree_id_str(k->btree_id), k->level, bch2_err_str(ret));
+ bch_err_msg(c, ret, "while replaying key at btree %s level %u:",
+ bch2_btree_id_str(k->btree_id), k->level);
+ if (ret)
goto err;
- }
+
+ BUG_ON(!k->overwritten);
}
+ /*
+ * We need to put our btree_trans before calling flush_all_pins(), since
+ * that will use a btree_trans internally
+ */
+ bch2_trans_put(trans);
+ trans = NULL;
+
if (!c->opts.keep_journal)
bch2_journal_keys_put_initial(c);
@@ -198,16 +229,14 @@ static int bch2_journal_replay(struct bch_fs *c)
j->replay_journal_seq = 0;
bch2_journal_set_replay_done(j);
- bch2_journal_flush_all_pins(j);
- ret = bch2_journal_error(j);
- if (keys->nr && !ret)
+ if (keys->nr)
bch2_journal_log_msg(c, "journal replay finished");
err:
- kvfree(keys_sorted);
-
- if (ret)
- bch_err_fn(c, ret);
+ if (trans)
+ bch2_trans_put(trans);
+ darray_exit(&keys_sorted);
+ bch_err_fn(c, ret);
return ret;
}
@@ -251,7 +280,7 @@ static int journal_replay_entry_early(struct bch_fs *c,
le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_inodes:
- c->usage_base->nr_inodes = le64_to_cpu(u->v);
+ c->usage_base->b.nr_inodes = le64_to_cpu(u->v);
break;
case BCH_FS_USAGE_key_version:
atomic64_set(&c->key_version,
@@ -275,8 +304,6 @@ static int journal_replay_entry_early(struct bch_fs *c,
struct bch_dev *ca = bch_dev_bkey_exists(c, le32_to_cpu(u->dev));
unsigned i, nr_types = jset_entry_dev_usage_nr_types(u);
- ca->usage_base->buckets_ec = le64_to_cpu(u->buckets_ec);
-
for (i = 0; i < min_t(unsigned, nr_types, BCH_DATA_NR); i++) {
ca->usage_base->d[i].buckets = le64_to_cpu(u->d[i].buckets);
ca->usage_base->d[i].sectors = le64_to_cpu(u->d[i].sectors);
@@ -317,14 +344,11 @@ static int journal_replay_entry_early(struct bch_fs *c,
static int journal_replay_early(struct bch_fs *c,
struct bch_sb_field_clean *clean)
{
- struct jset_entry *entry;
- int ret;
-
if (clean) {
- for (entry = clean->start;
+ for (struct jset_entry *entry = clean->start;
entry != vstruct_end(&clean->field);
entry = vstruct_next(entry)) {
- ret = journal_replay_entry_early(c, entry);
+ int ret = journal_replay_entry_early(c, entry);
if (ret)
return ret;
}
@@ -339,7 +363,7 @@ static int journal_replay_early(struct bch_fs *c,
continue;
vstruct_for_each(&i->j, entry) {
- ret = journal_replay_entry_early(c, entry);
+ int ret = journal_replay_entry_early(c, entry);
if (ret)
return ret;
}
@@ -435,8 +459,7 @@ static int bch2_initialize_subvolumes(struct bch_fs *c)
ret = bch2_btree_insert(c, BTREE_ID_snapshot_trees, &root_tree.k_i, NULL, 0) ?:
bch2_btree_insert(c, BTREE_ID_snapshots, &root_snapshot.k_i, NULL, 0) ?:
bch2_btree_insert(c, BTREE_ID_subvolumes, &root_volume.k_i, NULL, 0);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -474,10 +497,9 @@ err:
noinline_for_stack
static int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c)
{
- int ret = bch2_trans_do(c, NULL, NULL, BTREE_INSERT_LAZY_RW,
+ int ret = bch2_trans_do(c, NULL, NULL, BCH_TRANS_COMMIT_lazy_rw,
__bch2_fs_upgrade_for_subvolumes(trans));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -495,7 +517,20 @@ static int bch2_check_allocations(struct bch_fs *c)
static int bch2_set_may_go_rw(struct bch_fs *c)
{
- set_bit(BCH_FS_MAY_GO_RW, &c->flags);
+ struct journal_keys *keys = &c->journal_keys;
+
+ /*
+ * After we go RW, the journal keys buffer can't be modified (except for
+ * setting journal_key->overwritten: it will be accessed by multiple
+ * threads
+ */
+ move_gap(keys->d, keys->nr, keys->size, keys->gap, keys->nr);
+ keys->gap = keys->nr;
+
+ set_bit(BCH_FS_may_go_rw, &c->flags);
+
+ if (keys->nr || c->opts.fsck || !c->sb.clean)
+ return bch2_fs_read_write_early(c);
return 0;
}
@@ -542,8 +577,9 @@ u64 bch2_recovery_passes_from_stable(u64 v)
static bool check_version_upgrade(struct bch_fs *c)
{
- unsigned latest_compatible = bch2_latest_compatible_version(c->sb.version);
unsigned latest_version = bcachefs_metadata_version_current;
+ unsigned latest_compatible = min(latest_version,
+ bch2_latest_compatible_version(c->sb.version));
unsigned old_version = c->sb.version_upgrade_complete ?: c->sb.version;
unsigned new_version = 0;
@@ -562,7 +598,7 @@ static bool check_version_upgrade(struct bch_fs *c)
new_version = latest_version;
break;
case BCH_VERSION_UPGRADE_none:
- new_version = old_version;
+ new_version = min(old_version, latest_version);
break;
}
}
@@ -589,17 +625,15 @@ static bool check_version_upgrade(struct bch_fs *c)
bch2_version_to_text(&buf, new_version);
prt_newline(&buf);
- u64 recovery_passes = bch2_upgrade_recovery_passes(c, old_version, new_version);
- if (recovery_passes) {
- if ((recovery_passes & RECOVERY_PASS_ALL_FSCK) == RECOVERY_PASS_ALL_FSCK)
- prt_str(&buf, "fsck required");
- else {
- prt_str(&buf, "running recovery passes: ");
- prt_bitflags(&buf, bch2_recovery_passes, recovery_passes);
- }
-
- c->recovery_passes_explicit |= recovery_passes;
- c->opts.fix_errors = FSCK_FIX_yes;
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+ __le64 passes = ext->recovery_passes_required[0];
+ bch2_sb_set_upgrade(c, old_version, new_version);
+ passes = ext->recovery_passes_required[0] & ~passes;
+
+ if (passes) {
+ prt_str(&buf, " running recovery passes: ");
+ prt_bitflags(&buf, bch2_recovery_passes,
+ bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
}
bch_info(c, "%s", buf.buf);
@@ -625,7 +659,7 @@ u64 bch2_fsck_recovery_passes(void)
static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
- struct recovery_pass_fn *p = recovery_pass_fns + c->curr_recovery_pass;
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
if (c->opts.norecovery && pass > BCH_RECOVERY_PASS_snapshots_read)
return false;
@@ -642,39 +676,62 @@ static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pa
static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass)
{
+ struct recovery_pass_fn *p = recovery_pass_fns + pass;
int ret;
- c->curr_recovery_pass = pass;
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."),
+ bch2_recovery_passes[pass]);
+ ret = p->fn(c);
+ if (ret)
+ return ret;
+ if (!(p->when & PASS_SILENT))
+ bch2_print(c, KERN_CONT " done\n");
- if (should_run_recovery_pass(c, pass)) {
- struct recovery_pass_fn *p = recovery_pass_fns + pass;
+ return 0;
+}
- if (!(p->when & PASS_SILENT))
- printk(KERN_INFO bch2_log_msg(c, "%s..."),
- bch2_recovery_passes[pass]);
- ret = p->fn(c);
- if (ret)
- return ret;
- if (!(p->when & PASS_SILENT))
- printk(KERN_CONT " done\n");
+static int bch2_run_recovery_passes(struct bch_fs *c)
+{
+ int ret = 0;
- c->recovery_passes_complete |= BIT_ULL(pass);
+ while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
+ if (should_run_recovery_pass(c, c->curr_recovery_pass)) {
+ unsigned pass = c->curr_recovery_pass;
+
+ ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery) ||
+ (ret && c->curr_recovery_pass < pass))
+ continue;
+ if (ret)
+ break;
+
+ c->recovery_passes_complete |= BIT_ULL(c->curr_recovery_pass);
+ }
+ c->curr_recovery_pass++;
+ c->recovery_pass_done = max(c->recovery_pass_done, c->curr_recovery_pass);
}
- return 0;
+ return ret;
}
-static int bch2_run_recovery_passes(struct bch_fs *c)
+int bch2_run_online_recovery_passes(struct bch_fs *c)
{
int ret = 0;
- while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns)) {
- ret = bch2_run_recovery_pass(c, c->curr_recovery_pass);
- if (bch2_err_matches(ret, BCH_ERR_restart_recovery))
+ for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) {
+ struct recovery_pass_fn *p = recovery_pass_fns + i;
+
+ if (!(p->when & PASS_ONLINE))
+ continue;
+
+ ret = bch2_run_recovery_pass(c, i);
+ if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) {
+ i = c->curr_recovery_pass;
continue;
+ }
if (ret)
break;
- c->curr_recovery_pass++;
}
return ret;
@@ -718,7 +775,7 @@ int bch2_fs_recovery(struct bch_fs *c)
goto err;
}
- if (!(c->opts.nochanges && c->opts.norecovery)) {
+ if (!c->opts.nochanges) {
mutex_lock(&c->sb_lock);
bool write_sb = false;
@@ -748,7 +805,7 @@ int bch2_fs_recovery(struct bch_fs *c)
if (bch2_check_version_downgrade(c)) {
struct printbuf buf = PRINTBUF;
- prt_str(&buf, "Version downgrade required:\n");
+ prt_str(&buf, "Version downgrade required:");
__le64 passes = ext->recovery_passes_required[0];
bch2_sb_set_downgrade(c,
@@ -756,7 +813,7 @@ int bch2_fs_recovery(struct bch_fs *c)
BCH_VERSION_MINOR(c->sb.version));
passes = ext->recovery_passes_required[0] & ~passes;
if (passes) {
- prt_str(&buf, " running recovery passes: ");
+ prt_str(&buf, "\n running recovery passes: ");
prt_bitflags(&buf, bch2_recovery_passes,
bch2_recovery_passes_from_stable(le64_to_cpu(passes)));
}
@@ -779,6 +836,9 @@ int bch2_fs_recovery(struct bch_fs *c)
if (c->opts.fsck && IS_ENABLED(CONFIG_BCACHEFS_DEBUG))
c->recovery_passes_explicit |= BIT_ULL(BCH_RECOVERY_PASS_check_topology);
+ if (c->opts.fsck)
+ set_bit(BCH_FS_fsck_running, &c->flags);
+
ret = bch2_blacklist_table_initialize(c);
if (ret) {
bch_err(c, "error initializing blacklist table");
@@ -919,13 +979,17 @@ use_clean:
if (ret)
goto err;
+ clear_bit(BCH_FS_fsck_running, &c->flags);
+
/* If we fixed errors, verify that fs is actually clean now: */
if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) &&
- test_bit(BCH_FS_ERRORS_FIXED, &c->flags) &&
- !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags) &&
- !test_bit(BCH_FS_ERROR, &c->flags)) {
+ test_bit(BCH_FS_errors_fixed, &c->flags) &&
+ !test_bit(BCH_FS_errors_not_fixed, &c->flags) &&
+ !test_bit(BCH_FS_error, &c->flags)) {
+ bch2_flush_fsck_errs(c);
+
bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean");
- clear_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+ clear_bit(BCH_FS_errors_fixed, &c->flags);
c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info;
@@ -933,13 +997,13 @@ use_clean:
if (ret)
goto err;
- if (test_bit(BCH_FS_ERRORS_FIXED, &c->flags) ||
- test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
+ if (test_bit(BCH_FS_errors_fixed, &c->flags) ||
+ test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
bch_err(c, "Second fsck run was not clean");
- set_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags);
+ set_bit(BCH_FS_errors_not_fixed, &c->flags);
}
- set_bit(BCH_FS_ERRORS_FIXED, &c->flags);
+ set_bit(BCH_FS_errors_fixed, &c->flags);
}
if (enabled_qtypes(c)) {
@@ -958,13 +1022,13 @@ use_clean:
write_sb = true;
}
- if (!test_bit(BCH_FS_ERROR, &c->flags) &&
+ if (!test_bit(BCH_FS_error, &c->flags) &&
!(c->disk_sb.sb->compat[0] & cpu_to_le64(1ULL << BCH_COMPAT_alloc_info))) {
c->disk_sb.sb->compat[0] |= cpu_to_le64(1ULL << BCH_COMPAT_alloc_info);
write_sb = true;
}
- if (!test_bit(BCH_FS_ERROR, &c->flags)) {
+ if (!test_bit(BCH_FS_error, &c->flags)) {
struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
if (ext &&
(!bch2_is_zero(ext->recovery_passes_required, sizeof(ext->recovery_passes_required)) ||
@@ -976,8 +1040,8 @@ use_clean:
}
if (c->opts.fsck &&
- !test_bit(BCH_FS_ERROR, &c->flags) &&
- !test_bit(BCH_FS_ERRORS_NOT_FIXED, &c->flags)) {
+ !test_bit(BCH_FS_error, &c->flags) &&
+ !test_bit(BCH_FS_errors_not_fixed, &c->flags)) {
SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 0);
SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 0);
write_sb = true;
@@ -993,8 +1057,12 @@ use_clean:
bch2_move_stats_init(&stats, "recovery");
- bch_info(c, "scanning for old btree nodes");
- ret = bch2_fs_read_write(c) ?:
+ struct printbuf buf = PRINTBUF;
+ bch2_version_to_text(&buf, c->sb.version_min);
+ bch_info(c, "scanning for old btree nodes: min_version %s", buf.buf);
+ printbuf_exit(&buf);
+
+ ret = bch2_fs_read_write_early(c) ?:
bch2_scan_old_btree_nodes(c, &stats);
if (ret)
goto err;
@@ -1007,7 +1075,6 @@ use_clean:
ret = 0;
out:
- set_bit(BCH_FS_FSCK_DONE, &c->flags);
bch2_flush_fsck_errs(c);
if (!c->opts.keep_journal &&
@@ -1015,13 +1082,14 @@ out:
bch2_journal_keys_put_initial(c);
kfree(clean);
- if (!ret && test_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags)) {
+ if (!ret &&
+ test_bit(BCH_FS_need_delete_dead_snapshots, &c->flags) &&
+ !c->opts.nochanges) {
bch2_fs_read_write_early(c);
bch2_delete_dead_snapshots_async(c);
}
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
err:
fsck_err:
@@ -1034,8 +1102,6 @@ int bch2_fs_initialize(struct bch_fs *c)
struct bch_inode_unpacked root_inode, lostfound_inode;
struct bkey_inode_buf packed_inode;
struct qstr lostfound = QSTR("lost+found");
- struct bch_dev *ca;
- unsigned i;
int ret;
bch_notice(c, "initializing new filesystem");
@@ -1054,13 +1120,12 @@ int bch2_fs_initialize(struct bch_fs *c)
mutex_unlock(&c->sb_lock);
c->curr_recovery_pass = ARRAY_SIZE(recovery_pass_fns);
- set_bit(BCH_FS_MAY_GO_RW, &c->flags);
- set_bit(BCH_FS_FSCK_DONE, &c->flags);
+ set_bit(BCH_FS_may_go_rw, &c->flags);
- for (i = 0; i < BTREE_ID_NR; i++)
+ for (unsigned i = 0; i < BTREE_ID_NR; i++)
bch2_btree_root_alloc(c, i);
- for_each_member_device(ca, c, i)
+ for_each_member_device(c, ca)
bch2_dev_usage_init(ca);
ret = bch2_fs_journal_alloc(c);
@@ -1088,7 +1153,7 @@ int bch2_fs_initialize(struct bch_fs *c)
if (ret)
goto err;
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
ca->new_fs_bucket_idx = 0;
ret = bch2_fs_freespace_init(c);
@@ -1112,10 +1177,9 @@ int bch2_fs_initialize(struct bch_fs *c)
packed_inode.inode.k.p.snapshot = U32_MAX;
ret = bch2_btree_insert(c, BTREE_ID_inodes, &packed_inode.inode.k_i, NULL, 0);
- if (ret) {
- bch_err_msg(c, ret, "creating root directory");
+ bch_err_msg(c, ret, "creating root directory");
+ if (ret)
goto err;
- }
bch2_inode_init_early(c, &lostfound_inode);
@@ -1126,10 +1190,11 @@ int bch2_fs_initialize(struct bch_fs *c)
&lostfound,
0, 0, S_IFDIR|0700, 0,
NULL, NULL, (subvol_inum) { 0 }, 0));
- if (ret) {
- bch_err_msg(c, ret, "creating lost+found");
+ bch_err_msg(c, ret, "creating lost+found");
+ if (ret)
goto err;
- }
+
+ c->recovery_pass_done = ARRAY_SIZE(recovery_pass_fns) - 1;
if (enabled_qtypes(c)) {
ret = bch2_fs_quota_read(c);
@@ -1138,10 +1203,9 @@ int bch2_fs_initialize(struct bch_fs *c)
}
ret = bch2_journal_flush(&c->journal);
- if (ret) {
- bch_err_msg(c, ret, "writing first journal entry");
+ bch_err_msg(c, ret, "writing first journal entry");
+ if (ret)
goto err;
- }
mutex_lock(&c->sb_lock);
SET_BCH_SB_INITIALIZED(c->disk_sb.sb, true);
@@ -1152,6 +1216,6 @@ int bch2_fs_initialize(struct bch_fs *c)
return 0;
err:
- bch_err_fn(ca, ret);
+ bch_err_fn(c, ret);
return ret;
}
diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h
index 3a554b0751d0..4e9d24719b2e 100644
--- a/fs/bcachefs/recovery.h
+++ b/fs/bcachefs/recovery.h
@@ -31,6 +31,7 @@ static inline int bch2_run_explicit_recovery_pass(struct bch_fs *c,
}
}
+int bch2_run_online_recovery_passes(struct bch_fs *);
u64 bch2_fsck_recovery_passes(void);
int bch2_fs_recovery(struct bch_fs *);
diff --git a/fs/bcachefs/recovery_types.h b/fs/bcachefs/recovery_types.h
index d37c6fd30e38..fa0c8efd2a1b 100644
--- a/fs/bcachefs/recovery_types.h
+++ b/fs/bcachefs/recovery_types.h
@@ -6,6 +6,7 @@
#define PASS_FSCK BIT(1)
#define PASS_UNCLEAN BIT(2)
#define PASS_ALWAYS BIT(3)
+#define PASS_ONLINE BIT(4)
/*
* Passes may be reordered, but the second field is a persistent identifier and
@@ -22,18 +23,18 @@
x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \
x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \
x(journal_replay, 9, PASS_ALWAYS) \
- x(check_alloc_info, 10, PASS_FSCK) \
- x(check_lrus, 11, PASS_FSCK) \
- x(check_btree_backpointers, 12, PASS_FSCK) \
- x(check_backpointers_to_extents, 13, PASS_FSCK) \
- x(check_extents_to_backpointers, 14, PASS_FSCK) \
- x(check_alloc_to_lru_refs, 15, PASS_FSCK) \
+ x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \
+ x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \
+ x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \
+ x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK) \
+ x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \
+ x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \
x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \
x(bucket_gens_init, 17, 0) \
- x(check_snapshot_trees, 18, PASS_FSCK) \
- x(check_snapshots, 19, PASS_FSCK) \
- x(check_subvols, 20, PASS_FSCK) \
- x(delete_dead_snapshots, 21, PASS_FSCK) \
+ x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \
+ x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \
+ x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \
+ x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \
x(fs_upgrade_for_subvolumes, 22, 0) \
x(resume_logged_ops, 23, PASS_ALWAYS) \
x(check_inodes, 24, PASS_FSCK) \
@@ -41,8 +42,8 @@
x(check_indirect_extents, 26, PASS_FSCK) \
x(check_dirents, 27, PASS_FSCK) \
x(check_xattrs, 28, PASS_FSCK) \
- x(check_root, 29, PASS_FSCK) \
- x(check_directory_structure, 30, PASS_FSCK) \
+ x(check_root, 29, PASS_ONLINE|PASS_FSCK) \
+ x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \
x(check_nlinks, 31, PASS_FSCK) \
x(delete_dead_inodes, 32, PASS_FSCK|PASS_UNCLEAN) \
x(fix_reflink_p, 33, 0) \
diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c
index 37d16e04e671..c47c66c2b394 100644
--- a/fs/bcachefs/reflink.c
+++ b/fs/bcachefs/reflink.c
@@ -3,6 +3,7 @@
#include "bkey_buf.h"
#include "btree_update.h"
#include "buckets.h"
+#include "error.h"
#include "extents.h"
#include "inode.h"
#include "io_misc.h"
@@ -33,15 +34,14 @@ int bch2_reflink_p_invalid(struct bch_fs *c, struct bkey_s_c k,
struct printbuf *err)
{
struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ int ret = 0;
- if (c->sb.version >= bcachefs_metadata_version_reflink_p_fix &&
- le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad)) {
- prt_printf(err, "idx < front_pad (%llu < %u)",
- le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
- return -EINVAL;
- }
-
- return 0;
+ bkey_fsck_err_on(le64_to_cpu(p.v->idx) < le32_to_cpu(p.v->front_pad),
+ c, err, reflink_p_front_pad_bad,
+ "idx < front_pad (%llu < %u)",
+ le64_to_cpu(p.v->idx), le32_to_cpu(p.v->front_pad));
+fsck_err:
+ return ret;
}
void bch2_reflink_p_to_text(struct printbuf *out, struct bch_fs *c,
@@ -73,6 +73,184 @@ bool bch2_reflink_p_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
return true;
}
+static int trans_trigger_reflink_p_segment(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 *idx, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct btree_iter iter;
+ struct bkey_i *k;
+ __le64 *refcount;
+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ struct printbuf buf = PRINTBUF;
+ int ret;
+
+ k = bch2_bkey_get_mut_noupdate(trans, &iter,
+ BTREE_ID_reflink, POS(0, *idx),
+ BTREE_ITER_WITH_UPDATES);
+ ret = PTR_ERR_OR_ZERO(k);
+ if (ret)
+ goto err;
+
+ refcount = bkey_refcount(bkey_i_to_s(k));
+ if (!refcount) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
+ "nonexistent indirect extent at %llu while marking\n %s",
+ *idx, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
+ if (!*refcount && (flags & BTREE_TRIGGER_OVERWRITE)) {
+ bch2_bkey_val_to_text(&buf, c, p.s_c);
+ bch2_trans_inconsistent(trans,
+ "indirect extent refcount underflow at %llu while marking\n %s",
+ *idx, buf.buf);
+ ret = -EIO;
+ goto err;
+ }
+
+ if (flags & BTREE_TRIGGER_INSERT) {
+ struct bch_reflink_p *v = (struct bch_reflink_p *) p.v;
+ u64 pad;
+
+ pad = max_t(s64, le32_to_cpu(v->front_pad),
+ le64_to_cpu(v->idx) - bkey_start_offset(&k->k));
+ BUG_ON(pad > U32_MAX);
+ v->front_pad = cpu_to_le32(pad);
+
+ pad = max_t(s64, le32_to_cpu(v->back_pad),
+ k->k.p.offset - p.k->size - le64_to_cpu(v->idx));
+ BUG_ON(pad > U32_MAX);
+ v->back_pad = cpu_to_le32(pad);
+ }
+
+ le64_add_cpu(refcount, add);
+
+ bch2_btree_iter_set_pos_to_extent_start(&iter);
+ ret = bch2_trans_update(trans, &iter, k, 0);
+ if (ret)
+ goto err;
+
+ *idx = k->k.p.offset;
+err:
+ bch2_trans_iter_exit(trans, &iter);
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static s64 gc_trigger_reflink_p_segment(struct btree_trans *trans,
+ struct bkey_s_c_reflink_p p,
+ u64 *idx, unsigned flags, size_t r_idx)
+{
+ struct bch_fs *c = trans->c;
+ struct reflink_gc *r;
+ int add = !(flags & BTREE_TRIGGER_OVERWRITE) ? 1 : -1;
+ u64 start = le64_to_cpu(p.v->idx);
+ u64 end = le64_to_cpu(p.v->idx) + p.k->size;
+ u64 next_idx = end + le32_to_cpu(p.v->back_pad);
+ s64 ret = 0;
+ struct printbuf buf = PRINTBUF;
+
+ if (r_idx >= c->reflink_gc_nr)
+ goto not_found;
+
+ r = genradix_ptr(&c->reflink_gc_table, r_idx);
+ next_idx = min(next_idx, r->offset - r->size);
+ if (*idx < next_idx)
+ goto not_found;
+
+ BUG_ON((s64) r->refcount + add < 0);
+
+ r->refcount += add;
+ *idx = r->offset;
+ return 0;
+not_found:
+ if (fsck_err(c, reflink_p_to_missing_reflink_v,
+ "pointer to missing indirect extent\n"
+ " %s\n"
+ " missing range %llu-%llu",
+ (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf),
+ *idx, next_idx)) {
+ struct bkey_i *update = bch2_bkey_make_mut_noupdate(trans, p.s_c);
+ ret = PTR_ERR_OR_ZERO(update);
+ if (ret)
+ goto err;
+
+ if (next_idx <= start) {
+ bkey_i_to_reflink_p(update)->v.front_pad = cpu_to_le32(start - next_idx);
+ } else if (*idx >= end) {
+ bkey_i_to_reflink_p(update)->v.back_pad = cpu_to_le32(*idx - end);
+ } else {
+ bkey_error_init(update);
+ update->k.p = p.k->p;
+ update->k.p.offset = next_idx;
+ update->k.size = next_idx - *idx;
+ set_bkey_val_u64s(&update->k, 0);
+ }
+
+ ret = bch2_btree_insert_trans(trans, BTREE_ID_extents, update, BTREE_TRIGGER_NORUN);
+ }
+
+ *idx = next_idx;
+err:
+fsck_err:
+ printbuf_exit(&buf);
+ return ret;
+}
+
+static int __trigger_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c k, unsigned flags)
+{
+ struct bch_fs *c = trans->c;
+ struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k);
+ int ret = 0;
+
+ u64 idx = le64_to_cpu(p.v->idx) - le32_to_cpu(p.v->front_pad);
+ u64 end = le64_to_cpu(p.v->idx) + p.k->size + le32_to_cpu(p.v->back_pad);
+
+ if (flags & BTREE_TRIGGER_TRANSACTIONAL) {
+ while (idx < end && !ret)
+ ret = trans_trigger_reflink_p_segment(trans, p, &idx, flags);
+ }
+
+ if (flags & BTREE_TRIGGER_GC) {
+ size_t l = 0, r = c->reflink_gc_nr;
+
+ while (l < r) {
+ size_t m = l + (r - l) / 2;
+ struct reflink_gc *ref = genradix_ptr(&c->reflink_gc_table, m);
+ if (ref->offset <= idx)
+ l = m + 1;
+ else
+ r = m;
+ }
+
+ while (idx < end && !ret)
+ ret = gc_trigger_reflink_p_segment(trans, p, &idx, flags, l++);
+ }
+
+ return ret;
+}
+
+int bch2_trigger_reflink_p(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old,
+ struct bkey_s new,
+ unsigned flags)
+{
+ if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
+ (flags & BTREE_TRIGGER_INSERT)) {
+ struct bch_reflink_p *v = bkey_s_to_reflink_p(new).v;
+
+ v->front_pad = v->back_pad = 0;
+ }
+
+ return trigger_run_overwrite_then_insert(__trigger_reflink_p, trans, btree_id, level, old, new, flags);
+}
+
/* indirect extents */
int bch2_reflink_v_invalid(struct bch_fs *c, struct bkey_s_c k,
@@ -104,32 +282,26 @@ bool bch2_reflink_v_merge(struct bch_fs *c, struct bkey_s _l, struct bkey_s_c _r
}
#endif
-static inline void check_indirect_extent_deleting(struct bkey_i *new, unsigned *flags)
+static inline void check_indirect_extent_deleting(struct bkey_s new, unsigned *flags)
{
if ((*flags & BTREE_TRIGGER_INSERT) && !*bkey_refcount(new)) {
- new->k.type = KEY_TYPE_deleted;
- new->k.size = 0;
- set_bkey_val_u64s(&new->k, 0);;
+ new.k->type = KEY_TYPE_deleted;
+ new.k->size = 0;
+ set_bkey_val_u64s(new.k, 0);
*flags &= ~BTREE_TRIGGER_INSERT;
}
}
-int bch2_trans_mark_reflink_v(struct btree_trans *trans,
- enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
- unsigned flags)
+int bch2_trigger_reflink_v(struct btree_trans *trans,
+ enum btree_id btree_id, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
{
- check_indirect_extent_deleting(new, &flags);
-
- if (old.k->type == KEY_TYPE_reflink_v &&
- new->k.type == KEY_TYPE_reflink_v &&
- old.k->u64s == new->k.u64s &&
- !memcmp(bkey_s_c_to_reflink_v(old).v->start,
- bkey_i_to_reflink_v(new)->v.start,
- bkey_val_bytes(&new->k) - 8))
- return 0;
+ if ((flags & BTREE_TRIGGER_TRANSACTIONAL) &&
+ (flags & BTREE_TRIGGER_INSERT))
+ check_indirect_extent_deleting(new, &flags);
- return bch2_trans_mark_extent(trans, btree_id, level, old, new, flags);
+ return bch2_trigger_extent(trans, btree_id, level, old, new, flags);
}
/* indirect inline data */
@@ -152,9 +324,9 @@ void bch2_indirect_inline_data_to_text(struct printbuf *out,
min(datalen, 32U), d.v->data);
}
-int bch2_trans_mark_indirect_inline_data(struct btree_trans *trans,
+int bch2_trigger_indirect_inline_data(struct btree_trans *trans,
enum btree_id btree_id, unsigned level,
- struct bkey_s_c old, struct bkey_i *new,
+ struct bkey_s_c old, struct bkey_s new,
unsigned flags)
{
check_indirect_extent_deleting(new, &flags);
@@ -197,7 +369,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans,
set_bkey_val_bytes(&r_v->k, sizeof(__le64) + bkey_val_bytes(&orig->k));
- refcount = bkey_refcount(r_v);
+ refcount = bkey_refcount(bkey_i_to_s(r_v));
*refcount = 0;
memcpy(refcount + 1, &orig->v, bkey_val_bytes(&orig->k));
@@ -314,6 +486,13 @@ s64 bch2_remap_range(struct bch_fs *c,
bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot);
+ if (dst_inum.inum < src_inum.inum) {
+ /* Avoid some lock cycle transaction restarts */
+ ret = bch2_btree_iter_traverse(&dst_iter);
+ if (ret)
+ continue;
+ }
+
dst_done = dst_iter.pos.offset - dst_start.offset;
src_want = POS(src_start.inode, src_start.offset + dst_done);
bch2_btree_iter_set_pos(&src_iter, src_want);
@@ -366,9 +545,7 @@ s64 bch2_remap_range(struct bch_fs *c,
min(src_k.k->p.offset - src_want.offset,
dst_end.offset - dst_iter.pos.offset));
- ret = bch2_bkey_set_needs_rebalance(c, new_dst.k,
- opts.background_target,
- opts.background_compression) ?:
+ ret = bch2_bkey_set_needs_rebalance(c, new_dst.k, &opts) ?:
bch2_extent_update(trans, dst_inum, &dst_iter,
new_dst.k, &disk_res,
new_i_size, i_sectors_delta,
@@ -398,7 +575,7 @@ s64 bch2_remap_range(struct bch_fs *c,
inode_u.bi_size = new_i_size;
ret2 = bch2_inode_write(trans, &inode_iter, &inode_u) ?:
bch2_trans_commit(trans, NULL, NULL,
- BTREE_INSERT_NOFAIL);
+ BCH_TRANS_COMMIT_no_enospc);
}
bch2_trans_iter_exit(trans, &inode_iter);
diff --git a/fs/bcachefs/reflink.h b/fs/bcachefs/reflink.h
index 8ccf3f9c4939..4d8867289717 100644
--- a/fs/bcachefs/reflink.h
+++ b/fs/bcachefs/reflink.h
@@ -9,13 +9,14 @@ int bch2_reflink_p_invalid(struct bch_fs *, struct bkey_s_c,
void bch2_reflink_p_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
bool bch2_reflink_p_merge(struct bch_fs *, struct bkey_s, struct bkey_s_c);
+int bch2_trigger_reflink_p(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_reflink_p ((struct bkey_ops) { \
.key_invalid = bch2_reflink_p_invalid, \
.val_to_text = bch2_reflink_p_to_text, \
.key_merge = bch2_reflink_p_merge, \
- .trans_trigger = bch2_trans_mark_reflink_p, \
- .atomic_trigger = bch2_mark_reflink_p, \
+ .trigger = bch2_trigger_reflink_p, \
.min_val_size = 16, \
})
@@ -23,15 +24,14 @@ int bch2_reflink_v_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_reflink_v_to_text(struct printbuf *, struct bch_fs *,
struct bkey_s_c);
-int bch2_trans_mark_reflink_v(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *, unsigned);
+int bch2_trigger_reflink_v(struct btree_trans *, enum btree_id, unsigned,
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_reflink_v ((struct bkey_ops) { \
.key_invalid = bch2_reflink_v_invalid, \
.val_to_text = bch2_reflink_v_to_text, \
.swab = bch2_ptr_swab, \
- .trans_trigger = bch2_trans_mark_reflink_v, \
- .atomic_trigger = bch2_mark_extent, \
+ .trigger = bch2_trigger_reflink_v, \
.min_val_size = 8, \
})
@@ -39,15 +39,15 @@ int bch2_indirect_inline_data_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
void bch2_indirect_inline_data_to_text(struct printbuf *,
struct bch_fs *, struct bkey_s_c);
-int bch2_trans_mark_indirect_inline_data(struct btree_trans *,
+int bch2_trigger_indirect_inline_data(struct btree_trans *,
enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_i *,
+ struct bkey_s_c, struct bkey_s,
unsigned);
#define bch2_bkey_ops_indirect_inline_data ((struct bkey_ops) { \
.key_invalid = bch2_indirect_inline_data_invalid, \
.val_to_text = bch2_indirect_inline_data_to_text, \
- .trans_trigger = bch2_trans_mark_indirect_inline_data, \
+ .trigger = bch2_trigger_indirect_inline_data, \
.min_val_size = 8, \
})
@@ -63,13 +63,13 @@ static inline const __le64 *bkey_refcount_c(struct bkey_s_c k)
}
}
-static inline __le64 *bkey_refcount(struct bkey_i *k)
+static inline __le64 *bkey_refcount(struct bkey_s k)
{
- switch (k->k.type) {
+ switch (k.k->type) {
case KEY_TYPE_reflink_v:
- return &bkey_i_to_reflink_v(k)->v.refcount;
+ return &bkey_s_to_reflink_v(k).v->refcount;
case KEY_TYPE_indirect_inline_data:
- return &bkey_i_to_indirect_inline_data(k)->v.refcount;
+ return &bkey_s_to_indirect_inline_data(k).v->refcount;
default:
return NULL;
}
diff --git a/fs/bcachefs/reflink_format.h b/fs/bcachefs/reflink_format.h
new file mode 100644
index 000000000000..6772eebb1fc6
--- /dev/null
+++ b/fs/bcachefs/reflink_format.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_REFLINK_FORMAT_H
+#define _BCACHEFS_REFLINK_FORMAT_H
+
+struct bch_reflink_p {
+ struct bch_val v;
+ __le64 idx;
+ /*
+ * A reflink pointer might point to an indirect extent which is then
+ * later split (by copygc or rebalance). If we only pointed to part of
+ * the original indirect extent, and then one of the fragments is
+ * outside the range we point to, we'd leak a refcount: so when creating
+ * reflink pointers, we need to store pad values to remember the full
+ * range we were taking a reference on.
+ */
+ __le32 front_pad;
+ __le32 back_pad;
+} __packed __aligned(8);
+
+struct bch_reflink_v {
+ struct bch_val v;
+ __le64 refcount;
+ union bch_extent_entry start[0];
+ __u64 _data[];
+} __packed __aligned(8);
+
+struct bch_indirect_inline_data {
+ struct bch_val v;
+ __le64 refcount;
+ u8 data[];
+};
+
+#endif /* _BCACHEFS_REFLINK_FORMAT_H */
diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c
index 2008fe8bf706..cc2672c12031 100644
--- a/fs/bcachefs/replicas.c
+++ b/fs/bcachefs/replicas.c
@@ -9,9 +9,15 @@
static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *,
struct bch_replicas_cpu *);
+/* Some (buggy!) compilers don't allow memcmp to be passed as a pointer */
+static int bch2_memcmp(const void *l, const void *r, size_t size)
+{
+ return memcmp(l, r, size);
+}
+
/* Replicas tracking - in memory: */
-static void verify_replicas_entry(struct bch_replicas_entry *e)
+static void verify_replicas_entry(struct bch_replicas_entry_v1 *e)
{
#ifdef CONFIG_BCACHEFS_DEBUG
unsigned i;
@@ -26,49 +32,39 @@ static void verify_replicas_entry(struct bch_replicas_entry *e)
#endif
}
-void bch2_replicas_entry_sort(struct bch_replicas_entry *e)
+void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *e)
{
bubble_sort(e->devs, e->nr_devs, u8_cmp);
}
static void bch2_cpu_replicas_sort(struct bch_replicas_cpu *r)
{
- eytzinger0_sort(r->entries, r->nr, r->entry_size, memcmp, NULL);
+ eytzinger0_sort(r->entries, r->nr, r->entry_size, bch2_memcmp, NULL);
}
static void bch2_replicas_entry_v0_to_text(struct printbuf *out,
struct bch_replicas_entry_v0 *e)
{
- unsigned i;
-
- if (e->data_type < BCH_DATA_NR)
- prt_printf(out, "%s", bch2_data_types[e->data_type]);
- else
- prt_printf(out, "(invalid data type %u)", e->data_type);
+ bch2_prt_data_type(out, e->data_type);
prt_printf(out, ": %u [", e->nr_devs);
- for (i = 0; i < e->nr_devs; i++)
+ for (unsigned i = 0; i < e->nr_devs; i++)
prt_printf(out, i ? " %u" : "%u", e->devs[i]);
prt_printf(out, "]");
}
void bch2_replicas_entry_to_text(struct printbuf *out,
- struct bch_replicas_entry *e)
+ struct bch_replicas_entry_v1 *e)
{
- unsigned i;
-
- if (e->data_type < BCH_DATA_NR)
- prt_printf(out, "%s", bch2_data_types[e->data_type]);
- else
- prt_printf(out, "(invalid data type %u)", e->data_type);
+ bch2_prt_data_type(out, e->data_type);
prt_printf(out, ": %u/%u [", e->nr_required, e->nr_devs);
- for (i = 0; i < e->nr_devs; i++)
+ for (unsigned i = 0; i < e->nr_devs; i++)
prt_printf(out, i ? " %u" : "%u", e->devs[i]);
prt_printf(out, "]");
}
-int bch2_replicas_entry_validate(struct bch_replicas_entry *r,
+int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r,
struct bch_sb *sb,
struct printbuf *err)
{
@@ -98,7 +94,7 @@ bad:
void bch2_cpu_replicas_to_text(struct printbuf *out,
struct bch_replicas_cpu *r)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_cpu_replicas_entry(r, e) {
@@ -111,7 +107,7 @@ void bch2_cpu_replicas_to_text(struct printbuf *out,
}
static void extent_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry *r)
+ struct bch_replicas_entry_v1 *r)
{
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
const union bch_extent_entry *entry;
@@ -131,7 +127,7 @@ static void extent_to_replicas(struct bkey_s_c k,
}
static void stripe_to_replicas(struct bkey_s_c k,
- struct bch_replicas_entry *r)
+ struct bch_replicas_entry_v1 *r)
{
struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k);
const struct bch_extent_ptr *ptr;
@@ -144,7 +140,7 @@ static void stripe_to_replicas(struct bkey_s_c k,
r->devs[r->nr_devs++] = ptr->dev;
}
-void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
+void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *e,
struct bkey_s_c k)
{
e->nr_devs = 0;
@@ -169,12 +165,10 @@ void bch2_bkey_to_replicas(struct bch_replicas_entry *e,
bch2_replicas_entry_sort(e);
}
-void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
+void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *e,
enum bch_data_type data_type,
struct bch_devs_list devs)
{
- unsigned i;
-
BUG_ON(!data_type ||
data_type == BCH_DATA_sb ||
data_type >= BCH_DATA_NR);
@@ -183,8 +177,8 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
e->nr_devs = 0;
e->nr_required = 1;
- for (i = 0; i < devs.nr; i++)
- e->devs[e->nr_devs++] = devs.devs[i];
+ darray_for_each(devs, i)
+ e->devs[e->nr_devs++] = *i;
bch2_replicas_entry_sort(e);
}
@@ -192,7 +186,7 @@ void bch2_devlist_to_replicas(struct bch_replicas_entry *e,
static struct bch_replicas_cpu
cpu_replicas_add_entry(struct bch_fs *c,
struct bch_replicas_cpu *old,
- struct bch_replicas_entry *new_entry)
+ struct bch_replicas_entry_v1 *new_entry)
{
unsigned i;
struct bch_replicas_cpu new = {
@@ -225,7 +219,7 @@ cpu_replicas_add_entry(struct bch_fs *c,
}
static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
int idx, entry_size = replicas_entry_bytes(search);
@@ -243,7 +237,7 @@ static inline int __replicas_entry_idx(struct bch_replicas_cpu *r,
}
int bch2_replicas_entry_idx(struct bch_fs *c,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
bch2_replicas_entry_sort(search);
@@ -251,13 +245,13 @@ int bch2_replicas_entry_idx(struct bch_fs *c,
}
static bool __replicas_has_entry(struct bch_replicas_cpu *r,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
return __replicas_entry_idx(r, search) >= 0;
}
bool bch2_replicas_marked(struct bch_fs *c,
- struct bch_replicas_entry *search)
+ struct bch_replicas_entry_v1 *search)
{
bool marked;
@@ -374,7 +368,7 @@ err:
static unsigned reserve_journal_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
unsigned journal_res_u64s = 0;
/* nr_inodes: */
@@ -399,7 +393,7 @@ static unsigned reserve_journal_replicas(struct bch_fs *c,
noinline
static int bch2_mark_replicas_slowpath(struct bch_fs *c,
- struct bch_replicas_entry *new_entry)
+ struct bch_replicas_entry_v1 *new_entry)
{
struct bch_replicas_cpu new_r, new_gc;
int ret = 0;
@@ -464,7 +458,7 @@ err:
goto out;
}
-int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry *r)
+int bch2_mark_replicas(struct bch_fs *c, struct bch_replicas_entry_v1 *r)
{
return likely(bch2_replicas_marked(c, r))
? 0 : bch2_mark_replicas_slowpath(c, r);
@@ -515,7 +509,7 @@ int bch2_replicas_gc_end(struct bch_fs *c, int ret)
int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
unsigned i = 0;
lockdep_assert_held(&c->replicas_gc_lock);
@@ -590,7 +584,7 @@ retry:
}
for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
if (e->data_type == BCH_DATA_journal ||
@@ -621,7 +615,7 @@ retry:
}
int bch2_replicas_set_usage(struct bch_fs *c,
- struct bch_replicas_entry *r,
+ struct bch_replicas_entry_v1 *r,
u64 sectors)
{
int ret, idx = bch2_replicas_entry_idx(c, r);
@@ -654,7 +648,7 @@ static int
__bch2_sb_replicas_to_cpu_replicas(struct bch_sb_field_replicas *sb_r,
struct bch_replicas_cpu *cpu_r)
{
- struct bch_replicas_entry *e, *dst;
+ struct bch_replicas_entry_v1 *e, *dst;
unsigned nr = 0, entry_size = 0, idx = 0;
for_each_replicas_entry(sb_r, e) {
@@ -692,7 +686,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
nr++;
}
- entry_size += sizeof(struct bch_replicas_entry) -
+ entry_size += sizeof(struct bch_replicas_entry_v1) -
sizeof(struct bch_replicas_entry_v0);
cpu_r->entries = kcalloc(nr, entry_size, GFP_KERNEL);
@@ -703,7 +697,7 @@ __bch2_sb_replicas_v0_to_cpu_replicas(struct bch_sb_field_replicas_v0 *sb_r,
cpu_r->entry_size = entry_size;
for_each_replicas_entry(sb_r, e) {
- struct bch_replicas_entry *dst =
+ struct bch_replicas_entry_v1 *dst =
cpu_replicas_entry(cpu_r, idx++);
dst->data_type = e->data_type;
@@ -747,7 +741,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c,
{
struct bch_sb_field_replicas_v0 *sb_r;
struct bch_replicas_entry_v0 *dst;
- struct bch_replicas_entry *src;
+ struct bch_replicas_entry_v1 *src;
size_t bytes;
bytes = sizeof(struct bch_sb_field_replicas);
@@ -785,7 +779,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c,
struct bch_replicas_cpu *r)
{
struct bch_sb_field_replicas *sb_r;
- struct bch_replicas_entry *dst, *src;
+ struct bch_replicas_entry_v1 *dst, *src;
bool need_v1 = false;
size_t bytes;
@@ -833,10 +827,10 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
sort_cmp_size(cpu_r->entries,
cpu_r->nr,
cpu_r->entry_size,
- memcmp, NULL);
+ bch2_memcmp, NULL);
for (i = 0; i < cpu_r->nr; i++) {
- struct bch_replicas_entry *e =
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(cpu_r, i);
int ret = bch2_replicas_entry_validate(e, sb, err);
@@ -844,7 +838,7 @@ static int bch2_cpu_replicas_validate(struct bch_replicas_cpu *cpu_r,
return ret;
if (i + 1 < cpu_r->nr) {
- struct bch_replicas_entry *n =
+ struct bch_replicas_entry_v1 *n =
cpu_replicas_entry(cpu_r, i + 1);
BUG_ON(memcmp(e, n, cpu_r->entry_size) > 0);
@@ -881,7 +875,7 @@ static void bch2_sb_replicas_to_text(struct printbuf *out,
struct bch_sb_field *f)
{
struct bch_sb_field_replicas *r = field_to_type(f, replicas);
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
bool first = true;
for_each_replicas_entry(r, e) {
@@ -943,7 +937,7 @@ const struct bch_sb_field_ops bch_sb_field_ops_replicas_v0 = {
bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs,
unsigned flags, bool print)
{
- struct bch_replicas_entry *e;
+ struct bch_replicas_entry_v1 *e;
bool ret = true;
percpu_down_read(&c->mark_lock);
@@ -1003,7 +997,7 @@ unsigned bch2_sb_dev_has_data(struct bch_sb *sb, unsigned dev)
replicas_v0 = bch2_sb_field_get(sb, replicas_v0);
if (replicas) {
- struct bch_replicas_entry *r;
+ struct bch_replicas_entry_v1 *r;
for_each_replicas_entry(replicas, r)
for (i = 0; i < r->nr_devs; i++)
diff --git a/fs/bcachefs/replicas.h b/fs/bcachefs/replicas.h
index f70a642775d1..654a4b26d3a3 100644
--- a/fs/bcachefs/replicas.h
+++ b/fs/bcachefs/replicas.h
@@ -6,28 +6,28 @@
#include "eytzinger.h"
#include "replicas_types.h"
-void bch2_replicas_entry_sort(struct bch_replicas_entry *);
+void bch2_replicas_entry_sort(struct bch_replicas_entry_v1 *);
void bch2_replicas_entry_to_text(struct printbuf *,
- struct bch_replicas_entry *);
-int bch2_replicas_entry_validate(struct bch_replicas_entry *,
+ struct bch_replicas_entry_v1 *);
+int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *,
struct bch_sb *, struct printbuf *);
void bch2_cpu_replicas_to_text(struct printbuf *, struct bch_replicas_cpu *);
-static inline struct bch_replicas_entry *
+static inline struct bch_replicas_entry_v1 *
cpu_replicas_entry(struct bch_replicas_cpu *r, unsigned i)
{
return (void *) r->entries + r->entry_size * i;
}
int bch2_replicas_entry_idx(struct bch_fs *,
- struct bch_replicas_entry *);
+ struct bch_replicas_entry_v1 *);
-void bch2_devlist_to_replicas(struct bch_replicas_entry *,
+void bch2_devlist_to_replicas(struct bch_replicas_entry_v1 *,
enum bch_data_type,
struct bch_devs_list);
-bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry *);
+bool bch2_replicas_marked(struct bch_fs *, struct bch_replicas_entry_v1 *);
int bch2_mark_replicas(struct bch_fs *,
- struct bch_replicas_entry *);
+ struct bch_replicas_entry_v1 *);
static inline struct replicas_delta *
replicas_delta_next(struct replicas_delta *d)
@@ -37,9 +37,9 @@ replicas_delta_next(struct replicas_delta *d)
int bch2_replicas_delta_list_mark(struct bch_fs *, struct replicas_delta_list *);
-void bch2_bkey_to_replicas(struct bch_replicas_entry *, struct bkey_s_c);
+void bch2_bkey_to_replicas(struct bch_replicas_entry_v1 *, struct bkey_s_c);
-static inline void bch2_replicas_entry_cached(struct bch_replicas_entry *e,
+static inline void bch2_replicas_entry_cached(struct bch_replicas_entry_v1 *e,
unsigned dev)
{
e->data_type = BCH_DATA_cached;
@@ -59,7 +59,7 @@ int bch2_replicas_gc_start(struct bch_fs *, unsigned);
int bch2_replicas_gc2(struct bch_fs *);
int bch2_replicas_set_usage(struct bch_fs *,
- struct bch_replicas_entry *,
+ struct bch_replicas_entry_v1 *,
u64);
#define for_each_cpu_replicas_entry(_r, _i) \
diff --git a/fs/bcachefs/replicas_types.h b/fs/bcachefs/replicas_types.h
index 5cfff489bbc3..ac90d142c4e8 100644
--- a/fs/bcachefs/replicas_types.h
+++ b/fs/bcachefs/replicas_types.h
@@ -5,12 +5,12 @@
struct bch_replicas_cpu {
unsigned nr;
unsigned entry_size;
- struct bch_replicas_entry *entries;
+ struct bch_replicas_entry_v1 *entries;
};
struct replicas_delta {
s64 delta;
- struct bch_replicas_entry r;
+ struct bch_replicas_entry_v1 r;
} __packed;
struct replicas_delta_list {
@@ -21,7 +21,7 @@ struct replicas_delta_list {
u64 nr_inodes;
u64 persistent_reserved[BCH_REPLICAS_MAX];
struct {} memset_end;
- struct replicas_delta d[0];
+ struct replicas_delta d[];
};
#endif /* _BCACHEFS_REPLICAS_TYPES_H */
diff --git a/fs/bcachefs/sb-clean.c b/fs/bcachefs/sb-clean.c
index c76ad8ea5e4a..b6bf0ebe7e84 100644
--- a/fs/bcachefs/sb-clean.c
+++ b/fs/bcachefs/sb-clean.c
@@ -191,13 +191,10 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry **end,
u64 journal_seq)
{
- struct bch_dev *ca;
- unsigned i, dev;
-
percpu_down_read(&c->mark_lock);
if (!journal_seq) {
- for (i = 0; i < ARRAY_SIZE(c->usage); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(c->usage); i++)
bch2_fs_usage_acc_to_base(c, i);
} else {
bch2_fs_usage_acc_to_base(c, journal_seq & JOURNAL_BUF_MASK);
@@ -210,7 +207,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->entry.type = BCH_JSET_ENTRY_usage;
u->entry.btree_id = BCH_FS_USAGE_inodes;
- u->v = cpu_to_le64(c->usage_base->nr_inodes);
+ u->v = cpu_to_le64(c->usage_base->b.nr_inodes);
}
{
@@ -223,7 +220,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->v = cpu_to_le64(atomic64_read(&c->key_version));
}
- for (i = 0; i < BCH_REPLICAS_MAX; i++) {
+ for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) {
struct jset_entry_usage *u =
container_of(jset_entry_init(end, sizeof(*u)),
struct jset_entry_usage, entry);
@@ -234,8 +231,8 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]);
}
- for (i = 0; i < c->replicas.nr; i++) {
- struct bch_replicas_entry *e =
+ for (unsigned i = 0; i < c->replicas.nr; i++) {
+ struct bch_replicas_entry_v1 *e =
cpu_replicas_entry(&c->replicas, i);
struct jset_entry_data_usage *u =
container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs),
@@ -247,7 +244,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
"embedded variable length struct");
}
- for_each_member_device(ca, c, dev) {
+ for_each_member_device(c, ca) {
unsigned b = sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR;
struct jset_entry_dev_usage *u =
@@ -255,10 +252,9 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
struct jset_entry_dev_usage, entry);
u->entry.type = BCH_JSET_ENTRY_dev_usage;
- u->dev = cpu_to_le32(dev);
- u->buckets_ec = cpu_to_le64(ca->usage_base->buckets_ec);
+ u->dev = cpu_to_le32(ca->dev_idx);
- for (i = 0; i < BCH_DATA_NR; i++) {
+ for (unsigned i = 0; i < BCH_DATA_NR; i++) {
u->d[i].buckets = cpu_to_le64(ca->usage_base->d[i].buckets);
u->d[i].sectors = cpu_to_le64(ca->usage_base->d[i].sectors);
u->d[i].fragmented = cpu_to_le64(ca->usage_base->d[i].fragmented);
@@ -267,7 +263,7 @@ void bch2_journal_super_entries_add_common(struct bch_fs *c,
percpu_up_read(&c->mark_lock);
- for (i = 0; i < 2; i++) {
+ for (unsigned i = 0; i < 2; i++) {
struct jset_entry_clock *clock =
container_of(jset_entry_init(end, sizeof(*clock)),
struct jset_entry_clock, entry);
diff --git a/fs/bcachefs/counters.c b/fs/bcachefs/sb-counters.c
index 02a996e06a64..7dc898761bb3 100644
--- a/fs/bcachefs/counters.c
+++ b/fs/bcachefs/sb-counters.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include "bcachefs.h"
#include "super-io.h"
-#include "counters.h"
+#include "sb-counters.h"
/* BCH_SB_FIELD_counters */
diff --git a/fs/bcachefs/counters.h b/fs/bcachefs/sb-counters.h
index 4778aa19bf34..81f8aec9fcb1 100644
--- a/fs/bcachefs/counters.h
+++ b/fs/bcachefs/sb-counters.h
@@ -1,11 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _BCACHEFS_COUNTERS_H
-#define _BCACHEFS_COUNTERS_H
+#ifndef _BCACHEFS_SB_COUNTERS_H
+#define _BCACHEFS_SB_COUNTERS_H
#include "bcachefs.h"
#include "super-io.h"
-
int bch2_sb_counters_to_cpu(struct bch_fs *);
int bch2_sb_counters_from_cpu(struct bch_fs *);
@@ -14,4 +13,4 @@ int bch2_fs_counters_init(struct bch_fs *);
extern const struct bch_sb_field_ops bch_sb_field_ops_counters;
-#endif // _BCACHEFS_COUNTERS_H
+#endif // _BCACHEFS_SB_COUNTERS_H
diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h
new file mode 100644
index 000000000000..62ea478215d0
--- /dev/null
+++ b/fs/bcachefs/sb-counters_format.h
@@ -0,0 +1,98 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SB_COUNTERS_FORMAT_H
+#define _BCACHEFS_SB_COUNTERS_FORMAT_H
+
+#define BCH_PERSISTENT_COUNTERS() \
+ x(io_read, 0) \
+ x(io_write, 1) \
+ x(io_move, 2) \
+ x(bucket_invalidate, 3) \
+ x(bucket_discard, 4) \
+ x(bucket_alloc, 5) \
+ x(bucket_alloc_fail, 6) \
+ x(btree_cache_scan, 7) \
+ x(btree_cache_reap, 8) \
+ x(btree_cache_cannibalize, 9) \
+ x(btree_cache_cannibalize_lock, 10) \
+ x(btree_cache_cannibalize_lock_fail, 11) \
+ x(btree_cache_cannibalize_unlock, 12) \
+ x(btree_node_write, 13) \
+ x(btree_node_read, 14) \
+ x(btree_node_compact, 15) \
+ x(btree_node_merge, 16) \
+ x(btree_node_split, 17) \
+ x(btree_node_rewrite, 18) \
+ x(btree_node_alloc, 19) \
+ x(btree_node_free, 20) \
+ x(btree_node_set_root, 21) \
+ x(btree_path_relock_fail, 22) \
+ x(btree_path_upgrade_fail, 23) \
+ x(btree_reserve_get_fail, 24) \
+ x(journal_entry_full, 25) \
+ x(journal_full, 26) \
+ x(journal_reclaim_finish, 27) \
+ x(journal_reclaim_start, 28) \
+ x(journal_write, 29) \
+ x(read_promote, 30) \
+ x(read_bounce, 31) \
+ x(read_split, 33) \
+ x(read_retry, 32) \
+ x(read_reuse_race, 34) \
+ x(move_extent_read, 35) \
+ x(move_extent_write, 36) \
+ x(move_extent_finish, 37) \
+ x(move_extent_fail, 38) \
+ x(move_extent_start_fail, 39) \
+ x(copygc, 40) \
+ x(copygc_wait, 41) \
+ x(gc_gens_end, 42) \
+ x(gc_gens_start, 43) \
+ x(trans_blocked_journal_reclaim, 44) \
+ x(trans_restart_btree_node_reused, 45) \
+ x(trans_restart_btree_node_split, 46) \
+ x(trans_restart_fault_inject, 47) \
+ x(trans_restart_iter_upgrade, 48) \
+ x(trans_restart_journal_preres_get, 49) \
+ x(trans_restart_journal_reclaim, 50) \
+ x(trans_restart_journal_res_get, 51) \
+ x(trans_restart_key_cache_key_realloced, 52) \
+ x(trans_restart_key_cache_raced, 53) \
+ x(trans_restart_mark_replicas, 54) \
+ x(trans_restart_mem_realloced, 55) \
+ x(trans_restart_memory_allocation_failure, 56) \
+ x(trans_restart_relock, 57) \
+ x(trans_restart_relock_after_fill, 58) \
+ x(trans_restart_relock_key_cache_fill, 59) \
+ x(trans_restart_relock_next_node, 60) \
+ x(trans_restart_relock_parent_for_fill, 61) \
+ x(trans_restart_relock_path, 62) \
+ x(trans_restart_relock_path_intent, 63) \
+ x(trans_restart_too_many_iters, 64) \
+ x(trans_restart_traverse, 65) \
+ x(trans_restart_upgrade, 66) \
+ x(trans_restart_would_deadlock, 67) \
+ x(trans_restart_would_deadlock_write, 68) \
+ x(trans_restart_injected, 69) \
+ x(trans_restart_key_cache_upgrade, 70) \
+ x(trans_traverse_all, 71) \
+ x(transaction_commit, 72) \
+ x(write_super, 73) \
+ x(trans_restart_would_deadlock_recursion_limit, 74) \
+ x(trans_restart_write_buffer_flush, 75) \
+ x(trans_restart_split_race, 76) \
+ x(write_buffer_flush_slowpath, 77) \
+ x(write_buffer_flush_sync, 78)
+
+enum bch_persistent_counters {
+#define x(t, n, ...) BCH_COUNTER_##t,
+ BCH_PERSISTENT_COUNTERS()
+#undef x
+ BCH_COUNTER_NR
+};
+
+struct bch_sb_field_counters {
+ struct bch_sb_field field;
+ __le64 d[];
+};
+
+#endif /* _BCACHEFS_SB_COUNTERS_FORMAT_H */
diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c
index 4919237bbe73..441dcb1bf160 100644
--- a/fs/bcachefs/sb-downgrade.c
+++ b/fs/bcachefs/sb-downgrade.c
@@ -12,33 +12,105 @@
#include "sb-errors.h"
#include "super-io.h"
+#define RECOVERY_PASS_ALL_FSCK BIT_ULL(63)
+
/*
- * Downgrade table:
- * When dowgrading past certain versions, we need to run certain recovery passes
- * and fix certain errors:
+ * Upgrade, downgrade tables - run certain recovery passes, fix certain errors
*
* x(version, recovery_passes, errors...)
*/
+#define UPGRADE_TABLE() \
+ x(backpointers, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(inode_v3, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(unwritten_extents, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(bucket_gens, \
+ BIT_ULL(BCH_RECOVERY_PASS_bucket_gens_init)| \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(lru_v2, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(fragmentation_lru, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(no_bps_in_alloc_keys, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot_trees, \
+ RECOVERY_PASS_ALL_FSCK) \
+ x(snapshot_skiplists, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_snapshots), \
+ BCH_FSCK_ERR_snapshot_bad_depth, \
+ BCH_FSCK_ERR_snapshot_bad_skiplist) \
+ x(deleted_inodes, \
+ BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \
+ BCH_FSCK_ERR_unlinked_inode_not_on_deleted_list) \
+ x(rebalance_work, \
+ BIT_ULL(BCH_RECOVERY_PASS_set_fs_needs_rebalance))
#define DOWNGRADE_TABLE()
-struct downgrade_entry {
+struct upgrade_downgrade_entry {
u64 recovery_passes;
u16 version;
u16 nr_errors;
const u16 *errors;
};
-#define x(ver, passes, ...) static const u16 ver_##errors[] = { __VA_ARGS__ };
+#define x(ver, passes, ...) static const u16 upgrade_##ver##_errors[] = { __VA_ARGS__ };
+UPGRADE_TABLE()
+#undef x
+
+static const struct upgrade_downgrade_entry upgrade_table[] = {
+#define x(ver, passes, ...) { \
+ .recovery_passes = passes, \
+ .version = bcachefs_metadata_version_##ver,\
+ .nr_errors = ARRAY_SIZE(upgrade_##ver##_errors), \
+ .errors = upgrade_##ver##_errors, \
+},
+UPGRADE_TABLE()
+#undef x
+};
+
+void bch2_sb_set_upgrade(struct bch_fs *c,
+ unsigned old_version,
+ unsigned new_version)
+{
+ lockdep_assert_held(&c->sb_lock);
+
+ struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext);
+
+ for (const struct upgrade_downgrade_entry *i = upgrade_table;
+ i < upgrade_table + ARRAY_SIZE(upgrade_table);
+ i++)
+ if (i->version > old_version && i->version <= new_version) {
+ u64 passes = i->recovery_passes;
+
+ if (passes & RECOVERY_PASS_ALL_FSCK)
+ passes |= bch2_fsck_recovery_passes();
+ passes &= ~RECOVERY_PASS_ALL_FSCK;
+
+ ext->recovery_passes_required[0] |=
+ cpu_to_le64(bch2_recovery_passes_to_stable(passes));
+
+ for (const u16 *e = i->errors;
+ e < i->errors + i->nr_errors;
+ e++) {
+ __set_bit(*e, c->sb.errors_silent);
+ ext->errors_silent[*e / 64] |= cpu_to_le64(BIT_ULL(*e % 64));
+ }
+ }
+}
+
+#define x(ver, passes, ...) static const u16 downgrade_ver_##errors[] = { __VA_ARGS__ };
DOWNGRADE_TABLE()
#undef x
-static const struct downgrade_entry downgrade_table[] = {
+static const struct upgrade_downgrade_entry downgrade_table[] = {
#define x(ver, passes, ...) { \
.recovery_passes = passes, \
.version = bcachefs_metadata_version_##ver,\
- .nr_errors = ARRAY_SIZE(ver_##errors), \
- .errors = ver_##errors, \
+ .nr_errors = ARRAY_SIZE(downgrade_##ver##_errors), \
+ .errors = downgrade_##ver##_errors, \
},
DOWNGRADE_TABLE()
#undef x
@@ -118,7 +190,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c)
darray_char table = {};
int ret = 0;
- for (const struct downgrade_entry *src = downgrade_table;
+ for (const struct upgrade_downgrade_entry *src = downgrade_table;
src < downgrade_table + ARRAY_SIZE(downgrade_table);
src++) {
if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version)))
diff --git a/fs/bcachefs/sb-downgrade.h b/fs/bcachefs/sb-downgrade.h
index bc48fd2ca70e..57e6c916fc73 100644
--- a/fs/bcachefs/sb-downgrade.h
+++ b/fs/bcachefs/sb-downgrade.h
@@ -5,6 +5,7 @@
extern const struct bch_sb_field_ops bch_sb_field_ops_downgrade;
int bch2_sb_downgrade_update(struct bch_fs *);
+void bch2_sb_set_upgrade(struct bch_fs *, unsigned, unsigned);
void bch2_sb_set_downgrade(struct bch_fs *, unsigned, unsigned);
#endif /* _BCACHEFS_SB_DOWNGRADE_H */
diff --git a/fs/bcachefs/sb-errors_types.h b/fs/bcachefs/sb-errors_types.h
index 3504c2d09c29..c08aacdfd073 100644
--- a/fs/bcachefs/sb-errors_types.h
+++ b/fs/bcachefs/sb-errors_types.h
@@ -248,7 +248,9 @@
x(root_inode_not_dir, 240) \
x(dir_loop, 241) \
x(hash_table_key_duplicate, 242) \
- x(hash_table_key_wrong_offset, 243)
+ x(hash_table_key_wrong_offset, 243) \
+ x(unlinked_inode_not_on_deleted_list, 244) \
+ x(reflink_p_front_pad_bad, 245)
enum bch_sb_error_id {
#define x(t, n) BCH_FSCK_ERR_##t = n,
diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c
index bed0f857fe5b..eff5ce18c69c 100644
--- a/fs/bcachefs/sb-members.c
+++ b/fs/bcachefs/sb-members.c
@@ -235,6 +235,11 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "(never)");
prt_newline(out);
+ prt_printf(out, "Last superblock write:");
+ prt_tab(out);
+ prt_u64(out, le64_to_cpu(m.seq));
+ prt_newline(out);
+
prt_printf(out, "State:");
prt_tab(out);
prt_printf(out, "%s",
@@ -246,7 +251,7 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Data allowed:");
prt_tab(out);
if (BCH_MEMBER_DATA_ALLOWED(&m))
- prt_bitflags(out, bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
+ prt_bitflags(out, __bch2_data_types, BCH_MEMBER_DATA_ALLOWED(&m));
else
prt_printf(out, "(none)");
prt_newline(out);
@@ -254,11 +259,16 @@ static void member_to_text(struct printbuf *out,
prt_printf(out, "Has data:");
prt_tab(out);
if (data_have)
- prt_bitflags(out, bch2_data_types, data_have);
+ prt_bitflags(out, __bch2_data_types, data_have);
else
prt_printf(out, "(none)");
prt_newline(out);
+ prt_str(out, "Durability:");
+ prt_tab(out);
+ prt_printf(out, "%llu", BCH_MEMBER_DURABILITY(&m) ? BCH_MEMBER_DURABILITY(&m) - 1 : 1);
+ prt_newline(out);
+
prt_printf(out, "Discard:");
prt_tab(out);
prt_printf(out, "%llu", BCH_MEMBER_DISCARD(&m));
@@ -353,14 +363,12 @@ const struct bch_sb_field_ops bch_sb_field_ops_members_v2 = {
void bch2_sb_members_from_cpu(struct bch_fs *c)
{
struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
- struct bch_dev *ca;
- unsigned i, e;
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL) {
- struct bch_member *m = __bch2_members_v2_get_mut(mi, i);
+ for_each_member_device_rcu(c, ca, NULL) {
+ struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx);
- for (e = 0; e < BCH_MEMBER_ERROR_NR; e++)
+ for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++)
m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e]));
}
rcu_read_unlock();
@@ -413,7 +421,7 @@ void bch2_dev_errors_reset(struct bch_dev *ca)
m = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx);
for (unsigned i = 0; i < ARRAY_SIZE(m->errors_at_reset); i++)
m->errors_at_reset[i] = cpu_to_le64(atomic64_read(&ca->errors[i]));
- m->errors_reset_time = ktime_get_real_seconds();
+ m->errors_reset_time = cpu_to_le64(ktime_get_real_seconds());
bch2_write_super(c);
mutex_unlock(&c->sb_lock);
diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h
index 03613e3eb8e3..be0a94183271 100644
--- a/fs/bcachefs/sb-members.h
+++ b/fs/bcachefs/sb-members.h
@@ -2,6 +2,8 @@
#ifndef _BCACHEFS_SB_MEMBERS_H
#define _BCACHEFS_SB_MEMBERS_H
+#include "darray.h"
+
extern char * const bch2_member_error_strs[];
static inline struct bch_member *
@@ -47,23 +49,18 @@ static inline unsigned dev_mask_nr(const struct bch_devs_mask *devs)
static inline bool bch2_dev_list_has_dev(struct bch_devs_list devs,
unsigned dev)
{
- unsigned i;
-
- for (i = 0; i < devs.nr; i++)
- if (devs.devs[i] == dev)
+ darray_for_each(devs, i)
+ if (*i == dev)
return true;
-
return false;
}
static inline void bch2_dev_list_drop_dev(struct bch_devs_list *devs,
unsigned dev)
{
- unsigned i;
-
- for (i = 0; i < devs->nr; i++)
- if (devs->devs[i] == dev) {
- array_remove_item(devs->devs, devs->nr, i);
+ darray_for_each(*devs, i)
+ if (*i == dev) {
+ darray_remove_item(devs, i);
return;
}
}
@@ -72,40 +69,48 @@ static inline void bch2_dev_list_add_dev(struct bch_devs_list *devs,
unsigned dev)
{
if (!bch2_dev_list_has_dev(*devs, dev)) {
- BUG_ON(devs->nr >= ARRAY_SIZE(devs->devs));
- devs->devs[devs->nr++] = dev;
+ BUG_ON(devs->nr >= ARRAY_SIZE(devs->data));
+ devs->data[devs->nr++] = dev;
}
}
static inline struct bch_devs_list bch2_dev_list_single(unsigned dev)
{
- return (struct bch_devs_list) { .nr = 1, .devs[0] = dev };
+ return (struct bch_devs_list) { .nr = 1, .data[0] = dev };
}
-static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, unsigned *iter,
- const struct bch_devs_mask *mask)
+static inline struct bch_dev *__bch2_next_dev_idx(struct bch_fs *c, unsigned idx,
+ const struct bch_devs_mask *mask)
{
struct bch_dev *ca = NULL;
- while ((*iter = mask
- ? find_next_bit(mask->d, c->sb.nr_devices, *iter)
- : *iter) < c->sb.nr_devices &&
- !(ca = rcu_dereference_check(c->devs[*iter],
+ while ((idx = mask
+ ? find_next_bit(mask->d, c->sb.nr_devices, idx)
+ : idx) < c->sb.nr_devices &&
+ !(ca = rcu_dereference_check(c->devs[idx],
lockdep_is_held(&c->state_lock))))
- (*iter)++;
+ idx++;
return ca;
}
-#define for_each_member_device_rcu(ca, c, iter, mask) \
- for ((iter) = 0; ((ca) = __bch2_next_dev((c), &(iter), mask)); (iter)++)
+static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev *ca,
+ const struct bch_devs_mask *mask)
+{
+ return __bch2_next_dev_idx(c, ca ? ca->dev_idx + 1 : 0, mask);
+}
-static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter)
+#define for_each_member_device_rcu(_c, _ca, _mask) \
+ for (struct bch_dev *_ca = NULL; \
+ (_ca = __bch2_next_dev((_c), _ca, (_mask)));)
+
+static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca)
{
- struct bch_dev *ca;
+ if (ca)
+ percpu_ref_put(&ca->ref);
rcu_read_lock();
- if ((ca = __bch2_next_dev(c, iter, NULL)))
+ if ((ca = __bch2_next_dev(c, ca, NULL)))
percpu_ref_get(&ca->ref);
rcu_read_unlock();
@@ -115,41 +120,42 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, unsigned *iter
/*
* If you break early, you must drop your ref on the current device
*/
-#define for_each_member_device(ca, c, iter) \
- for ((iter) = 0; \
- (ca = bch2_get_next_dev(c, &(iter))); \
- percpu_ref_put(&ca->ref), (iter)++)
+#define __for_each_member_device(_c, _ca) \
+ for (; (_ca = bch2_get_next_dev(_c, _ca));)
+
+#define for_each_member_device(_c, _ca) \
+ for (struct bch_dev *_ca = NULL; \
+ (_ca = bch2_get_next_dev(_c, _ca));)
static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c,
- unsigned *iter,
- int state_mask)
+ struct bch_dev *ca,
+ unsigned state_mask)
{
- struct bch_dev *ca;
+ if (ca)
+ percpu_ref_put(&ca->io_ref);
rcu_read_lock();
- while ((ca = __bch2_next_dev(c, iter, NULL)) &&
+ while ((ca = __bch2_next_dev(c, ca, NULL)) &&
(!((1 << ca->mi.state) & state_mask) ||
!percpu_ref_tryget(&ca->io_ref)))
- (*iter)++;
+ ;
rcu_read_unlock();
return ca;
}
-#define __for_each_online_member(ca, c, iter, state_mask) \
- for ((iter) = 0; \
- (ca = bch2_get_next_online_dev(c, &(iter), state_mask)); \
- percpu_ref_put(&ca->io_ref), (iter)++)
+#define __for_each_online_member(_c, _ca, state_mask) \
+ for (struct bch_dev *_ca = NULL; \
+ (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));)
-#define for_each_online_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, ~0)
+#define for_each_online_member(c, ca) \
+ __for_each_online_member(c, ca, ~0)
-#define for_each_rw_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, 1 << BCH_MEMBER_STATE_rw)
+#define for_each_rw_member(c, ca) \
+ __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw))
-#define for_each_readable_member(ca, c, iter) \
- __for_each_online_member(ca, c, iter, \
- (1 << BCH_MEMBER_STATE_rw)|(1 << BCH_MEMBER_STATE_ro))
+#define for_each_readable_member(c, ca) \
+ __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro))
/*
* If a key exists that references a device, the device won't be going away and
@@ -175,11 +181,9 @@ static inline struct bch_dev *bch_dev_locked(struct bch_fs *c, unsigned idx)
static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c)
{
struct bch_devs_mask devs;
- struct bch_dev *ca;
- unsigned i;
memset(&devs, 0, sizeof(devs));
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
__set_bit(ca->dev_idx, devs.d);
return devs;
}
diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c
index 97790445e67a..3a494c5d1247 100644
--- a/fs/bcachefs/six.c
+++ b/fs/bcachefs/six.c
@@ -324,101 +324,57 @@ bool six_relock_ip(struct six_lock *lock, enum six_lock_type type,
}
EXPORT_SYMBOL_GPL(six_relock_ip);
-#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
+#ifdef CONFIG_BCACHEFS_SIX_OPTIMISTIC_SPIN
-static inline bool six_can_spin_on_owner(struct six_lock *lock)
+static inline bool six_owner_running(struct six_lock *lock)
{
- struct task_struct *owner;
- bool ret;
-
- if (need_resched())
- return false;
-
+ /*
+ * When there's no owner, we might have preempted between the owner
+ * acquiring the lock and setting the owner field. If we're an RT task
+ * that will live-lock because we won't let the owner complete.
+ */
rcu_read_lock();
- owner = READ_ONCE(lock->owner);
- ret = !owner || owner_on_cpu(owner);
+ struct task_struct *owner = READ_ONCE(lock->owner);
+ bool ret = owner ? owner_on_cpu(owner) : !rt_task(current);
rcu_read_unlock();
return ret;
}
-static inline bool six_spin_on_owner(struct six_lock *lock,
- struct task_struct *owner,
- u64 end_time)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+ struct six_lock_waiter *wait,
+ enum six_lock_type type)
{
- bool ret = true;
unsigned loop = 0;
-
- rcu_read_lock();
- while (lock->owner == owner) {
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_
- * checking lock->owner still matches owner. If that fails,
- * owner might point to freed memory. If it still matches,
- * the rcu_read_lock() ensures the memory stays valid.
- */
- barrier();
-
- if (!owner_on_cpu(owner) || need_resched()) {
- ret = false;
- break;
- }
-
- if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
- six_set_bitmask(lock, SIX_LOCK_NOSPIN);
- ret = false;
- break;
- }
-
- cpu_relax();
- }
- rcu_read_unlock();
-
- return ret;
-}
-
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
-{
- struct task_struct *task = current;
u64 end_time;
if (type == SIX_LOCK_write)
return false;
- preempt_disable();
- if (!six_can_spin_on_owner(lock))
- goto fail;
+ if (lock->wait_list.next != &wait->list)
+ return false;
- if (!osq_lock(&lock->osq))
- goto fail;
+ if (atomic_read(&lock->state) & SIX_LOCK_NOSPIN)
+ return false;
+ preempt_disable();
end_time = sched_clock() + 10 * NSEC_PER_USEC;
- while (1) {
- struct task_struct *owner;
-
+ while (!need_resched() && six_owner_running(lock)) {
/*
- * If there's an owner, wait for it to either
- * release the lock or go to sleep.
+ * Ensures that writes to the waitlist entry happen after we see
+ * wait->lock_acquired: pairs with the smp_store_release in
+ * __six_lock_wakeup
*/
- owner = READ_ONCE(lock->owner);
- if (owner && !six_spin_on_owner(lock, owner, end_time))
- break;
-
- if (do_six_trylock(lock, type, false)) {
- osq_unlock(&lock->osq);
+ if (smp_load_acquire(&wait->lock_acquired)) {
preempt_enable();
return true;
}
- /*
- * When there's no owner, we might have preempted between the
- * owner acquiring the lock and setting the owner field. If
- * we're an RT task that will live-lock because we won't let
- * the owner complete.
- */
- if (!owner && (need_resched() || rt_task(task)))
+ if (!(++loop & 0xf) && (time_after64(sched_clock(), end_time))) {
+ six_set_bitmask(lock, SIX_LOCK_NOSPIN);
break;
+ }
/*
* The cpu_relax() call is a compiler barrier which forces
@@ -429,24 +385,15 @@ static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type
cpu_relax();
}
- osq_unlock(&lock->osq);
-fail:
preempt_enable();
-
- /*
- * If we fell out of the spin path because of need_resched(),
- * reschedule now, before we try-lock again. This avoids getting
- * scheduled out right after we obtained the lock.
- */
- if (need_resched())
- schedule();
-
return false;
}
-#else /* CONFIG_SIX_LOCK_SPIN_ON_OWNER */
+#else /* CONFIG_LOCK_SPIN_ON_OWNER */
-static inline bool six_optimistic_spin(struct six_lock *lock, enum six_lock_type type)
+static inline bool six_optimistic_spin(struct six_lock *lock,
+ struct six_lock_waiter *wait,
+ enum six_lock_type type)
{
return false;
}
@@ -470,9 +417,6 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
trace_contention_begin(lock, 0);
lock_contended(&lock->dep_map, ip);
- if (six_optimistic_spin(lock, type))
- goto out;
-
wait->task = current;
wait->lock_want = type;
wait->lock_acquired = false;
@@ -510,6 +454,9 @@ static int six_lock_slowpath(struct six_lock *lock, enum six_lock_type type,
ret = 0;
}
+ if (six_optimistic_spin(lock, wait, type))
+ goto out;
+
while (1) {
set_current_state(TASK_UNINTERRUPTIBLE);
diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h
index 4c268b0b8316..68d46fd7f391 100644
--- a/fs/bcachefs/six.h
+++ b/fs/bcachefs/six.h
@@ -15,7 +15,7 @@
* will have to take write locks for the full duration of the operation.
*
* But by adding an intent state, which is exclusive with other intent locks but
- * not with readers, we can take intent locks at thte start of the operation,
+ * not with readers, we can take intent locks at the start of the operation,
* and then take write locks only for the actual update to each individual
* nodes, without deadlocking.
*
@@ -65,8 +65,8 @@
*
* Reentrancy:
*
- * Six locks are not by themselves reentrent, but have counters for both the
- * read and intent states that can be used to provide reentrency by an upper
+ * Six locks are not by themselves reentrant, but have counters for both the
+ * read and intent states that can be used to provide reentrancy by an upper
* layer that tracks held locks. If a lock is known to already be held in the
* read or intent state, six_lock_increment() can be used to bump the "lock
* held in this state" counter, increasing the number of unlock calls that
@@ -127,10 +127,6 @@
#include <linux/sched.h>
#include <linux/types.h>
-#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
-#include <linux/osq_lock.h>
-#endif
-
enum six_lock_type {
SIX_LOCK_read,
SIX_LOCK_intent,
@@ -143,9 +139,6 @@ struct six_lock {
unsigned intent_lock_recurse;
struct task_struct *owner;
unsigned __percpu *readers;
-#ifdef CONFIG_SIX_LOCK_SPIN_ON_OWNER
- struct optimistic_spin_queue osq;
-#endif
raw_spinlock_t wait_lock;
struct list_head wait_list;
#ifdef CONFIG_DEBUG_LOCK_ALLOC
diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c
index 5dac038f0851..ac6ba04d5521 100644
--- a/fs/bcachefs/snapshot.c
+++ b/fs/bcachefs/snapshot.c
@@ -123,7 +123,7 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
struct snapshot_table *t;
bool ret;
- EBUG_ON(c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_snapshots);
+ EBUG_ON(c->recovery_pass_done <= BCH_RECOVERY_PASS_check_snapshots);
rcu_read_lock();
t = rcu_dereference(c->snapshots);
@@ -276,7 +276,7 @@ static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
mutex_unlock(&c->snapshot_table_lock);
}
-int bch2_mark_snapshot(struct btree_trans *trans,
+static int __bch2_mark_snapshot(struct btree_trans *trans,
enum btree_id btree, unsigned level,
struct bkey_s_c old, struct bkey_s_c new,
unsigned flags)
@@ -318,7 +318,7 @@ int bch2_mark_snapshot(struct btree_trans *trans,
__set_is_ancestor_bitmap(c, id);
if (BCH_SNAPSHOT_DELETED(s.v)) {
- set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
bch2_delete_dead_snapshots_async(c);
}
@@ -330,6 +330,14 @@ err:
return ret;
}
+int bch2_mark_snapshot(struct btree_trans *trans,
+ enum btree_id btree, unsigned level,
+ struct bkey_s_c old, struct bkey_s new,
+ unsigned flags)
+{
+ return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
+}
+
int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
struct bch_snapshot *s)
{
@@ -459,7 +467,6 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
struct bch_fs *c = trans->c;
struct btree_iter iter;
struct bkey_s_c k;
- struct bkey_s_c_subvolume s;
bool found = false;
int ret;
@@ -468,7 +475,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
if (k.k->type != KEY_TYPE_subvolume)
continue;
- s = bkey_s_c_to_subvolume(k);
+ struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
continue;
if (!BCH_SUBVOLUME_SNAP(s.v)) {
@@ -582,19 +589,13 @@ fsck_err:
*/
int bch2_check_snapshot_trees(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
BTREE_ID_snapshot_trees, POS_MIN,
BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
check_snapshot_tree(trans, &iter, k)));
-
- if (ret)
- bch_err(c, "error %i checking snapshot trees", ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -727,7 +728,7 @@ static int check_snapshot(struct btree_trans *trans,
return 0;
memset(&s, 0, sizeof(s));
- memcpy(&s, k.v, bkey_val_bytes(k.k));
+ memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
id = le32_to_cpu(s.parent);
if (id) {
@@ -813,11 +814,10 @@ static int check_snapshot(struct btree_trans *trans,
real_depth = bch2_snapshot_depth(c, parent_id);
- if (le32_to_cpu(s.depth) != real_depth &&
- (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
- fsck_err(c, snapshot_bad_depth,
- "snapshot with incorrect depth field, should be %u:\n %s",
- real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+ if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
+ c, snapshot_bad_depth,
+ "snapshot with incorrect depth field, should be %u:\n %s",
+ real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
if (ret)
@@ -831,11 +831,9 @@ static int check_snapshot(struct btree_trans *trans,
if (ret < 0)
goto err;
- if (!ret &&
- (c->sb.version_upgrade_complete < bcachefs_metadata_version_snapshot_skiplists ||
- fsck_err(c, snapshot_bad_skiplist,
- "snapshot with bad skiplist field:\n %s",
- (bch2_bkey_val_to_text(&buf, c, k), buf.buf)))) {
+ if (fsck_err_on(!ret, c, snapshot_bad_skiplist,
+ "snapshot with bad skiplist field:\n %s",
+ (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
ret = PTR_ERR_OR_ZERO(u);
if (ret)
@@ -856,22 +854,17 @@ fsck_err:
int bch2_check_snapshots(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
/*
* We iterate backwards as checking/fixing the depth field requires that
* the parent's depth already be correct:
*/
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_reverse_commit(trans, iter,
- BTREE_ID_snapshots, POS_MAX,
- BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_snapshot(trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
+ BTREE_ID_snapshots, POS_MAX,
+ BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_snapshot(trans, &iter, k)));
+ bch_err_fn(c, ret);
return ret;
}
@@ -1060,6 +1053,8 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
n->v.subvol = cpu_to_le32(snapshot_subvols[i]);
n->v.tree = cpu_to_le32(tree);
n->v.depth = cpu_to_le32(depth);
+ n->v.btime.lo = cpu_to_le64(bch2_current_time(c));
+ n->v.btime.hi = 0;
for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
@@ -1067,7 +1062,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
- ret = bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
+ ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
if (ret)
goto err;
@@ -1315,7 +1310,6 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
struct bch_fs *c = trans->c;
u32 nr_deleted_ancestors = 0;
struct bkey_i_snapshot *s;
- u32 *i;
int ret;
if (k.k->type != KEY_TYPE_snapshot)
@@ -1368,23 +1362,19 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
int bch2_delete_dead_snapshots(struct bch_fs *c)
{
struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
- struct bkey_s_c_snapshot snap;
snapshot_id_list deleted = { 0 };
snapshot_id_list deleted_interior = { 0 };
- u32 *i, id;
+ u32 id;
int ret = 0;
- if (!test_and_clear_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags))
+ if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
return 0;
- if (!test_bit(BCH_FS_STARTED, &c->flags)) {
+ if (!test_bit(BCH_FS_started, &c->flags)) {
ret = bch2_fs_read_write_early(c);
- if (ret) {
- bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
+ bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
+ if (ret)
return ret;
- }
}
trans = bch2_trans_get(c);
@@ -1397,37 +1387,29 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
POS_MIN, 0, k,
NULL, NULL, 0,
bch2_delete_redundant_snapshot(trans, k));
- if (ret) {
- bch_err_msg(c, ret, "deleting redundant snapshots");
+ bch_err_msg(c, ret, "deleting redundant snapshots");
+ if (ret)
goto err;
- }
- ret = for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
bch2_snapshot_set_equiv(trans, k));
- if (ret) {
- bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
+ bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
+ if (ret)
goto err;
- }
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ({
if (k.k->type != KEY_TYPE_snapshot)
continue;
- snap = bkey_s_c_to_snapshot(k);
- if (BCH_SNAPSHOT_DELETED(snap.v)) {
- ret = snapshot_list_add(c, &deleted, k.k->p.offset);
- if (ret)
- break;
- }
- }
- bch2_trans_iter_exit(trans, &iter);
-
- if (ret) {
- bch_err_msg(c, ret, "walking snapshots");
+ BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)
+ ? snapshot_list_add(c, &deleted, k.k->p.offset)
+ : 0;
+ }));
+ bch_err_msg(c, ret, "walking snapshots");
+ if (ret)
goto err;
- }
for (id = 0; id < BTREE_ID_NR; id++) {
struct bpos last_pos = POS_MIN;
@@ -1449,36 +1431,36 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
ret = for_each_btree_key_commit(trans, iter,
id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL, BTREE_INSERT_NOFAIL,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc,
snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
for_each_btree_key_commit(trans, iter,
id, POS_MIN,
BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
- &res, NULL, BTREE_INSERT_NOFAIL,
+ &res, NULL, BCH_TRANS_COMMIT_no_enospc,
move_key_to_correct_snapshot(trans, &iter, k));
bch2_disk_reservation_put(c, &res);
darray_exit(&equiv_seen);
- if (ret) {
- bch_err_msg(c, ret, "deleting keys from dying snapshots");
+ bch_err_msg(c, ret, "deleting keys from dying snapshots");
+ if (ret)
goto err;
- }
}
bch2_trans_unlock(trans);
down_write(&c->snapshot_create_lock);
- for_each_btree_key(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k, ret) {
+ ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k, ({
u32 snapshot = k.k->p.offset;
u32 equiv = bch2_snapshot_equiv(c, snapshot);
- if (equiv != snapshot)
- snapshot_list_add(c, &deleted_interior, snapshot);
- }
- bch2_trans_iter_exit(trans, &iter);
+ equiv != snapshot
+ ? snapshot_list_add(c, &deleted_interior, snapshot)
+ : 0;
+ }));
+ bch_err_msg(c, ret, "walking snapshots");
if (ret)
goto err_create_lock;
@@ -1489,7 +1471,7 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
*/
ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
BTREE_ITER_INTENT, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
if (ret)
goto err_create_lock;
@@ -1497,19 +1479,17 @@ int bch2_delete_dead_snapshots(struct bch_fs *c)
darray_for_each(deleted, i) {
ret = commit_do(trans, NULL, NULL, 0,
bch2_snapshot_node_delete(trans, *i));
- if (ret) {
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ if (ret)
goto err_create_lock;
- }
}
darray_for_each(deleted_interior, i) {
ret = commit_do(trans, NULL, NULL, 0,
bch2_snapshot_node_delete(trans, *i));
- if (ret) {
- bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ bch_err_msg(c, ret, "deleting snapshot %u", *i);
+ if (ret)
goto err_create_lock;
- }
}
err_create_lock:
up_write(&c->snapshot_create_lock);
@@ -1517,8 +1497,7 @@ err:
darray_exit(&deleted_interior);
darray_exit(&deleted);
bch2_trans_put(trans);
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
@@ -1680,7 +1659,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct
if (BCH_SNAPSHOT_DELETED(snap.v) ||
bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
(ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
- set_bit(BCH_FS_NEED_DELETE_DEAD_SNAPSHOTS, &c->flags);
+ set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
return 0;
}
@@ -1689,25 +1668,20 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct
int bch2_snapshots_read(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
-
- ret = bch2_trans_run(c,
- for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
- bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
+ int ret = bch2_trans_run(c,
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
+ __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
bch2_snapshot_set_equiv(trans, k) ?:
bch2_check_snapshot_needs_deletion(trans, k)) ?:
- for_each_btree_key2(trans, iter, BTREE_ID_snapshots,
- POS_MIN, 0, k,
+ for_each_btree_key(trans, iter, BTREE_ID_snapshots,
+ POS_MIN, 0, k,
(set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
- if (ret)
- bch_err_fn(c, ret);
+ bch_err_fn(c, ret);
return ret;
}
void bch2_fs_snapshots_exit(struct bch_fs *c)
{
- kfree(rcu_dereference_protected(c->snapshots, true));
+ kvfree(rcu_dereference_protected(c->snapshots, true));
}
diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h
index f09a22f44239..7c66ffc06385 100644
--- a/fs/bcachefs/snapshot.h
+++ b/fs/bcachefs/snapshot.h
@@ -22,12 +22,12 @@ void bch2_snapshot_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c);
int bch2_snapshot_invalid(struct bch_fs *, struct bkey_s_c,
enum bkey_invalid_flags, struct printbuf *);
int bch2_mark_snapshot(struct btree_trans *, enum btree_id, unsigned,
- struct bkey_s_c, struct bkey_s_c, unsigned);
+ struct bkey_s_c, struct bkey_s, unsigned);
#define bch2_bkey_ops_snapshot ((struct bkey_ops) { \
.key_invalid = bch2_snapshot_invalid, \
.val_to_text = bch2_snapshot_to_text, \
- .atomic_trigger = bch2_mark_snapshot, \
+ .trigger = bch2_mark_snapshot, \
.min_val_size = 24, \
})
@@ -202,8 +202,6 @@ static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id)
static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
{
- u32 *i;
-
darray_for_each(*s, i)
if (*i == id)
return true;
@@ -212,8 +210,6 @@ static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id)
static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id)
{
- u32 *i;
-
darray_for_each(*s, i)
if (bch2_snapshot_is_ancestor(c, id, *i))
return true;
diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h
new file mode 100644
index 000000000000..aabcd3a74cd9
--- /dev/null
+++ b/fs/bcachefs/snapshot_format.h
@@ -0,0 +1,36 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SNAPSHOT_FORMAT_H
+#define _BCACHEFS_SNAPSHOT_FORMAT_H
+
+struct bch_snapshot {
+ struct bch_val v;
+ __le32 flags;
+ __le32 parent;
+ __le32 children[2];
+ __le32 subvol;
+ /* corresponds to a bch_snapshot_tree in BTREE_ID_snapshot_trees */
+ __le32 tree;
+ __le32 depth;
+ __le32 skip[3];
+ bch_le128 btime;
+};
+
+LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1)
+
+/* True if a subvolume points to this snapshot node: */
+LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2)
+
+/*
+ * Snapshot trees:
+ *
+ * The snapshot_trees btree gives us persistent indentifier for each tree of
+ * bch_snapshot nodes, and allow us to record and easily find the root/master
+ * subvolume that other snapshots were created from:
+ */
+struct bch_snapshot_tree {
+ struct bch_val v;
+ __le32 master_subvol;
+ __le32 root_snapshot;
+};
+
+#endif /* _BCACHEFS_SNAPSHOT_FORMAT_H */
diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h
index ae21a8cca1b4..fcaa5a888744 100644
--- a/fs/bcachefs/str_hash.h
+++ b/fs/bcachefs/str_hash.h
@@ -15,6 +15,16 @@
#include <crypto/hash.h>
#include <crypto/sha2.h>
+typedef unsigned __bitwise bch_str_hash_flags_t;
+
+enum bch_str_hash_flags {
+ __BCH_HASH_SET_MUST_CREATE,
+ __BCH_HASH_SET_MUST_REPLACE,
+};
+
+#define BCH_HASH_SET_MUST_CREATE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_CREATE)
+#define BCH_HASH_SET_MUST_REPLACE (__force bch_str_hash_flags_t) BIT(__BCH_HASH_SET_MUST_REPLACE)
+
static inline enum bch_str_hash_type
bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt)
{
@@ -150,21 +160,16 @@ static inline bool is_visible_key(struct bch_hash_desc desc, subvol_inum inum, s
}
static __always_inline int
-bch2_hash_lookup(struct btree_trans *trans,
+bch2_hash_lookup_in_snapshot(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum, const void *key,
- unsigned flags)
+ unsigned flags, u32 snapshot)
{
struct bkey_s_c k;
- u32 snapshot;
int ret;
- ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot);
- if (ret)
- return ret;
-
for_each_btree_key_upto_norestart(trans, *iter, desc.btree_id,
SPOS(inum.inum, desc.hash_key(info, key), snapshot),
POS(inum.inum, U64_MAX),
@@ -185,6 +190,19 @@ bch2_hash_lookup(struct btree_trans *trans,
}
static __always_inline int
+bch2_hash_lookup(struct btree_trans *trans,
+ struct btree_iter *iter,
+ const struct bch_hash_desc desc,
+ const struct bch_hash_info *info,
+ subvol_inum inum, const void *key,
+ unsigned flags)
+{
+ u32 snapshot;
+ return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?:
+ bch2_hash_lookup_in_snapshot(trans, iter, desc, info, inum, key, flags, snapshot);
+}
+
+static __always_inline int
bch2_hash_hole(struct btree_trans *trans,
struct btree_iter *iter,
const struct bch_hash_desc desc,
@@ -246,7 +264,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
const struct bch_hash_info *info,
subvol_inum inum, u32 snapshot,
struct bkey_i *insert,
- int flags,
+ bch_str_hash_flags_t str_hash_flags,
int update_flags)
{
struct btree_iter iter, slot = { NULL };
@@ -269,7 +287,7 @@ int bch2_hash_set_snapshot(struct btree_trans *trans,
}
if (!slot.path &&
- !(flags & BCH_HASH_SET_MUST_REPLACE))
+ !(str_hash_flags & BCH_HASH_SET_MUST_REPLACE))
bch2_trans_copy_iter(&slot, &iter);
if (k.k->type != KEY_TYPE_hash_whiteout)
@@ -287,16 +305,16 @@ found:
found = true;
not_found:
- if (!found && (flags & BCH_HASH_SET_MUST_REPLACE)) {
+ if (!found && (str_hash_flags & BCH_HASH_SET_MUST_REPLACE)) {
ret = -BCH_ERR_ENOENT_str_hash_set_must_replace;
- } else if (found && (flags & BCH_HASH_SET_MUST_CREATE)) {
+ } else if (found && (str_hash_flags & BCH_HASH_SET_MUST_CREATE)) {
ret = -EEXIST;
} else {
if (!found && slot.path)
swap(iter, slot);
insert->k.p = iter.pos;
- ret = bch2_trans_update(trans, &iter, insert, 0);
+ ret = bch2_trans_update(trans, &iter, insert, update_flags);
}
goto out;
@@ -307,7 +325,8 @@ int bch2_hash_set(struct btree_trans *trans,
const struct bch_hash_desc desc,
const struct bch_hash_info *info,
subvol_inum inum,
- struct bkey_i *insert, int flags)
+ struct bkey_i *insert,
+ bch_str_hash_flags_t str_hash_flags)
{
u32 snapshot;
int ret;
@@ -319,7 +338,7 @@ int bch2_hash_set(struct btree_trans *trans,
insert->k.p.inode = inum.inum;
return bch2_hash_set_snapshot(trans, desc, info, inum,
- snapshot, insert, flags, 0);
+ snapshot, insert, str_hash_flags, 0);
}
static __always_inline
diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c
index 22b34a8e4d6e..7c67c28d3ef8 100644
--- a/fs/bcachefs/subvolume.c
+++ b/fs/bcachefs/subvolume.c
@@ -37,11 +37,8 @@ static int check_subvol(struct btree_trans *trans,
return ret;
if (BCH_SUBVOLUME_UNLINKED(subvol.v)) {
- bch2_fs_lazy_rw(c);
-
ret = bch2_subvolume_delete(trans, iter->pos.offset);
- if (ret)
- bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
+ bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset);
return ret ?: -BCH_ERR_transaction_restart_nested;
}
@@ -82,17 +79,12 @@ fsck_err:
int bch2_check_subvols(struct bch_fs *c)
{
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret;
-
- ret = bch2_trans_run(c,
+ int ret = bch2_trans_run(c,
for_each_btree_key_commit(trans, iter,
- BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_LAZY_RW|BTREE_INSERT_NOFAIL,
- check_subvol(trans, &iter, k)));
- if (ret)
- bch_err_fn(c, ret);
+ BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
+ check_subvol(trans, &iter, k)));
+ bch_err_fn(c, ret);
return ret;
}
@@ -228,8 +220,6 @@ static int bch2_subvolume_reparent(struct btree_trans *trans,
*/
static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_delete)
{
- struct btree_iter iter;
- struct bkey_s_c k;
struct bch_subvolume s;
return lockrestart_do(trans,
@@ -237,7 +227,7 @@ static int bch2_subvolumes_reparent(struct btree_trans *trans, u32 subvolid_to_d
BTREE_ITER_CACHED, &s)) ?:
for_each_btree_key_commit(trans, iter,
BTREE_ID_subvolumes, POS_MIN, BTREE_ITER_PREFETCH, k,
- NULL, NULL, BTREE_INSERT_NOFAIL,
+ NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
bch2_subvolume_reparent(trans, &iter, k,
subvolid_to_delete, le32_to_cpu(s.parent)));
}
@@ -274,7 +264,7 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid)
{
return bch2_subvolumes_reparent(trans, subvolid) ?:
- commit_do(trans, NULL, NULL, BTREE_INSERT_NOFAIL,
+ commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
__bch2_subvolume_delete(trans, subvolid));
}
@@ -299,10 +289,9 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor
for (id = s.data; id < s.data + s.nr; id++) {
ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id));
- if (ret) {
- bch_err_msg(c, ret, "deleting subvolume %u", *id);
+ bch_err_msg(c, ret, "deleting subvolume %u", *id);
+ if (ret)
break;
- }
}
darray_exit(&s);
diff --git a/fs/bcachefs/subvolume_format.h b/fs/bcachefs/subvolume_format.h
new file mode 100644
index 000000000000..af79134b07d6
--- /dev/null
+++ b/fs/bcachefs/subvolume_format.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_SUBVOLUME_FORMAT_H
+#define _BCACHEFS_SUBVOLUME_FORMAT_H
+
+#define SUBVOL_POS_MIN POS(0, 1)
+#define SUBVOL_POS_MAX POS(0, S32_MAX)
+#define BCACHEFS_ROOT_SUBVOL 1
+
+struct bch_subvolume {
+ struct bch_val v;
+ __le32 flags;
+ __le32 snapshot;
+ __le64 inode;
+ /*
+ * Snapshot subvolumes form a tree, separate from the snapshot nodes
+ * tree - if this subvolume is a snapshot, this is the ID of the
+ * subvolume it was created from:
+ *
+ * This is _not_ necessarily the subvolume of the directory containing
+ * this subvolume:
+ */
+ __le32 parent;
+ __le32 pad;
+ bch_le128 otime;
+};
+
+LE32_BITMASK(BCH_SUBVOLUME_RO, struct bch_subvolume, flags, 0, 1)
+/*
+ * We need to know whether a subvolume is a snapshot so we can know whether we
+ * can delete it (or whether it should just be rm -rf'd)
+ */
+LE32_BITMASK(BCH_SUBVOLUME_SNAP, struct bch_subvolume, flags, 1, 2)
+LE32_BITMASK(BCH_SUBVOLUME_UNLINKED, struct bch_subvolume, flags, 2, 3)
+
+#endif /* _BCACHEFS_SUBVOLUME_FORMAT_H */
diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h
index 2d2e66a4e468..ae644adfc391 100644
--- a/fs/bcachefs/subvolume_types.h
+++ b/fs/bcachefs/subvolume_types.h
@@ -20,7 +20,11 @@ struct snapshot_t {
};
struct snapshot_table {
+#ifndef RUST_BINDGEN
DECLARE_FLEX_ARRAY(struct snapshot_t, s);
+#else
+ struct snapshot_t s[0];
+#endif
};
typedef struct {
diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c
index 4c98d8cc2a79..36988add581f 100644
--- a/fs/bcachefs/super-io.c
+++ b/fs/bcachefs/super-io.c
@@ -2,7 +2,6 @@
#include "bcachefs.h"
#include "checksum.h"
-#include "counters.h"
#include "disk_groups.h"
#include "ec.h"
#include "error.h"
@@ -13,6 +12,7 @@
#include "replicas.h"
#include "quota.h"
#include "sb-clean.h"
+#include "sb-counters.h"
#include "sb-downgrade.h"
#include "sb-errors.h"
#include "sb-members.h"
@@ -30,14 +30,12 @@ static const struct blk_holder_ops bch2_sb_handle_bdev_ops = {
struct bch2_metadata_version {
u16 version;
const char *name;
- u64 recovery_passes;
};
static const struct bch2_metadata_version bch2_metadata_versions[] = {
-#define x(n, v, _recovery_passes) { \
+#define x(n, v) { \
.version = v, \
.name = #n, \
- .recovery_passes = _recovery_passes, \
},
BCH_METADATA_VERSIONS()
#undef x
@@ -70,24 +68,6 @@ unsigned bch2_latest_compatible_version(unsigned v)
return v;
}
-u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
- unsigned old_version,
- unsigned new_version)
-{
- u64 ret = 0;
-
- for (const struct bch2_metadata_version *i = bch2_metadata_versions;
- i < bch2_metadata_versions + ARRAY_SIZE(bch2_metadata_versions);
- i++)
- if (i->version > old_version && i->version <= new_version) {
- if (i->recovery_passes & RECOVERY_PASS_ALL_FSCK)
- ret |= bch2_fsck_recovery_passes();
- ret |= i->recovery_passes;
- }
-
- return ret &= ~RECOVERY_PASS_ALL_FSCK;
-}
-
const char * const bch2_sb_fields[] = {
#define x(name, nr) #name,
BCH_SB_FIELDS()
@@ -101,8 +81,6 @@ static int bch2_sb_field_validate(struct bch_sb *, struct bch_sb_field *,
struct bch_sb_field *bch2_sb_field_get_id(struct bch_sb *sb,
enum bch_sb_field_type type)
{
- struct bch_sb_field *f;
-
/* XXX: need locking around superblock to access optional fields */
vstruct_for_each(sb, f)
@@ -164,8 +142,8 @@ void bch2_sb_field_delete(struct bch_sb_handle *sb,
void bch2_free_super(struct bch_sb_handle *sb)
{
kfree(sb->bio);
- if (!IS_ERR_OR_NULL(sb->bdev))
- blkdev_put(sb->bdev, sb->holder);
+ if (!IS_ERR_OR_NULL(sb->bdev_handle))
+ bdev_release(sb->bdev_handle);
kfree(sb->holder);
kfree(sb->sb_name);
@@ -192,8 +170,12 @@ int bch2_sb_realloc(struct bch_sb_handle *sb, unsigned u64s)
u64 max_bytes = 512 << sb->sb->layout.sb_max_size_bits;
if (new_bytes > max_bytes) {
- pr_err("%pg: superblock too big: want %zu but have %llu",
- sb->bdev, new_bytes, max_bytes);
+ struct printbuf buf = PRINTBUF;
+
+ prt_bdevname(&buf, sb->bdev);
+ prt_printf(&buf, ": superblock too big: want %zu but have %llu", new_bytes, max_bytes);
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
return -BCH_ERR_ENOSPC_sb;
}
}
@@ -241,14 +223,12 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb,
if (sb->fs_sb) {
struct bch_fs *c = container_of(sb, struct bch_fs, disk_sb);
- struct bch_dev *ca;
- unsigned i;
lockdep_assert_held(&c->sb_lock);
/* XXX: we're not checking that offline device have enough space */
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
struct bch_sb_handle *dev_sb = &ca->disk_sb;
if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) {
@@ -368,7 +348,6 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, struct printbuf *out,
int rw)
{
struct bch_sb *sb = disk_sb->sb;
- struct bch_sb_field *f;
struct bch_sb_field_members_v1 *mi;
enum bch_opt_id opt_id;
u16 block_size;
@@ -514,8 +493,6 @@ static void le_bitvector_to_cpu(unsigned long *dst, unsigned long *src, unsigned
static void bch2_sb_update(struct bch_fs *c)
{
struct bch_sb *src = c->disk_sb.sb;
- struct bch_dev *ca;
- unsigned i;
lockdep_assert_held(&c->sb_lock);
@@ -546,7 +523,7 @@ static void bch2_sb_update(struct bch_fs *c)
le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent,
sizeof(c->sb.errors_silent) * 8);
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
struct bch_member m = bch2_sb_member_get(src, ca->dev_idx);
ca->mi = bch2_mi_to_cpu(&m);
}
@@ -571,6 +548,7 @@ static int __copy_super(struct bch_sb_handle *dst_handle, struct bch_sb *src)
dst->time_base_lo = src->time_base_lo;
dst->time_base_hi = src->time_base_hi;
dst->time_precision = src->time_precision;
+ dst->write_time = src->write_time;
memcpy(dst->flags, src->flags, sizeof(dst->flags));
memcpy(dst->features, src->features, sizeof(dst->features));
@@ -634,7 +612,6 @@ int bch2_sb_from_fs(struct bch_fs *c, struct bch_dev *ca)
static int read_one_super(struct bch_sb_handle *sb, u64 offset, struct printbuf *err)
{
- struct bch_csum csum;
size_t bytes;
int ret;
reread:
@@ -650,7 +627,9 @@ reread:
if (!uuid_equal(&sb->sb->magic, &BCACHE_MAGIC) &&
!uuid_equal(&sb->sb->magic, &BCHFS_MAGIC)) {
- prt_printf(err, "Not a bcachefs superblock");
+ prt_str(err, "Not a bcachefs superblock (got magic ");
+ pr_uuid(err, sb->sb->magic.b);
+ prt_str(err, ")");
return -BCH_ERR_invalid_sb_magic;
}
@@ -673,17 +652,16 @@ reread:
goto reread;
}
- if (BCH_SB_CSUM_TYPE(sb->sb) >= BCH_CSUM_NR) {
+ enum bch_csum_type csum_type = BCH_SB_CSUM_TYPE(sb->sb);
+ if (csum_type >= BCH_CSUM_NR) {
prt_printf(err, "unknown checksum type %llu", BCH_SB_CSUM_TYPE(sb->sb));
return -BCH_ERR_invalid_sb_csum_type;
}
/* XXX: verify MACs */
- csum = csum_vstruct(NULL, BCH_SB_CSUM_TYPE(sb->sb),
- null_nonce(), sb->sb);
-
+ struct bch_csum csum = csum_vstruct(NULL, csum_type, null_nonce(), sb->sb);
if (bch2_crc_cmp(csum, sb->sb->csum)) {
- prt_printf(err, "bad checksum");
+ bch2_csum_err_msg(err, csum_type, sb->sb->csum, csum);
return -BCH_ERR_invalid_sb_csum;
}
@@ -692,12 +670,13 @@ reread:
return 0;
}
-int bch2_read_super(const char *path, struct bch_opts *opts,
- struct bch_sb_handle *sb)
+static int __bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb, bool ignore_notbchfs_msg)
{
u64 offset = opt_get(*opts, sb);
struct bch_sb_layout layout;
struct printbuf err = PRINTBUF;
+ struct printbuf err2 = PRINTBUF;
__le64 *i;
int ret;
#ifndef __KERNEL__
@@ -725,21 +704,22 @@ retry:
if (!opt_get(*opts, nochanges))
sb->mode |= BLK_OPEN_WRITE;
- sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (IS_ERR(sb->bdev) &&
- PTR_ERR(sb->bdev) == -EACCES &&
+ sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (IS_ERR(sb->bdev_handle) &&
+ PTR_ERR(sb->bdev_handle) == -EACCES &&
opt_get(*opts, read_only)) {
sb->mode &= ~BLK_OPEN_WRITE;
- sb->bdev = blkdev_get_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
- if (!IS_ERR(sb->bdev))
+ sb->bdev_handle = bdev_open_by_path(path, sb->mode, sb->holder, &bch2_sb_handle_bdev_ops);
+ if (!IS_ERR(sb->bdev_handle))
opt_set(*opts, nochanges, true);
}
- if (IS_ERR(sb->bdev)) {
- ret = PTR_ERR(sb->bdev);
- goto out;
+ if (IS_ERR(sb->bdev_handle)) {
+ ret = PTR_ERR(sb->bdev_handle);
+ goto err;
}
+ sb->bdev = sb->bdev_handle->bdev;
ret = bch2_sb_realloc(sb, 0);
if (ret) {
@@ -760,8 +740,14 @@ retry:
if (opt_defined(*opts, sb))
goto err;
- printk(KERN_ERR "bcachefs (%s): error reading default superblock: %s\n",
+ prt_printf(&err2, "bcachefs (%s): error reading default superblock: %s\n",
path, err.buf);
+ if (ret == -BCH_ERR_invalid_sb_magic && ignore_notbchfs_msg)
+ printk(KERN_INFO "%s", err2.buf);
+ else
+ printk(KERN_ERR "%s", err2.buf);
+
+ printbuf_exit(&err2);
printbuf_reset(&err);
/*
@@ -837,6 +823,20 @@ err_no_print:
goto out;
}
+int bch2_read_super(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ return __bch2_read_super(path, opts, sb, false);
+}
+
+/* provide a silenced version for mount.bcachefs */
+
+int bch2_read_super_silent(const char *path, struct bch_opts *opts,
+ struct bch_sb_handle *sb)
+{
+ return __bch2_read_super(path, opts, sb, true);
+}
+
/* write superblock: */
static void write_super_endio(struct bio *bio)
@@ -905,9 +905,8 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx)
int bch2_write_super(struct bch_fs *c)
{
struct closure *cl = &c->sb_write;
- struct bch_dev *ca;
struct printbuf err = PRINTBUF;
- unsigned i, sb = 0, nr_wrote;
+ unsigned sb = 0, nr_wrote;
struct bch_devs_mask sb_written;
bool wrote, can_mount_without_written, can_mount_with_written;
unsigned degraded_flags = BCH_FORCE_IF_DEGRADED;
@@ -929,9 +928,14 @@ int bch2_write_super(struct bch_fs *c)
le64_add_cpu(&c->disk_sb.sb->seq, 1);
- if (test_bit(BCH_FS_ERROR, &c->flags))
+ struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2);
+ for_each_online_member(c, ca)
+ __bch2_members_v2_get_mut(mi, ca->dev_idx)->seq = c->disk_sb.sb->seq;
+ c->disk_sb.sb->write_time = cpu_to_le64(ktime_get_real_seconds());
+
+ if (test_bit(BCH_FS_error, &c->flags))
SET_BCH_SB_HAS_ERRORS(c->disk_sb.sb, 1);
- if (test_bit(BCH_FS_TOPOLOGY_ERROR, &c->flags))
+ if (test_bit(BCH_FS_topology_error, &c->flags))
SET_BCH_SB_HAS_TOPOLOGY_ERRORS(c->disk_sb.sb, 1);
SET_BCH_SB_BIG_ENDIAN(c->disk_sb.sb, CPU_BIG_ENDIAN);
@@ -942,10 +946,10 @@ int bch2_write_super(struct bch_fs *c)
bch2_sb_errors_from_cpu(c);
bch2_sb_downgrade_update(c);
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
bch2_sb_from_fs(c, ca);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
printbuf_reset(&err);
ret = bch2_sb_validate(&ca->disk_sb, &err, WRITE);
@@ -966,16 +970,28 @@ int bch2_write_super(struct bch_fs *c)
if (!BCH_SB_INITIALIZED(c->disk_sb.sb))
goto out;
- for_each_online_member(ca, c, i) {
+ if (le16_to_cpu(c->disk_sb.sb->version) > bcachefs_metadata_version_current) {
+ struct printbuf buf = PRINTBUF;
+ prt_printf(&buf, "attempting to write superblock that wasn't version downgraded (");
+ bch2_version_to_text(&buf, le16_to_cpu(c->disk_sb.sb->version));
+ prt_str(&buf, " > ");
+ bch2_version_to_text(&buf, bcachefs_metadata_version_current);
+ prt_str(&buf, ")");
+ bch2_fs_fatal_error(c, "%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_sb_not_downgraded;
+ }
+
+ for_each_online_member(c, ca) {
__set_bit(ca->dev_idx, sb_written.d);
ca->sb_write_error = 0;
}
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
read_back_super(c, ca);
closure_sync(cl);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (ca->sb_write_error)
continue;
@@ -1002,7 +1018,7 @@ int bch2_write_super(struct bch_fs *c)
do {
wrote = false;
- for_each_online_member(ca, c, i)
+ for_each_online_member(c, ca)
if (!ca->sb_write_error &&
sb < ca->disk_sb.sb->layout.nr_superblocks) {
write_one_super(c, ca, sb);
@@ -1012,7 +1028,7 @@ int bch2_write_super(struct bch_fs *c)
sb++;
} while (wrote);
- for_each_online_member(ca, c, i) {
+ for_each_online_member(c, ca) {
if (ca->sb_write_error)
__clear_bit(ca->dev_idx, sb_written.d);
else
@@ -1024,7 +1040,7 @@ int bch2_write_super(struct bch_fs *c)
can_mount_with_written =
bch2_have_enough_devs(c, sb_written, degraded_flags, false);
- for (i = 0; i < ARRAY_SIZE(sb_written.d); i++)
+ for (unsigned i = 0; i < ARRAY_SIZE(sb_written.d); i++)
sb_written.d[i] = ~sb_written.d[i];
can_mount_without_written =
@@ -1073,13 +1089,22 @@ bool bch2_check_version_downgrade(struct bch_fs *c)
/*
* Downgrade, if superblock is at a higher version than currently
* supported:
+ *
+ * c->sb will be checked before we write the superblock, so update it as
+ * well:
*/
- if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current)
+ if (BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb) > bcachefs_metadata_version_current) {
SET_BCH_SB_VERSION_UPGRADE_COMPLETE(c->disk_sb.sb, bcachefs_metadata_version_current);
- if (c->sb.version > bcachefs_metadata_version_current)
+ c->sb.version_upgrade_complete = bcachefs_metadata_version_current;
+ }
+ if (c->sb.version > bcachefs_metadata_version_current) {
c->disk_sb.sb->version = cpu_to_le16(bcachefs_metadata_version_current);
- if (c->sb.version_min > bcachefs_metadata_version_current)
+ c->sb.version = bcachefs_metadata_version_current;
+ }
+ if (c->sb.version_min > bcachefs_metadata_version_current) {
c->disk_sb.sb->version_min = cpu_to_le16(bcachefs_metadata_version_current);
+ c->sb.version_min = bcachefs_metadata_version_current;
+ }
c->disk_sb.sb->compat[0] &= cpu_to_le64((1ULL << BCH_COMPAT_NR) - 1);
return ret;
}
@@ -1172,8 +1197,8 @@ static int bch2_sb_field_validate(struct bch_sb *sb, struct bch_sb_field *f,
return ret;
}
-void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
- struct bch_sb_field *f)
+void __bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
{
unsigned type = le32_to_cpu(f->type);
const struct bch_sb_field_ops *ops = bch2_sb_field_type_ops(type);
@@ -1181,6 +1206,15 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
if (!out->nr_tabstops)
printbuf_tabstop_push(out, 32);
+ if (ops->to_text)
+ ops->to_text(out, sb, f);
+}
+
+void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
+ struct bch_sb_field *f)
+{
+ unsigned type = le32_to_cpu(f->type);
+
if (type < BCH_SB_FIELD_NR)
prt_printf(out, "%s", bch2_sb_fields[type]);
else
@@ -1189,11 +1223,7 @@ void bch2_sb_field_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, " (size %zu):", vstruct_bytes(f));
prt_newline(out);
- if (ops->to_text) {
- printbuf_indent_add(out, 2);
- ops->to_text(out, sb, f);
- printbuf_indent_sub(out, 2);
- }
+ __bch2_sb_field_to_text(out, sb, f);
}
void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
@@ -1222,7 +1252,6 @@ void bch2_sb_layout_to_text(struct printbuf *out, struct bch_sb_layout *l)
void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
bool print_layout, unsigned fields)
{
- struct bch_sb_field *f;
u64 fields_have = 0;
unsigned nr_devices = 0;
@@ -1242,6 +1271,11 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
pr_uuid(out, sb->uuid.b);
prt_newline(out);
+ prt_printf(out, "Magic number:");
+ prt_tab(out);
+ pr_uuid(out, sb->magic.b);
+ prt_newline(out);
+
prt_str(out, "Device index:");
prt_tab(out);
prt_printf(out, "%u", sb->dev_idx);
@@ -1280,9 +1314,16 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb,
prt_printf(out, "%llu", le64_to_cpu(sb->seq));
prt_newline(out);
+ prt_printf(out, "Time of last write:");
+ prt_tab(out);
+ bch2_prt_datetime(out, le64_to_cpu(sb->write_time));
+ prt_newline(out);
+
prt_printf(out, "Superblock size:");
prt_tab(out);
- prt_printf(out, "%zu", vstruct_bytes(sb));
+ prt_units_u64(out, vstruct_bytes(sb));
+ prt_str(out, "/");
+ prt_units_u64(out, 512ULL << sb->layout.sb_max_size_bits);
prt_newline(out);
prt_printf(out, "Clean:");
diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h
index e41e5de531a0..95e80e06316b 100644
--- a/fs/bcachefs/super-io.h
+++ b/fs/bcachefs/super-io.h
@@ -19,10 +19,6 @@ static inline bool bch2_version_compatible(u16 version)
void bch2_version_to_text(struct printbuf *, unsigned);
unsigned bch2_latest_compatible_version(unsigned);
-u64 bch2_upgrade_recovery_passes(struct bch_fs *c,
- unsigned,
- unsigned);
-
static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f)
{
return le32_to_cpu(f->u64s) * sizeof(u64);
@@ -84,6 +80,7 @@ void bch2_free_super(struct bch_sb_handle *);
int bch2_sb_realloc(struct bch_sb_handle *, unsigned);
int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *);
+int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *);
int bch2_write_super(struct bch_fs *);
void __bch2_check_set_feature(struct bch_fs *, unsigned);
@@ -96,6 +93,8 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat)
bool bch2_check_version_downgrade(struct bch_fs *);
void bch2_sb_upgrade(struct bch_fs *, unsigned);
+void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
+ struct bch_sb_field *);
void bch2_sb_field_to_text(struct printbuf *, struct bch_sb *,
struct bch_sb_field *);
void bch2_sb_layout_to_text(struct printbuf *, struct bch_sb_layout *);
diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c
index 818ec467a06b..6b23e11825e6 100644
--- a/fs/bcachefs/super.c
+++ b/fs/bcachefs/super.c
@@ -23,7 +23,6 @@
#include "checksum.h"
#include "clock.h"
#include "compress.h"
-#include "counters.h"
#include "debug.h"
#include "disk_groups.h"
#include "ec.h"
@@ -49,6 +48,7 @@
#include "recovery.h"
#include "replicas.h"
#include "sb-clean.h"
+#include "sb-counters.h"
#include "sb-errors.h"
#include "sb-members.h"
#include "snapshot.h"
@@ -79,6 +79,36 @@ MODULE_SOFTDEP("pre: chacha20");
MODULE_SOFTDEP("pre: poly1305");
MODULE_SOFTDEP("pre: xxhash");
+const char * const bch2_fs_flag_strs[] = {
+#define x(n) #n,
+ BCH_FS_FLAGS()
+#undef x
+ NULL
+};
+
+void __bch2_print(struct bch_fs *c, const char *fmt, ...)
+{
+ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c);
+
+ va_list args;
+ va_start(args, fmt);
+ if (likely(!stdio)) {
+ vprintk(fmt, args);
+ } else {
+ unsigned long flags;
+
+ if (fmt[0] == KERN_SOH[0])
+ fmt += 2;
+
+ spin_lock_irqsave(&stdio->output_lock, flags);
+ prt_vprintf(&stdio->output_buf, fmt, args);
+ spin_unlock_irqrestore(&stdio->output_lock, flags);
+
+ wake_up(&stdio->output_wait);
+ }
+ va_end(args);
+}
+
#define KTYPE(type) \
static const struct attribute_group type ## _group = { \
.attrs = type ## _files \
@@ -134,14 +164,12 @@ static void __bch2_dev_read_only(struct bch_fs *, struct bch_dev *);
struct bch_fs *bch2_dev_to_fs(dev_t dev)
{
struct bch_fs *c;
- struct bch_dev *ca;
- unsigned i;
mutex_lock(&bch_fs_list_lock);
rcu_read_lock();
list_for_each_entry(c, &bch_fs_list, list)
- for_each_member_device_rcu(ca, c, i, NULL)
+ for_each_member_device_rcu(c, ca, NULL)
if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) {
closure_get(&c->cl);
goto found;
@@ -182,14 +210,13 @@ struct bch_fs *bch2_uuid_to_fs(__uuid_t uuid)
static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i, nr = 0, u64s =
+ unsigned nr = 0, u64s =
((sizeof(struct jset_entry_dev_usage) +
sizeof(struct jset_entry_dev_usage_type) * BCH_DATA_NR)) /
sizeof(u64);
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL)
+ for_each_member_device_rcu(c, ca, NULL)
nr++;
rcu_read_unlock();
@@ -216,8 +243,7 @@ static void bch2_dev_usage_journal_reserve(struct bch_fs *c)
static void __bch2_fs_read_only(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i, clean_passes = 0;
+ unsigned clean_passes = 0;
u64 seq = 0;
bch2_fs_ec_stop(c);
@@ -246,14 +272,14 @@ static void __bch2_fs_read_only(struct bch_fs *c)
journal_cur_seq(&c->journal));
if (test_bit(JOURNAL_REPLAY_DONE, &c->journal.flags) &&
- !test_bit(BCH_FS_EMERGENCY_RO, &c->flags))
- set_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
+ !test_bit(BCH_FS_emergency_ro, &c->flags))
+ set_bit(BCH_FS_clean_shutdown, &c->flags);
bch2_fs_journal_stop(&c->journal);
/*
* After stopping journal:
*/
- for_each_member_device(ca, c, i)
+ for_each_member_device(c, ca)
bch2_dev_allocator_remove(c, ca);
}
@@ -262,25 +288,27 @@ static void bch2_writes_disabled(struct percpu_ref *writes)
{
struct bch_fs *c = container_of(writes, struct bch_fs, writes);
- set_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
+ set_bit(BCH_FS_write_disable_complete, &c->flags);
wake_up(&bch2_read_only_wait);
}
#endif
void bch2_fs_read_only(struct bch_fs *c)
{
- if (!test_bit(BCH_FS_RW, &c->flags)) {
+ if (!test_bit(BCH_FS_rw, &c->flags)) {
bch2_journal_reclaim_stop(&c->journal);
return;
}
- BUG_ON(test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ BUG_ON(test_bit(BCH_FS_write_disable_complete, &c->flags));
+
+ bch_verbose(c, "going read-only");
/*
* Block new foreground-end write operations from starting - any new
* writes will return -EROFS:
*/
- set_bit(BCH_FS_GOING_RO, &c->flags);
+ set_bit(BCH_FS_going_ro, &c->flags);
#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_kill(&c->writes);
#else
@@ -300,33 +328,42 @@ void bch2_fs_read_only(struct bch_fs *c)
* that going RO is complete:
*/
wait_event(bch2_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags) ||
- test_bit(BCH_FS_EMERGENCY_RO, &c->flags));
+ test_bit(BCH_FS_write_disable_complete, &c->flags) ||
+ test_bit(BCH_FS_emergency_ro, &c->flags));
+
+ bool writes_disabled = test_bit(BCH_FS_write_disable_complete, &c->flags);
+ if (writes_disabled)
+ bch_verbose(c, "finished waiting for writes to stop");
__bch2_fs_read_only(c);
wait_event(bch2_read_only_wait,
- test_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags));
+ test_bit(BCH_FS_write_disable_complete, &c->flags));
- clear_bit(BCH_FS_WRITE_DISABLE_COMPLETE, &c->flags);
- clear_bit(BCH_FS_GOING_RO, &c->flags);
+ if (!writes_disabled)
+ bch_verbose(c, "finished waiting for writes to stop");
+
+ clear_bit(BCH_FS_write_disable_complete, &c->flags);
+ clear_bit(BCH_FS_going_ro, &c->flags);
+ clear_bit(BCH_FS_rw, &c->flags);
if (!bch2_journal_error(&c->journal) &&
- !test_bit(BCH_FS_ERROR, &c->flags) &&
- !test_bit(BCH_FS_EMERGENCY_RO, &c->flags) &&
- test_bit(BCH_FS_STARTED, &c->flags) &&
- test_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags) &&
+ !test_bit(BCH_FS_error, &c->flags) &&
+ !test_bit(BCH_FS_emergency_ro, &c->flags) &&
+ test_bit(BCH_FS_started, &c->flags) &&
+ test_bit(BCH_FS_clean_shutdown, &c->flags) &&
!c->opts.norecovery) {
BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal));
BUG_ON(atomic_read(&c->btree_cache.dirty));
BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty));
- BUG_ON(c->btree_write_buffer.state.nr);
+ BUG_ON(c->btree_write_buffer.inc.keys.nr);
+ BUG_ON(c->btree_write_buffer.flushing.keys.nr);
bch_verbose(c, "marking filesystem clean");
bch2_fs_mark_clean(c);
+ } else {
+ bch_verbose(c, "done going read-only, filesystem not clean");
}
-
- clear_bit(BCH_FS_RW, &c->flags);
}
static void bch2_fs_read_only_work(struct work_struct *work)
@@ -346,7 +383,7 @@ static void bch2_fs_read_only_async(struct bch_fs *c)
bool bch2_fs_emergency_read_only(struct bch_fs *c)
{
- bool ret = !test_and_set_bit(BCH_FS_EMERGENCY_RO, &c->flags);
+ bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags);
bch2_journal_halt(&c->journal);
bch2_fs_read_only_async(c);
@@ -383,28 +420,16 @@ static int bch2_fs_read_write_late(struct bch_fs *c)
static int __bch2_fs_read_write(struct bch_fs *c, bool early)
{
- struct bch_dev *ca;
- unsigned i;
int ret;
- if (test_bit(BCH_FS_INITIAL_GC_UNFIXED, &c->flags)) {
+ if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) {
bch_err(c, "cannot go rw, unfixed btree errors");
return -BCH_ERR_erofs_unfixed_errors;
}
- if (test_bit(BCH_FS_RW, &c->flags))
+ if (test_bit(BCH_FS_rw, &c->flags))
return 0;
- if (c->opts.norecovery)
- return -BCH_ERR_erofs_norecovery;
-
- /*
- * nochanges is used for fsck -n mode - we have to allow going rw
- * during recovery for that to work:
- */
- if (c->opts.nochanges && (!early || c->opts.read_only))
- return -BCH_ERR_erofs_nochanges;
-
bch_info(c, "going read-write");
ret = bch2_sb_members_v2_init(c);
@@ -415,7 +440,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
if (ret)
goto err;
- clear_bit(BCH_FS_CLEAN_SHUTDOWN, &c->flags);
+ clear_bit(BCH_FS_clean_shutdown, &c->flags);
/*
* First journal write must be a flush write: after a clean shutdown we
@@ -425,17 +450,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
*/
set_bit(JOURNAL_NEED_FLUSH_WRITE, &c->journal.flags);
- for_each_rw_member(ca, c, i)
+ for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
- set_bit(BCH_FS_RW, &c->flags);
- set_bit(BCH_FS_WAS_RW, &c->flags);
+ set_bit(BCH_FS_rw, &c->flags);
+ set_bit(BCH_FS_was_rw, &c->flags);
#ifndef BCH_WRITE_REF_DEBUG
percpu_ref_reinit(&c->writes);
#else
- for (i = 0; i < BCH_WRITE_REF_NR; i++) {
+ for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) {
BUG_ON(atomic_long_read(&c->writes[i]));
atomic_long_inc(&c->writes[i]);
}
@@ -463,7 +488,7 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early)
bch2_do_pending_node_rewrites(c);
return 0;
err:
- if (test_bit(BCH_FS_RW, &c->flags))
+ if (test_bit(BCH_FS_rw, &c->flags))
bch2_fs_read_only(c);
else
__bch2_fs_read_only(c);
@@ -472,6 +497,12 @@ err:
int bch2_fs_read_write(struct bch_fs *c)
{
+ if (c->opts.norecovery)
+ return -BCH_ERR_erofs_norecovery;
+
+ if (c->opts.nochanges)
+ return -BCH_ERR_erofs_nochanges;
+
return __bch2_fs_read_write(c, false);
}
@@ -558,12 +589,9 @@ static void bch2_fs_release(struct kobject *kobj)
void __bch2_fs_stop(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
-
bch_verbose(c, "shutting down");
- set_bit(BCH_FS_STOPPING, &c->flags);
+ set_bit(BCH_FS_stopping, &c->flags);
cancel_work_sync(&c->journal_seq_blacklist_gc_work);
@@ -571,7 +599,7 @@ void __bch2_fs_stop(struct bch_fs *c)
bch2_fs_read_only(c);
up_write(&c->state_lock);
- for_each_member_device(ca, c, i)
+ for_each_member_device(c, ca)
if (ca->kobj.state_in_sysfs &&
ca->disk_sb.bdev)
sysfs_remove_link(bdev_kobj(ca->disk_sb.bdev), "bcachefs");
@@ -582,6 +610,9 @@ void __bch2_fs_stop(struct bch_fs *c)
bch2_fs_debug_exit(c);
bch2_fs_chardev_exit(c);
+ bch2_ro_ref_put(c);
+ wait_event(c->ro_ref_wait, !refcount_read(&c->ro_ref));
+
kobject_put(&c->counters_kobj);
kobject_put(&c->time_stats);
kobject_put(&c->opts_dir);
@@ -590,7 +621,7 @@ void __bch2_fs_stop(struct bch_fs *c)
/* btree prefetch might have kicked off reads in the background: */
bch2_btree_flush_all_reads(c);
- for_each_member_device(ca, c, i)
+ for_each_member_device(c, ca)
cancel_work_sync(&ca->io_error_work);
cancel_work_sync(&c->read_only_work);
@@ -629,8 +660,6 @@ void bch2_fs_stop(struct bch_fs *c)
static int bch2_fs_online(struct bch_fs *c)
{
- struct bch_dev *ca;
- unsigned i;
int ret = 0;
lockdep_assert_held(&bch_fs_list_lock);
@@ -651,7 +680,9 @@ static int bch2_fs_online(struct bch_fs *c)
ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?:
kobject_add(&c->internal, &c->kobj, "internal") ?:
kobject_add(&c->opts_dir, &c->kobj, "options") ?:
+#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
kobject_add(&c->time_stats, &c->kobj, "time_stats") ?:
+#endif
kobject_add(&c->counters_kobj, &c->kobj, "counters") ?:
bch2_opts_create_sysfs_files(&c->opts_dir);
if (ret) {
@@ -661,7 +692,7 @@ static int bch2_fs_online(struct bch_fs *c)
down_write(&c->state_lock);
- for_each_member_device(ca, c, i) {
+ for_each_member_device(c, ca) {
ret = bch2_dev_sysfs_online(c, ca);
if (ret) {
bch_err(c, "error creating sysfs objects");
@@ -690,6 +721,8 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
goto out;
}
+ c->stdio = (void *)(unsigned long) opts.stdio;
+
__module_get(THIS_MODULE);
closure_init(&c->cl, NULL);
@@ -710,6 +743,10 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
mutex_init(&c->btree_root_lock);
INIT_WORK(&c->read_only_work, bch2_fs_read_only_work);
+ refcount_set(&c->ro_ref, 1);
+ init_waitqueue_head(&c->ro_ref_wait);
+ sema_init(&c->online_fsck_mutex, 1);
+
init_rwsem(&c->gc_lock);
mutex_init(&c->gc_gens_lock);
atomic_set(&c->journal_keys.ref, 1);
@@ -763,7 +800,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
c->journal.flush_write_time = &c->times[BCH_TIME_journal_flush_write];
c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write];
- c->journal.blocked_time = &c->times[BCH_TIME_blocked_journal];
c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq];
bch2_fs_btree_cache_init_early(&c->btree_cache);
@@ -832,7 +868,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->copygc_wq = alloc_workqueue("bcachefs_copygc",
WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) ||
!(c->io_complete_wq = alloc_workqueue("bcachefs_io",
- WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 1)) ||
+ WQ_FREEZABLE|WQ_HIGHPRI|WQ_MEM_RECLAIM, 512)) ||
!(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref",
WQ_FREEZABLE, 0)) ||
#ifndef BCH_WRITE_REF_DEBUG
@@ -847,7 +883,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts)
!(c->pcpu = alloc_percpu(struct bch_fs_pcpu)) ||
!(c->online_reserved = alloc_percpu(u64)) ||
mempool_init_kvpmalloc_pool(&c->btree_bounce_pool, 1,
- btree_bytes(c)) ||
+ c->opts.btree_node_size) ||
mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048) ||
!(c->unused_inode_hints = kcalloc(1U << c->inode_shard_bits,
sizeof(u64), GFP_KERNEL))) {
@@ -946,16 +982,14 @@ static void print_mount_opts(struct bch_fs *c)
int bch2_fs_start(struct bch_fs *c)
{
- struct bch_dev *ca;
time64_t now = ktime_get_real_seconds();
- unsigned i;
int ret;
print_mount_opts(c);
down_write(&c->state_lock);
- BUG_ON(test_bit(BCH_FS_STARTED, &c->flags));
+ BUG_ON(test_bit(BCH_FS_started, &c->flags));
mutex_lock(&c->sb_lock);
@@ -965,12 +999,12 @@ int bch2_fs_start(struct bch_fs *c)
goto err;
}
- for_each_online_member(ca, c, i)
- bch2_members_v2_get_mut(c->disk_sb.sb, i)->last_mount = cpu_to_le64(now);
+ for_each_online_member(c, ca)
+ bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now);
mutex_unlock(&c->sb_lock);
- for_each_rw_member(ca, c, i)
+ for_each_rw_member(c, ca)
bch2_dev_allocator_add(c, ca);
bch2_recalc_capacity(c);
@@ -990,12 +1024,12 @@ int bch2_fs_start(struct bch_fs *c)
goto err;
}
- set_bit(BCH_FS_STARTED, &c->flags);
+ set_bit(BCH_FS_started, &c->flags);
- if (c->opts.read_only || c->opts.nochanges) {
+ if (c->opts.read_only) {
bch2_fs_read_only(c);
} else {
- ret = !test_bit(BCH_FS_RW, &c->flags)
+ ret = !test_bit(BCH_FS_rw, &c->flags)
? bch2_fs_read_write(c)
: bch2_fs_read_write_late(c);
if (ret)
@@ -1003,12 +1037,13 @@ int bch2_fs_start(struct bch_fs *c)
}
ret = 0;
-out:
+err:
+ if (ret)
+ bch_err_msg(c, ret, "starting filesystem");
+ else
+ bch_verbose(c, "done starting filesystem");
up_write(&c->state_lock);
return ret;
-err:
- bch_err_msg(c, ret, "starting filesystem");
- goto out;
}
static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
@@ -1025,20 +1060,83 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c)
return 0;
}
-static int bch2_dev_in_fs(struct bch_sb *fs, struct bch_sb *sb)
+static int bch2_dev_in_fs(struct bch_sb_handle *fs,
+ struct bch_sb_handle *sb)
{
- struct bch_sb *newest =
- le64_to_cpu(fs->seq) > le64_to_cpu(sb->seq) ? fs : sb;
+ if (fs == sb)
+ return 0;
- if (!uuid_equal(&fs->uuid, &sb->uuid))
+ if (!uuid_equal(&fs->sb->uuid, &sb->sb->uuid))
return -BCH_ERR_device_not_a_member_of_filesystem;
- if (!bch2_dev_exists(newest, sb->dev_idx))
+ if (!bch2_dev_exists(fs->sb, sb->sb->dev_idx))
return -BCH_ERR_device_has_been_removed;
- if (fs->block_size != sb->block_size)
+ if (fs->sb->block_size != sb->sb->block_size)
return -BCH_ERR_mismatched_block_size;
+ if (le16_to_cpu(fs->sb->version) < bcachefs_metadata_version_member_seq ||
+ le16_to_cpu(sb->sb->version) < bcachefs_metadata_version_member_seq)
+ return 0;
+
+ if (fs->sb->seq == sb->sb->seq &&
+ fs->sb->write_time != sb->sb->write_time) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Split brain detected between ");
+ prt_bdevname(&buf, sb->bdev);
+ prt_str(&buf, " and ");
+ prt_bdevname(&buf, fs->bdev);
+ prt_char(&buf, ':');
+ prt_newline(&buf);
+ prt_printf(&buf, "seq=%llu but write_time different, got", le64_to_cpu(sb->sb->seq));
+ prt_newline(&buf);
+
+ prt_bdevname(&buf, fs->bdev);
+ prt_char(&buf, ' ');
+ bch2_prt_datetime(&buf, le64_to_cpu(fs->sb->write_time));;
+ prt_newline(&buf);
+
+ prt_bdevname(&buf, sb->bdev);
+ prt_char(&buf, ' ');
+ bch2_prt_datetime(&buf, le64_to_cpu(sb->sb->write_time));;
+ prt_newline(&buf);
+
+ prt_printf(&buf, "Not using older sb");
+
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_device_splitbrain;
+ }
+
+ struct bch_member m = bch2_sb_member_get(fs->sb, sb->sb->dev_idx);
+ u64 seq_from_fs = le64_to_cpu(m.seq);
+ u64 seq_from_member = le64_to_cpu(sb->sb->seq);
+
+ if (seq_from_fs && seq_from_fs < seq_from_member) {
+ struct printbuf buf = PRINTBUF;
+
+ prt_str(&buf, "Split brain detected between ");
+ prt_bdevname(&buf, sb->bdev);
+ prt_str(&buf, " and ");
+ prt_bdevname(&buf, fs->bdev);
+ prt_char(&buf, ':');
+ prt_newline(&buf);
+
+ prt_bdevname(&buf, fs->bdev);
+ prt_str(&buf, "believes seq of ");
+ prt_bdevname(&buf, sb->bdev);
+ prt_printf(&buf, " to be %llu, but ", seq_from_fs);
+ prt_bdevname(&buf, sb->bdev);
+ prt_printf(&buf, " has %llu\n", seq_from_member);
+ prt_str(&buf, "Not using ");
+ prt_bdevname(&buf, sb->bdev);
+
+ pr_err("%s", buf.buf);
+ printbuf_exit(&buf);
+ return -BCH_ERR_device_splitbrain;
+ }
+
return 0;
}
@@ -1284,9 +1382,14 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb)
bch2_dev_sysfs_online(c, ca);
+ struct printbuf name = PRINTBUF;
+ prt_bdevname(&name, ca->disk_sb.bdev);
+
if (c->sb.nr_devices == 1)
- snprintf(c->name, sizeof(c->name), "%pg", ca->disk_sb.bdev);
- snprintf(ca->name, sizeof(ca->name), "%pg", ca->disk_sb.bdev);
+ strscpy(c->name, name.buf, sizeof(c->name));
+ strscpy(ca->name, name.buf, sizeof(ca->name));
+
+ printbuf_exit(&name);
rebalance_wakeup(c);
return 0;
@@ -1307,8 +1410,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
enum bch_member_state new_state, int flags)
{
struct bch_devs_mask new_online_devs;
- struct bch_dev *ca2;
- int i, nr_rw = 0, required;
+ int nr_rw = 0, required;
lockdep_assert_held(&c->state_lock);
@@ -1320,16 +1422,16 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca,
return true;
/* do we have enough devices to write to? */
- for_each_member_device(ca2, c, i)
+ for_each_member_device(c, ca2)
if (ca2 != ca)
nr_rw += ca2->mi.state == BCH_MEMBER_STATE_rw;
required = max(!(flags & BCH_FORCE_IF_METADATA_DEGRADED)
? c->opts.metadata_replicas
- : c->opts.metadata_replicas_required,
+ : metadata_replicas_required(c),
!(flags & BCH_FORCE_IF_DATA_DEGRADED)
? c->opts.data_replicas
- : c->opts.data_replicas_required);
+ : data_replicas_required(c));
return nr_rw >= required;
case BCH_MEMBER_STATE_failed:
@@ -1468,9 +1570,7 @@ static int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca)
BTREE_TRIGGER_NORUN, NULL) ?:
bch2_btree_delete_range(c, BTREE_ID_bucket_gens, start, end,
BTREE_TRIGGER_NORUN, NULL);
- if (ret)
- bch_err_msg(c, ret, "removing dev alloc info");
-
+ bch_err_msg(c, ret, "removing dev alloc info");
return ret;
}
@@ -1497,40 +1597,35 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags)
__bch2_dev_read_only(c, ca);
ret = bch2_dev_data_drop(c, ca->dev_idx, flags);
- if (ret) {
- bch_err_msg(ca, ret, "dropping data");
+ bch_err_msg(ca, ret, "dropping data");
+ if (ret)
goto err;
- }
ret = bch2_dev_remove_alloc(c, ca);
- if (ret) {
- bch_err_msg(ca, ret, "deleting alloc info");
+ bch_err_msg(ca, ret, "deleting alloc info");
+ if (ret)
goto err;
- }
ret = bch2_journal_flush_device_pins(&c->journal, ca->dev_idx);
- if (ret) {
- bch_err_msg(ca, ret, "flushing journal");
+ bch_err_msg(ca, ret, "flushing journal");
+ if (ret)
goto err;
- }
ret = bch2_journal_flush(&c->journal);
- if (ret) {
- bch_err(ca, "journal error");
+ bch_err(ca, "journal error");
+ if (ret)
goto err;
- }
ret = bch2_replicas_gc2(c);
- if (ret) {
- bch_err_msg(ca, ret, "in replicas_gc2()");
+ bch_err_msg(ca, ret, "in replicas_gc2()");
+ if (ret)
goto err;
- }
data = bch2_dev_has_data(c, ca);
if (data) {
struct printbuf data_has = PRINTBUF;
- prt_bitflags(&data_has, bch2_data_types, data);
+ prt_bitflags(&data_has, __bch2_data_types, data);
bch_err(ca, "Remove failed, still has data (%s)", data_has.buf);
printbuf_exit(&data_has);
ret = -EBUSY;
@@ -1596,10 +1691,9 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
int ret;
ret = bch2_read_super(path, &opts, &sb);
- if (ret) {
- bch_err_msg(c, ret, "reading super");
+ bch_err_msg(c, ret, "reading super");
+ if (ret)
goto err;
- }
dev_mi = bch2_sb_member_get(sb.sb, sb.sb->dev_idx);
@@ -1612,10 +1706,8 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
}
ret = bch2_dev_may_add(sb.sb, c);
- if (ret) {
- bch_err_fn(c, ret);
+ if (ret)
goto err;
- }
ca = __bch2_dev_alloc(c, &dev_mi);
if (!ca) {
@@ -1630,19 +1722,17 @@ int bch2_dev_add(struct bch_fs *c, const char *path)
goto err;
ret = bch2_dev_journal_alloc(ca);
- if (ret) {
- bch_err_msg(c, ret, "allocating journal");
+ bch_err_msg(c, ret, "allocating journal");
+ if (ret)
goto err;
- }
down_write(&c->state_lock);
mutex_lock(&c->sb_lock);
ret = bch2_sb_from_fs(c, ca);
- if (ret) {
- bch_err_msg(c, ret, "setting up new superblock");
+ bch_err_msg(c, ret, "setting up new superblock");
+ if (ret)
goto err_unlock;
- }
if (dynamic_fault("bcachefs:add:no_slot"))
goto no_slot;
@@ -1681,10 +1771,9 @@ have_slot:
if (BCH_MEMBER_GROUP(&dev_mi)) {
ret = __bch2_dev_group_set(c, ca, label.buf);
- if (ret) {
- bch_err_msg(c, ret, "creating new label");
+ bch_err_msg(c, ret, "creating new label");
+ if (ret)
goto err_unlock;
- }
}
bch2_write_super(c);
@@ -1693,16 +1782,14 @@ have_slot:
bch2_dev_usage_journal_reserve(c);
ret = bch2_trans_mark_dev_sb(c, ca);
- if (ret) {
- bch_err_msg(ca, ret, "marking new superblock");
+ bch_err_msg(ca, ret, "marking new superblock");
+ if (ret)
goto err_late;
- }
ret = bch2_fs_freespace_init(c);
- if (ret) {
- bch_err_msg(ca, ret, "initializing free space");
+ bch_err_msg(ca, ret, "initializing free space");
+ if (ret)
goto err_late;
- }
ca->new_fs_bucket_idx = 0;
@@ -1721,6 +1808,7 @@ err:
bch2_free_super(&sb);
printbuf_exit(&label);
printbuf_exit(&errbuf);
+ bch_err_fn(c, ret);
return ret;
err_late:
up_write(&c->state_lock);
@@ -1747,11 +1835,10 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
dev_idx = sb.sb->dev_idx;
- ret = bch2_dev_in_fs(c->disk_sb.sb, sb.sb);
- if (ret) {
- bch_err_msg(c, ret, "bringing %s online", path);
+ ret = bch2_dev_in_fs(&c->disk_sb, &sb);
+ bch_err_msg(c, ret, "bringing %s online", path);
+ if (ret)
goto err;
- }
ret = bch2_dev_attach_bdev(c, &sb);
if (ret)
@@ -1760,10 +1847,9 @@ int bch2_dev_online(struct bch_fs *c, const char *path)
ca = bch_dev_locked(c, dev_idx);
ret = bch2_trans_mark_dev_sb(c, ca);
- if (ret) {
- bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
+ bch_err_msg(c, ret, "bringing %s online: error from bch2_trans_mark_dev_sb", path);
+ if (ret)
goto err;
- }
if (ca->mi.state == BCH_MEMBER_STATE_rw)
__bch2_dev_read_write(c, ca);
@@ -1842,10 +1928,9 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets)
}
ret = bch2_dev_buckets_resize(c, ca, nbuckets);
- if (ret) {
- bch_err_msg(ca, ret, "resizing buckets");
+ bch_err_msg(ca, ret, "resizing buckets");
+ if (ret)
goto err;
- }
ret = bch2_trans_mark_dev_sb(c, ca);
if (ret)
@@ -1879,28 +1964,30 @@ err:
/* return with ref on ca->ref: */
struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name)
{
- struct bch_dev *ca;
- unsigned i;
-
rcu_read_lock();
- for_each_member_device_rcu(ca, c, i, NULL)
- if (!strcmp(name, ca->name))
- goto found;
- ca = ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
-found:
+ for_each_member_device_rcu(c, ca, NULL)
+ if (!strcmp(name, ca->name)) {
+ rcu_read_unlock();
+ return ca;
+ }
rcu_read_unlock();
-
- return ca;
+ return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found);
}
/* Filesystem open: */
+static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r)
+{
+ return cmp_int(le64_to_cpu(l->seq), le64_to_cpu(r->seq)) ?:
+ cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time));
+}
+
struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
struct bch_opts opts)
{
DARRAY(struct bch_sb_handle) sbs = { 0 };
struct bch_fs *c = NULL;
- struct bch_sb_handle *sb, *best = NULL;
+ struct bch_sb_handle *best = NULL;
struct printbuf errbuf = PRINTBUF;
int ret = 0;
@@ -1926,20 +2013,27 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices,
BUG_ON(darray_push(&sbs, sb));
}
+ if (opts.nochanges && !opts.read_only) {
+ ret = -BCH_ERR_erofs_nochanges;
+ goto err_print;
+ }
+
darray_for_each(sbs, sb)
- if (!best || le64_to_cpu(sb->sb->seq) > le64_to_cpu(best->sb->seq))
+ if (!best || sb_cmp(sb->sb, best->sb) > 0)
best = sb;
darray_for_each_reverse(sbs, sb) {
- if (sb != best && !bch2_dev_exists(best->sb, sb->sb->dev_idx)) {
- pr_info("%pg has been removed, skipping", sb->bdev);
+ ret = bch2_dev_in_fs(best, sb);
+
+ if (ret == -BCH_ERR_device_has_been_removed ||
+ ret == -BCH_ERR_device_splitbrain) {
bch2_free_super(sb);
darray_remove_item(&sbs, sb);
best -= best > sb;
+ ret = 0;
continue;
}
- ret = bch2_dev_in_fs(best->sb, sb->sb);
if (ret)
goto err_print;
}
diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h
index bf762df18012..dada09331d2e 100644
--- a/fs/bcachefs/super.h
+++ b/fs/bcachefs/super.h
@@ -8,6 +8,8 @@
#include <linux/math64.h>
+extern const char * const bch2_fs_flag_strs[];
+
struct bch_fs *bch2_dev_to_fs(dev_t);
struct bch_fs *bch2_uuid_to_fs(__uuid_t);
@@ -37,8 +39,8 @@ int bch2_fs_read_write_early(struct bch_fs *);
*/
static inline void bch2_fs_lazy_rw(struct bch_fs *c)
{
- if (!test_bit(BCH_FS_RW, &c->flags) &&
- !test_bit(BCH_FS_WAS_RW, &c->flags))
+ if (!test_bit(BCH_FS_rw, &c->flags) &&
+ !test_bit(BCH_FS_was_rw, &c->flags))
bch2_fs_read_write_early(c);
}
diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h
index 9c1fd4ca2b10..0e5a14fc8e7f 100644
--- a/fs/bcachefs/super_types.h
+++ b/fs/bcachefs/super_types.h
@@ -4,6 +4,7 @@
struct bch_sb_handle {
struct bch_sb *sb;
+ struct bdev_handle *bdev_handle;
struct block_device *bdev;
char *sb_name;
struct bio *bio;
@@ -22,7 +23,7 @@ struct bch_devs_mask {
struct bch_devs_list {
u8 nr;
- u8 devs[BCH_BKEY_PTRS_MAX];
+ u8 data[BCH_BKEY_PTRS_MAX];
};
struct bch_member_cpu {
diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c
index f3cb7115b530..cee80c47feea 100644
--- a/fs/bcachefs/sysfs.c
+++ b/fs/bcachefs/sysfs.c
@@ -21,6 +21,7 @@
#include "btree_gc.h"
#include "buckets.h"
#include "clock.h"
+#include "compress.h"
#include "disk_groups.h"
#include "ec.h"
#include "inode.h"
@@ -145,6 +146,7 @@ rw_attribute(gc_gens_pos);
read_attribute(uuid);
read_attribute(minor);
+read_attribute(flags);
read_attribute(bucket_size);
read_attribute(first_bucket);
read_attribute(nbuckets);
@@ -246,7 +248,7 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
mutex_lock(&c->btree_cache.lock);
list_for_each_entry(b, &c->btree_cache.live, list)
- ret += btree_bytes(c);
+ ret += btree_buf_bytes(b);
mutex_unlock(&c->btree_cache.lock);
return ret;
@@ -255,19 +257,18 @@ static size_t bch2_btree_cache_size(struct bch_fs *c)
static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c)
{
struct btree_trans *trans;
- struct btree_iter iter;
- struct bkey_s_c k;
enum btree_id id;
- u64 nr_uncompressed_extents = 0,
- nr_compressed_extents = 0,
- nr_incompressible_extents = 0,
- uncompressed_sectors = 0,
- incompressible_sectors = 0,
- compressed_sectors_compressed = 0,
- compressed_sectors_uncompressed = 0;
+ struct compression_type_stats {
+ u64 nr_extents;
+ u64 sectors_compressed;
+ u64 sectors_uncompressed;
+ } s[BCH_COMPRESSION_TYPE_NR];
+ u64 compressed_incompressible = 0;
int ret = 0;
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ memset(s, 0, sizeof(s));
+
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EPERM;
trans = bch2_trans_get(c);
@@ -276,39 +277,33 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
if (!btree_type_has_ptrs(id))
continue;
- ret = for_each_btree_key2(trans, iter, id, POS_MIN,
- BTREE_ITER_ALL_SNAPSHOTS, k, ({
+ ret = for_each_btree_key(trans, iter, id, POS_MIN,
+ BTREE_ITER_ALL_SNAPSHOTS, k, ({
struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k);
+ struct bch_extent_crc_unpacked crc;
const union bch_extent_entry *entry;
- struct extent_ptr_decoded p;
- bool compressed = false, uncompressed = false, incompressible = false;
-
- bkey_for_each_ptr_decode(k.k, ptrs, p, entry) {
- switch (p.crc.compression_type) {
- case BCH_COMPRESSION_TYPE_none:
- uncompressed = true;
- uncompressed_sectors += k.k->size;
- break;
- case BCH_COMPRESSION_TYPE_incompressible:
- incompressible = true;
- incompressible_sectors += k.k->size;
- break;
- default:
- compressed_sectors_compressed +=
- p.crc.compressed_size;
- compressed_sectors_uncompressed +=
- p.crc.uncompressed_size;
- compressed = true;
- break;
+ bool compressed = false, incompressible = false;
+
+ bkey_for_each_crc(k.k, ptrs, crc, entry) {
+ incompressible |= crc.compression_type == BCH_COMPRESSION_TYPE_incompressible;
+ compressed |= crc_is_compressed(crc);
+
+ if (crc_is_compressed(crc)) {
+ s[crc.compression_type].nr_extents++;
+ s[crc.compression_type].sectors_compressed += crc.compressed_size;
+ s[crc.compression_type].sectors_uncompressed += crc.uncompressed_size;
}
}
- if (incompressible)
- nr_incompressible_extents++;
- else if (uncompressed)
- nr_uncompressed_extents++;
- else if (compressed)
- nr_compressed_extents++;
+ compressed_incompressible += compressed && incompressible;
+
+ if (!compressed) {
+ unsigned t = incompressible ? BCH_COMPRESSION_TYPE_incompressible : 0;
+
+ s[t].nr_extents++;
+ s[t].sectors_compressed += k.k->size;
+ s[t].sectors_uncompressed += k.k->size;
+ }
0;
}));
}
@@ -318,26 +313,45 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c
if (ret)
return ret;
- prt_printf(out, "uncompressed:\n");
- prt_printf(out, " nr extents: %llu\n", nr_uncompressed_extents);
- prt_printf(out, " size: ");
- prt_human_readable_u64(out, uncompressed_sectors << 9);
- prt_printf(out, "\n");
+ prt_str(out, "type");
+ printbuf_tabstop_push(out, 12);
+ prt_tab(out);
- prt_printf(out, "compressed:\n");
- prt_printf(out, " nr extents: %llu\n", nr_compressed_extents);
- prt_printf(out, " compressed size: ");
- prt_human_readable_u64(out, compressed_sectors_compressed << 9);
- prt_printf(out, "\n");
- prt_printf(out, " uncompressed size: ");
- prt_human_readable_u64(out, compressed_sectors_uncompressed << 9);
- prt_printf(out, "\n");
+ prt_str(out, "compressed");
+ printbuf_tabstop_push(out, 16);
+ prt_tab_rjust(out);
+
+ prt_str(out, "uncompressed");
+ printbuf_tabstop_push(out, 16);
+ prt_tab_rjust(out);
+
+ prt_str(out, "average extent size");
+ printbuf_tabstop_push(out, 24);
+ prt_tab_rjust(out);
+ prt_newline(out);
+
+ for (unsigned i = 0; i < ARRAY_SIZE(s); i++) {
+ bch2_prt_compression_type(out, i);
+ prt_tab(out);
+
+ prt_human_readable_u64(out, s[i].sectors_compressed << 9);
+ prt_tab_rjust(out);
+
+ prt_human_readable_u64(out, s[i].sectors_uncompressed << 9);
+ prt_tab_rjust(out);
+
+ prt_human_readable_u64(out, s[i].nr_extents
+ ? div_u64(s[i].sectors_uncompressed << 9, s[i].nr_extents)
+ : 0);
+ prt_tab_rjust(out);
+ prt_newline(out);
+ }
+
+ if (compressed_incompressible) {
+ prt_printf(out, "%llu compressed & incompressible extents", compressed_incompressible);
+ prt_newline(out);
+ }
- prt_printf(out, "incompressible:\n");
- prt_printf(out, " nr extents: %llu\n", nr_incompressible_extents);
- prt_printf(out, " size: ");
- prt_human_readable_u64(out, incompressible_sectors << 9);
- prt_printf(out, "\n");
return 0;
}
@@ -370,6 +384,9 @@ SHOW(bch2_fs)
sysfs_print(minor, c->minor);
sysfs_printf(internal_uuid, "%pU", c->sb.uuid.b);
+ if (attr == &sysfs_flags)
+ prt_bitflags(out, bch2_fs_flag_strs, c->flags);
+
sysfs_hprint(btree_cache_size, bch2_btree_cache_size(c));
if (attr == &sysfs_btree_write_stats)
@@ -483,12 +500,12 @@ STORE(bch2_fs)
/* Debugging: */
- if (!test_bit(BCH_FS_STARTED, &c->flags))
+ if (!test_bit(BCH_FS_started, &c->flags))
return -EPERM;
/* Debugging: */
- if (!test_bit(BCH_FS_RW, &c->flags))
+ if (!test_bit(BCH_FS_rw, &c->flags))
return -EROFS;
if (attr == &sysfs_prune_cache) {
@@ -620,6 +637,7 @@ STORE(bch2_fs_internal)
SYSFS_OPS(bch2_fs_internal);
struct attribute *bch2_fs_internal_files[] = {
+ &sysfs_flags,
&sysfs_journal_debug,
&sysfs_btree_updates,
&sysfs_btree_cache,
@@ -708,8 +726,10 @@ STORE(bch2_fs_opts_dir)
bch2_opt_set_sb(c, opt, v);
bch2_opt_set_by_id(&c->opts, id, v);
- if ((id == Opt_background_target ||
- id == Opt_background_compression) && v)
+ if (v &&
+ (id == Opt_background_target ||
+ id == Opt_background_compression ||
+ (id == Opt_compression && !c->opts.background_compression)))
bch2_set_rebalance_needs_scan(c, 0);
ret = size;
@@ -786,32 +806,7 @@ static void dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca)
printbuf_tabstop_push(out, 16);
printbuf_tabstop_push(out, 16);
- prt_tab(out);
- prt_str(out, "buckets");
- prt_tab_rjust(out);
- prt_str(out, "sectors");
- prt_tab_rjust(out);
- prt_str(out, "fragmented");
- prt_tab_rjust(out);
- prt_newline(out);
-
- for (i = 0; i < BCH_DATA_NR; i++) {
- prt_str(out, bch2_data_types[i]);
- prt_tab(out);
- prt_u64(out, stats.d[i].buckets);
- prt_tab_rjust(out);
- prt_u64(out, stats.d[i].sectors);
- prt_tab_rjust(out);
- prt_u64(out, stats.d[i].fragmented);
- prt_tab_rjust(out);
- prt_newline(out);
- }
-
- prt_str(out, "ec");
- prt_tab(out);
- prt_u64(out, stats.buckets_ec);
- prt_tab_rjust(out);
- prt_newline(out);
+ bch2_dev_usage_to_text(out, &stats);
prt_newline(out);
@@ -891,7 +886,7 @@ static void dev_io_done_to_text(struct printbuf *out, struct bch_dev *ca)
for (i = 1; i < BCH_DATA_NR; i++)
prt_printf(out, "%-12s:%12llu\n",
- bch2_data_types[i],
+ bch2_data_type_str(i),
percpu_u64_get(&ca->io_done->sectors[rw][i]) << 9);
}
}
@@ -916,7 +911,7 @@ SHOW(bch2_dev)
}
if (attr == &sysfs_has_data) {
- prt_bitflags(out, bch2_data_types, bch2_dev_has_data(c, ca));
+ prt_bitflags(out, __bch2_data_types, bch2_dev_has_data(c, ca));
prt_char(out, '\n');
}
diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c
index 2fc9e60c754b..b3fe9fc57747 100644
--- a/fs/bcachefs/tests.c
+++ b/fs/bcachefs/tests.c
@@ -107,9 +107,6 @@ err:
static int test_iterate(struct bch_fs *c, u64 nr)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -127,49 +124,43 @@ static int test_iterate(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
- goto err;
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(k.k->p.offset != i++);
- 0;
- }));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(k.k->p.offset != i++);
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i != nr);
pr_info("iterating backwards");
- ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
- SPOS(0, U64_MAX, U32_MAX), 0, k,
- ({
+ ret = bch2_trans_run(c,
+ for_each_btree_key_reverse(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, U64_MAX, U32_MAX), 0, k, ({
BUG_ON(k.k->p.offset != --i);
0;
- }));
+ })));
bch_err_msg(c, ret, "error iterating backwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i);
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
+ return 0;
}
static int test_iterate_extents(struct bch_fs *c, u64 nr)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -188,51 +179,45 @@ static int test_iterate_extents(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
- goto err;
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(bkey_start_offset(k.k) != i);
- i = k.k->p.offset;
- 0;
- }));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(bkey_start_offset(k.k) != i);
+ i = k.k->p.offset;
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i != nr);
pr_info("iterating backwards");
- ret = for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
- SPOS(0, U64_MAX, U32_MAX), 0, k,
- ({
+ ret = bch2_trans_run(c,
+ for_each_btree_key_reverse(trans, iter, BTREE_ID_extents,
+ SPOS(0, U64_MAX, U32_MAX), 0, k, ({
BUG_ON(k.k->p.offset != i);
i = bkey_start_offset(k.k);
0;
- }));
+ })));
bch_err_msg(c, ret, "error iterating backwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i);
-err:
- bch2_trans_iter_exit(trans, &iter);
- bch2_trans_put(trans);
- return ret;
+ return 0;
}
static int test_iterate_slots(struct bch_fs *c, u64 nr)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -250,57 +235,48 @@ static int test_iterate_slots(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_xattrs, &ck.k_i, NULL, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
- goto err;
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(k.k->p.offset != i);
- i += 2;
- 0;
- }));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(k.k->p.offset != i);
+ i += 2;
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i != nr * 2);
pr_info("iterating forwards by slots");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_SLOTS, k, ({
- if (i >= nr * 2)
- break;
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i >= nr * 2)
+ break;
- BUG_ON(k.k->p.offset != i);
- BUG_ON(bkey_deleted(k.k) != (i & 1));
+ BUG_ON(k.k->p.offset != i);
+ BUG_ON(bkey_deleted(k.k) != (i & 1));
- i++;
- 0;
- }));
- if (ret < 0) {
- bch_err_msg(c, ret, "error iterating forwards by slots");
- goto err;
- }
- ret = 0;
-err:
- bch2_trans_put(trans);
+ i++;
+ 0;
+ })));
+ bch_err_msg(c, ret, "error iterating forwards by slots");
return ret;
}
static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
{
- struct btree_trans *trans = bch2_trans_get(c);
- struct btree_iter iter = { NULL };
- struct bkey_s_c k;
u64 i;
int ret = 0;
@@ -319,50 +295,45 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr)
ret = bch2_btree_insert(c, BTREE_ID_extents, &ck.k_i, NULL, 0);
bch_err_msg(c, ret, "insert error");
if (ret)
- goto err;
+ return ret;
}
pr_info("iterating forwards");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- 0, k, ({
- BUG_ON(bkey_start_offset(k.k) != i + 8);
- BUG_ON(k.k->size != 8);
- i += 16;
- 0;
- }));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ 0, k, ({
+ BUG_ON(bkey_start_offset(k.k) != i + 8);
+ BUG_ON(k.k->size != 8);
+ i += 16;
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards");
if (ret)
- goto err;
+ return ret;
BUG_ON(i != nr);
pr_info("iterating forwards by slots");
-
i = 0;
- ret = for_each_btree_key2_upto(trans, iter, BTREE_ID_extents,
- SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
- BTREE_ITER_SLOTS, k, ({
- if (i == nr)
- break;
- BUG_ON(bkey_deleted(k.k) != !(i % 16));
+ ret = bch2_trans_run(c,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_extents,
+ SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
+ BTREE_ITER_SLOTS, k, ({
+ if (i == nr)
+ break;
+ BUG_ON(bkey_deleted(k.k) != !(i % 16));
- BUG_ON(bkey_start_offset(k.k) != i);
- BUG_ON(k.k->size != 8);
- i = k.k->p.offset;
- 0;
- }));
+ BUG_ON(bkey_start_offset(k.k) != i);
+ BUG_ON(k.k->size != 8);
+ i = k.k->p.offset;
+ 0;
+ })));
bch_err_msg(c, ret, "error iterating forwards by slots");
- if (ret)
- goto err;
- ret = 0;
-err:
- bch2_trans_put(trans);
- return 0;
+ return ret;
}
/*
@@ -736,8 +707,6 @@ static int rand_delete(struct bch_fs *c, u64 nr)
static int seq_insert(struct bch_fs *c, u64 nr)
{
- struct btree_iter iter;
- struct bkey_s_c k;
struct bkey_i_cookie insert;
bkey_cookie_init(&insert.k_i);
@@ -756,11 +725,8 @@ static int seq_insert(struct bch_fs *c, u64 nr)
static int seq_lookup(struct bch_fs *c, u64 nr)
{
- struct btree_iter iter;
- struct bkey_s_c k;
-
return bch2_trans_run(c,
- for_each_btree_key2_upto(trans, iter, BTREE_ID_xattrs,
+ for_each_btree_key_upto(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX), POS(0, U64_MAX),
0, k,
0));
@@ -768,9 +734,6 @@ static int seq_lookup(struct bch_fs *c, u64 nr)
static int seq_overwrite(struct bch_fs *c, u64 nr)
{
- struct btree_iter iter;
- struct bkey_s_c k;
-
return bch2_trans_run(c,
for_each_btree_key_commit(trans, iter, BTREE_ID_xattrs,
SPOS(0, 0, U32_MAX),
diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c
new file mode 100644
index 000000000000..9220d7de10db
--- /dev/null
+++ b/fs/bcachefs/thread_with_file.c
@@ -0,0 +1,299 @@
+// SPDX-License-Identifier: GPL-2.0
+#ifndef NO_BCACHEFS_FS
+
+#include "bcachefs.h"
+#include "printbuf.h"
+#include "thread_with_file.h"
+
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <linux/poll.h>
+
+void bch2_thread_with_file_exit(struct thread_with_file *thr)
+{
+ if (thr->task) {
+ kthread_stop(thr->task);
+ put_task_struct(thr->task);
+ }
+}
+
+int bch2_run_thread_with_file(struct thread_with_file *thr,
+ const struct file_operations *fops,
+ int (*fn)(void *))
+{
+ struct file *file = NULL;
+ int ret, fd = -1;
+ unsigned fd_flags = O_CLOEXEC;
+
+ if (fops->read && fops->write)
+ fd_flags |= O_RDWR;
+ else if (fops->read)
+ fd_flags |= O_RDONLY;
+ else if (fops->write)
+ fd_flags |= O_WRONLY;
+
+ char name[TASK_COMM_LEN];
+ get_task_comm(name, current);
+
+ thr->ret = 0;
+ thr->task = kthread_create(fn, thr, "%s", name);
+ ret = PTR_ERR_OR_ZERO(thr->task);
+ if (ret)
+ return ret;
+
+ ret = get_unused_fd_flags(fd_flags);
+ if (ret < 0)
+ goto err;
+ fd = ret;
+
+ file = anon_inode_getfile(name, fops, thr, fd_flags);
+ ret = PTR_ERR_OR_ZERO(file);
+ if (ret)
+ goto err;
+
+ get_task_struct(thr->task);
+ wake_up_process(thr->task);
+ fd_install(fd, file);
+ return fd;
+err:
+ if (fd >= 0)
+ put_unused_fd(fd);
+ if (thr->task)
+ kthread_stop(thr->task);
+ return ret;
+}
+
+static inline bool thread_with_stdio_has_output(struct thread_with_stdio *thr)
+{
+ return thr->stdio.output_buf.pos ||
+ thr->output2.nr ||
+ thr->thr.done;
+}
+
+static ssize_t thread_with_stdio_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+ size_t copied = 0, b;
+ int ret = 0;
+
+ if ((file->f_flags & O_NONBLOCK) &&
+ !thread_with_stdio_has_output(thr))
+ return -EAGAIN;
+
+ ret = wait_event_interruptible(thr->stdio.output_wait,
+ thread_with_stdio_has_output(thr));
+ if (ret)
+ return ret;
+
+ if (thr->thr.done)
+ return 0;
+
+ while (len) {
+ ret = darray_make_room(&thr->output2, thr->stdio.output_buf.pos);
+ if (ret)
+ break;
+
+ spin_lock_irq(&thr->stdio.output_lock);
+ b = min_t(size_t, darray_room(thr->output2), thr->stdio.output_buf.pos);
+
+ memcpy(&darray_top(thr->output2), thr->stdio.output_buf.buf, b);
+ memmove(thr->stdio.output_buf.buf,
+ thr->stdio.output_buf.buf + b,
+ thr->stdio.output_buf.pos - b);
+
+ thr->output2.nr += b;
+ thr->stdio.output_buf.pos -= b;
+ spin_unlock_irq(&thr->stdio.output_lock);
+
+ b = min(len, thr->output2.nr);
+ if (!b)
+ break;
+
+ b -= copy_to_user(buf, thr->output2.data, b);
+ if (!b) {
+ ret = -EFAULT;
+ break;
+ }
+
+ copied += b;
+ buf += b;
+ len -= b;
+
+ memmove(thr->output2.data,
+ thr->output2.data + b,
+ thr->output2.nr - b);
+ thr->output2.nr -= b;
+ }
+
+ return copied ?: ret;
+}
+
+static int thread_with_stdio_release(struct inode *inode, struct file *file)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ bch2_thread_with_file_exit(&thr->thr);
+ printbuf_exit(&thr->stdio.input_buf);
+ printbuf_exit(&thr->stdio.output_buf);
+ darray_exit(&thr->output2);
+ thr->exit(thr);
+ return 0;
+}
+
+#define WRITE_BUFFER 4096
+
+static inline bool thread_with_stdio_has_input_space(struct thread_with_stdio *thr)
+{
+ return thr->stdio.input_buf.pos < WRITE_BUFFER || thr->thr.done;
+}
+
+static ssize_t thread_with_stdio_write(struct file *file, const char __user *ubuf,
+ size_t len, loff_t *ppos)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+ struct printbuf *buf = &thr->stdio.input_buf;
+ size_t copied = 0;
+ ssize_t ret = 0;
+
+ while (len) {
+ if (thr->thr.done) {
+ ret = -EPIPE;
+ break;
+ }
+
+ size_t b = len - fault_in_readable(ubuf, len);
+ if (!b) {
+ ret = -EFAULT;
+ break;
+ }
+
+ spin_lock(&thr->stdio.input_lock);
+ if (buf->pos < WRITE_BUFFER)
+ bch2_printbuf_make_room(buf, min(b, WRITE_BUFFER - buf->pos));
+ b = min(len, printbuf_remaining_size(buf));
+
+ if (b && !copy_from_user_nofault(&buf->buf[buf->pos], ubuf, b)) {
+ ubuf += b;
+ len -= b;
+ copied += b;
+ buf->pos += b;
+ }
+ spin_unlock(&thr->stdio.input_lock);
+
+ if (b) {
+ wake_up(&thr->stdio.input_wait);
+ } else {
+ if ((file->f_flags & O_NONBLOCK)) {
+ ret = -EAGAIN;
+ break;
+ }
+
+ ret = wait_event_interruptible(thr->stdio.input_wait,
+ thread_with_stdio_has_input_space(thr));
+ if (ret)
+ break;
+ }
+ }
+
+ return copied ?: ret;
+}
+
+static __poll_t thread_with_stdio_poll(struct file *file, struct poll_table_struct *wait)
+{
+ struct thread_with_stdio *thr =
+ container_of(file->private_data, struct thread_with_stdio, thr);
+
+ poll_wait(file, &thr->stdio.output_wait, wait);
+ poll_wait(file, &thr->stdio.input_wait, wait);
+
+ __poll_t mask = 0;
+
+ if (thread_with_stdio_has_output(thr))
+ mask |= EPOLLIN;
+ if (thread_with_stdio_has_input_space(thr))
+ mask |= EPOLLOUT;
+ if (thr->thr.done)
+ mask |= EPOLLHUP|EPOLLERR;
+ return mask;
+}
+
+static const struct file_operations thread_with_stdio_fops = {
+ .release = thread_with_stdio_release,
+ .read = thread_with_stdio_read,
+ .write = thread_with_stdio_write,
+ .poll = thread_with_stdio_poll,
+ .llseek = no_llseek,
+};
+
+int bch2_run_thread_with_stdio(struct thread_with_stdio *thr,
+ void (*exit)(struct thread_with_stdio *),
+ int (*fn)(void *))
+{
+ thr->stdio.input_buf = PRINTBUF;
+ thr->stdio.input_buf.atomic++;
+ spin_lock_init(&thr->stdio.input_lock);
+ init_waitqueue_head(&thr->stdio.input_wait);
+
+ thr->stdio.output_buf = PRINTBUF;
+ thr->stdio.output_buf.atomic++;
+ spin_lock_init(&thr->stdio.output_lock);
+ init_waitqueue_head(&thr->stdio.output_wait);
+
+ darray_init(&thr->output2);
+ thr->exit = exit;
+
+ return bch2_run_thread_with_file(&thr->thr, &thread_with_stdio_fops, fn);
+}
+
+int bch2_stdio_redirect_read(struct stdio_redirect *stdio, char *buf, size_t len)
+{
+ wait_event(stdio->input_wait,
+ stdio->input_buf.pos || stdio->done);
+
+ if (stdio->done)
+ return -1;
+
+ spin_lock(&stdio->input_lock);
+ int ret = min(len, stdio->input_buf.pos);
+ stdio->input_buf.pos -= ret;
+ memcpy(buf, stdio->input_buf.buf, ret);
+ memmove(stdio->input_buf.buf,
+ stdio->input_buf.buf + ret,
+ stdio->input_buf.pos);
+ spin_unlock(&stdio->input_lock);
+
+ wake_up(&stdio->input_wait);
+ return ret;
+}
+
+int bch2_stdio_redirect_readline(struct stdio_redirect *stdio, char *buf, size_t len)
+{
+ wait_event(stdio->input_wait,
+ stdio->input_buf.pos || stdio->done);
+
+ if (stdio->done)
+ return -1;
+
+ spin_lock(&stdio->input_lock);
+ int ret = min(len, stdio->input_buf.pos);
+ char *n = memchr(stdio->input_buf.buf, '\n', ret);
+ if (n)
+ ret = min(ret, n + 1 - stdio->input_buf.buf);
+ stdio->input_buf.pos -= ret;
+ memcpy(buf, stdio->input_buf.buf, ret);
+ memmove(stdio->input_buf.buf,
+ stdio->input_buf.buf + ret,
+ stdio->input_buf.pos);
+ spin_unlock(&stdio->input_lock);
+
+ wake_up(&stdio->input_wait);
+ return ret;
+}
+
+#endif /* NO_BCACHEFS_FS */
diff --git a/fs/bcachefs/thread_with_file.h b/fs/bcachefs/thread_with_file.h
new file mode 100644
index 000000000000..05879c5048c8
--- /dev/null
+++ b/fs/bcachefs/thread_with_file.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_THREAD_WITH_FILE_H
+#define _BCACHEFS_THREAD_WITH_FILE_H
+
+#include "thread_with_file_types.h"
+
+struct task_struct;
+
+struct thread_with_file {
+ struct task_struct *task;
+ int ret;
+ bool done;
+};
+
+void bch2_thread_with_file_exit(struct thread_with_file *);
+int bch2_run_thread_with_file(struct thread_with_file *,
+ const struct file_operations *,
+ int (*fn)(void *));
+
+struct thread_with_stdio {
+ struct thread_with_file thr;
+ struct stdio_redirect stdio;
+ DARRAY(char) output2;
+ void (*exit)(struct thread_with_stdio *);
+};
+
+static inline void thread_with_stdio_done(struct thread_with_stdio *thr)
+{
+ thr->thr.done = true;
+ thr->stdio.done = true;
+ wake_up(&thr->stdio.input_wait);
+ wake_up(&thr->stdio.output_wait);
+}
+
+int bch2_run_thread_with_stdio(struct thread_with_stdio *,
+ void (*exit)(struct thread_with_stdio *),
+ int (*fn)(void *));
+int bch2_stdio_redirect_read(struct stdio_redirect *, char *, size_t);
+int bch2_stdio_redirect_readline(struct stdio_redirect *, char *, size_t);
+
+#endif /* _BCACHEFS_THREAD_WITH_FILE_H */
diff --git a/fs/bcachefs/thread_with_file_types.h b/fs/bcachefs/thread_with_file_types.h
new file mode 100644
index 000000000000..90b5e645e98c
--- /dev/null
+++ b/fs/bcachefs/thread_with_file_types.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_THREAD_WITH_FILE_TYPES_H
+#define _BCACHEFS_THREAD_WITH_FILE_TYPES_H
+
+struct stdio_redirect {
+ spinlock_t output_lock;
+ wait_queue_head_t output_wait;
+ struct printbuf output_buf;
+
+ spinlock_t input_lock;
+ wait_queue_head_t input_wait;
+ struct printbuf input_buf;
+ bool done;
+};
+
+#endif /* _BCACHEFS_THREAD_WITH_FILE_TYPES_H */
diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h
index fd49b63562c3..293b90d704fb 100644
--- a/fs/bcachefs/trace.h
+++ b/fs/bcachefs/trace.h
@@ -32,22 +32,68 @@ DECLARE_EVENT_CLASS(bpos,
TP_printk("%llu:%llu:%u", __entry->p_inode, __entry->p_offset, __entry->p_snapshot)
);
-DECLARE_EVENT_CLASS(bkey,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k),
+DECLARE_EVENT_CLASS(fs_str,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str),
TP_STRUCT__entry(
- __string(k, k )
+ __field(dev_t, dev )
+ __string(str, str )
),
TP_fast_assign(
- __assign_str(k, k);
+ __entry->dev = c->dev;
+ __assign_str(str, str);
),
- TP_printk("%s", __get_str(k))
+ TP_printk("%d,%d\n%s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(str))
);
-DECLARE_EVENT_CLASS(btree_node,
+DECLARE_EVENT_CLASS(trans_str,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
+ TP_ARGS(trans, caller_ip, str),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ __string(str, str )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ __assign_str(str, str);
+ ),
+
+ TP_printk("%d,%d %s %pS %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->trans_fn, (void *) __entry->caller_ip, __get_str(str))
+);
+
+DECLARE_EVENT_CLASS(trans_str_nocaller,
+ TP_PROTO(struct btree_trans *trans, const char *str),
+ TP_ARGS(trans, str),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __string(str, str )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __assign_str(str, str);
+ ),
+
+ TP_printk("%d,%d %s %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->trans_fn, __get_str(str))
+);
+
+DECLARE_EVENT_CLASS(btree_node_nofs,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b),
@@ -72,6 +118,33 @@ DECLARE_EVENT_CLASS(btree_node,
__entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
);
+DECLARE_EVENT_CLASS(btree_node,
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ __field(u8, level )
+ __field(u8, btree_id )
+ TRACE_BPOS_entries(pos)
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->level = b->c.level;
+ __entry->btree_id = b->c.btree_id;
+ TRACE_BPOS_assign(pos, b->key.k.p);
+ ),
+
+ TP_printk("%d,%d %s %u %s %llu:%llu:%u",
+ MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn,
+ __entry->level,
+ bch2_btree_id_str(__entry->btree_id),
+ __entry->pos_inode, __entry->pos_offset, __entry->pos_snapshot)
+);
+
DECLARE_EVENT_CLASS(bch_fs,
TP_PROTO(struct bch_fs *c),
TP_ARGS(c),
@@ -87,6 +160,23 @@ DECLARE_EVENT_CLASS(bch_fs,
TP_printk("%d,%d", MAJOR(__entry->dev), MINOR(__entry->dev))
);
+DECLARE_EVENT_CLASS(btree_trans,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans),
+
+ TP_STRUCT__entry(
+ __field(dev_t, dev )
+ __array(char, trans_fn, 32 )
+ ),
+
+ TP_fast_assign(
+ __entry->dev = trans->c->dev;
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ ),
+
+ TP_printk("%d,%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->trans_fn)
+);
+
DECLARE_EVENT_CLASS(bio,
TP_PROTO(struct bio *bio),
TP_ARGS(bio),
@@ -183,9 +273,14 @@ DEFINE_EVENT(bch_fs, journal_full,
TP_ARGS(c)
);
-DEFINE_EVENT(bch_fs, journal_entry_full,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(fs_str, journal_entry_full,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, journal_entry_close,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
DEFINE_EVENT(bio, journal_write,
@@ -286,36 +381,36 @@ TRACE_EVENT(btree_cache_scan,
__entry->nr_to_scan, __entry->can_free, __entry->ret)
);
-DEFINE_EVENT(btree_node, btree_cache_reap,
+DEFINE_EVENT(btree_node_nofs, btree_cache_reap,
TP_PROTO(struct bch_fs *c, struct btree *b),
TP_ARGS(c, b)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock_fail,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock_fail,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize_lock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_lock,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
-DEFINE_EVENT(bch_fs, btree_cache_cannibalize_unlock,
- TP_PROTO(struct bch_fs *c),
- TP_ARGS(c)
+DEFINE_EVENT(btree_trans, btree_cache_cannibalize_unlock,
+ TP_PROTO(struct btree_trans *trans),
+ TP_ARGS(trans)
);
/* Btree */
DEFINE_EVENT(btree_node, btree_node_read,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
TRACE_EVENT(btree_node_write,
@@ -339,13 +434,13 @@ TRACE_EVENT(btree_node_write,
);
DEFINE_EVENT(btree_node, btree_node_alloc,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_free,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
TRACE_EVENT(btree_reserve_get_fail,
@@ -377,28 +472,28 @@ TRACE_EVENT(btree_reserve_get_fail,
);
DEFINE_EVENT(btree_node, btree_node_compact,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_merge,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_split,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_rewrite,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
DEFINE_EVENT(btree_node, btree_node_set_root,
- TP_PROTO(struct bch_fs *c, struct btree *b),
- TP_ARGS(c, b)
+ TP_PROTO(struct btree_trans *trans, struct btree *b),
+ TP_ARGS(trans, b)
);
TRACE_EVENT(btree_path_relock_fail,
@@ -433,7 +528,7 @@ TRACE_EVENT(btree_path_relock_fail,
__entry->level = path->level;
TRACE_BPOS_assign(pos, path->pos);
- c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level),
+ c = bch2_btree_node_lock_counts(trans, NULL, &path->l[level].b->c, level);
__entry->self_read_count = c.n[SIX_LOCK_read];
__entry->self_intent_count = c.n[SIX_LOCK_intent];
@@ -717,44 +812,32 @@ TRACE_EVENT(bucket_evacuate,
__entry->dev_idx, __entry->bucket)
);
-DEFINE_EVENT(bkey, move_extent,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+DEFINE_EVENT(fs_str, move_extent,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bkey, move_extent_read,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+DEFINE_EVENT(fs_str, move_extent_read,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bkey, move_extent_write,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+DEFINE_EVENT(fs_str, move_extent_write,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bkey, move_extent_finish,
- TP_PROTO(struct bch_fs *c, const char *k),
- TP_ARGS(c, k)
+DEFINE_EVENT(fs_str, move_extent_finish,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-TRACE_EVENT(move_extent_fail,
- TP_PROTO(struct bch_fs *c, const char *msg),
- TP_ARGS(c, msg),
-
- TP_STRUCT__entry(
- __field(dev_t, dev )
- __string(msg, msg )
- ),
-
- TP_fast_assign(
- __entry->dev = c->dev;
- __assign_str(msg, msg);
- ),
-
- TP_printk("%d:%d %s", MAJOR(__entry->dev), MINOR(__entry->dev), __get_str(msg))
+DEFINE_EVENT(fs_str, move_extent_fail,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
-DEFINE_EVENT(bkey, move_extent_start_fail,
+DEFINE_EVENT(fs_str, move_extent_start_fail,
TP_PROTO(struct bch_fs *c, const char *str),
TP_ARGS(c, str)
);
@@ -930,7 +1013,7 @@ TRACE_EVENT(trans_restart_split_race,
__entry->level = b->c.level;
__entry->written = b->written;
__entry->blocks = btree_blocks(trans->c);
- __entry->u64s_remaining = bch_btree_keys_u64s_remaining(trans->c, b);
+ __entry->u64s_remaining = bch2_btree_keys_u64s_remaining(b);
),
TP_printk("%s %pS l=%u written %u/%u u64s remaining %u",
@@ -987,10 +1070,11 @@ DEFINE_EVENT(transaction_event, trans_restart_key_cache_raced,
TP_ARGS(trans, caller_ip)
);
-DEFINE_EVENT(transaction_event, trans_restart_too_many_iters,
+DEFINE_EVENT(trans_str, trans_restart_too_many_iters,
TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
+ unsigned long caller_ip,
+ const char *paths),
+ TP_ARGS(trans, caller_ip, paths)
);
DECLARE_EVENT_CLASS(transaction_restart_iter,
@@ -1036,8 +1120,6 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split,
TP_ARGS(trans, caller_ip, path)
);
-struct get_locks_fail;
-
TRACE_EVENT(trans_restart_upgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
@@ -1056,8 +1138,6 @@ TRACE_EVENT(trans_restart_upgrade,
__field(u8, level )
__field(u32, path_seq )
__field(u32, node_seq )
- __field(u32, path_alloc_seq )
- __field(u32, downgrade_seq)
TRACE_BPOS_entries(pos)
),
@@ -1070,12 +1150,10 @@ TRACE_EVENT(trans_restart_upgrade,
__entry->level = f->l;
__entry->path_seq = path->l[f->l].lock_seq;
__entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq;
- __entry->path_alloc_seq = path->alloc_seq;
- __entry->downgrade_seq = path->downgrade_seq;
TRACE_BPOS_assign(pos, path->pos)
),
- TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u alloc_seq %u downgrade_seq %u",
+ TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u",
__entry->trans_fn,
(void *) __entry->caller_ip,
bch2_btree_id_str(__entry->btree_id),
@@ -1086,16 +1164,12 @@ TRACE_EVENT(trans_restart_upgrade,
__entry->new_locks_want,
__entry->level,
__entry->path_seq,
- __entry->node_seq,
- __entry->path_alloc_seq,
- __entry->downgrade_seq)
+ __entry->node_seq)
);
-DEFINE_EVENT(transaction_restart_iter, trans_restart_relock,
- TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path)
+DEFINE_EVENT(trans_str, trans_restart_relock,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip, const char *str),
+ TP_ARGS(trans, caller_ip, str)
);
DEFINE_EVENT(transaction_restart_iter, trans_restart_relock_next_node,
@@ -1160,10 +1234,10 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_memory_allocation_failure,
TP_ARGS(trans, caller_ip, path)
);
-DEFINE_EVENT(transaction_event, trans_restart_would_deadlock,
+DEFINE_EVENT(trans_str_nocaller, trans_restart_would_deadlock,
TP_PROTO(struct btree_trans *trans,
- unsigned long caller_ip),
- TP_ARGS(trans, caller_ip)
+ const char *cycle),
+ TP_ARGS(trans, cycle)
);
DEFINE_EVENT(transaction_event, trans_restart_would_deadlock_recursion_limit,
@@ -1252,22 +1326,37 @@ TRACE_EVENT(trans_restart_key_cache_key_realloced,
TRACE_EVENT(path_downgrade,
TP_PROTO(struct btree_trans *trans,
unsigned long caller_ip,
- struct btree_path *path),
- TP_ARGS(trans, caller_ip, path),
+ struct btree_path *path,
+ unsigned old_locks_want),
+ TP_ARGS(trans, caller_ip, path, old_locks_want),
TP_STRUCT__entry(
__array(char, trans_fn, 32 )
__field(unsigned long, caller_ip )
+ __field(unsigned, old_locks_want )
+ __field(unsigned, new_locks_want )
+ __field(unsigned, btree )
+ TRACE_BPOS_entries(pos)
),
TP_fast_assign(
strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
__entry->caller_ip = caller_ip;
+ __entry->old_locks_want = old_locks_want;
+ __entry->new_locks_want = path->locks_want;
+ __entry->btree = path->btree_id;
+ TRACE_BPOS_assign(pos, path->pos);
),
- TP_printk("%s %pS",
+ TP_printk("%s %pS locks_want %u -> %u %s %llu:%llu:%u",
__entry->trans_fn,
- (void *) __entry->caller_ip)
+ (void *) __entry->caller_ip,
+ __entry->old_locks_want,
+ __entry->new_locks_want,
+ bch2_btree_id_str(__entry->btree),
+ __entry->pos_inode,
+ __entry->pos_offset,
+ __entry->pos_snapshot)
);
DEFINE_EVENT(transaction_event, trans_restart_write_buffer_flush,
@@ -1298,21 +1387,48 @@ TRACE_EVENT(write_buffer_flush,
__entry->nr, __entry->size, __entry->skipped, __entry->fast)
);
+TRACE_EVENT(write_buffer_flush_sync,
+ TP_PROTO(struct btree_trans *trans, unsigned long caller_ip),
+ TP_ARGS(trans, caller_ip),
+
+ TP_STRUCT__entry(
+ __array(char, trans_fn, 32 )
+ __field(unsigned long, caller_ip )
+ ),
+
+ TP_fast_assign(
+ strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn));
+ __entry->caller_ip = caller_ip;
+ ),
+
+ TP_printk("%s %pS", __entry->trans_fn, (void *) __entry->caller_ip)
+);
+
TRACE_EVENT(write_buffer_flush_slowpath,
- TP_PROTO(struct btree_trans *trans, size_t nr, size_t size),
- TP_ARGS(trans, nr, size),
+ TP_PROTO(struct btree_trans *trans, size_t slowpath, size_t total),
+ TP_ARGS(trans, slowpath, total),
TP_STRUCT__entry(
- __field(size_t, nr )
- __field(size_t, size )
+ __field(size_t, slowpath )
+ __field(size_t, total )
),
TP_fast_assign(
- __entry->nr = nr;
- __entry->size = size;
+ __entry->slowpath = slowpath;
+ __entry->total = total;
),
- TP_printk("%zu/%zu", __entry->nr, __entry->size)
+ TP_printk("%zu/%zu", __entry->slowpath, __entry->total)
+);
+
+DEFINE_EVENT(fs_str, rebalance_extent,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
+);
+
+DEFINE_EVENT(fs_str, data_update,
+ TP_PROTO(struct bch_fs *c, const char *str),
+ TP_ARGS(c, str)
);
#endif /* _TRACE_BCACHEFS_H */
diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c
index 84b142fcc3df..3a32faa86b5c 100644
--- a/fs/bcachefs/util.c
+++ b/fs/bcachefs/util.c
@@ -241,12 +241,17 @@ bool bch2_is_zero(const void *_p, size_t n)
return true;
}
-void bch2_prt_u64_binary(struct printbuf *out, u64 v, unsigned nr_bits)
+void bch2_prt_u64_base2_nbits(struct printbuf *out, u64 v, unsigned nr_bits)
{
while (nr_bits)
prt_char(out, '0' + ((v >> --nr_bits) & 1));
}
+void bch2_prt_u64_base2(struct printbuf *out, u64 v)
+{
+ bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1);
+}
+
void bch2_print_string_as_lines(const char *prefix, const char *lines)
{
const char *p;
@@ -267,14 +272,14 @@ void bch2_print_string_as_lines(const char *prefix, const char *lines)
console_unlock();
}
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr,
+ gfp_t gfp)
{
#ifdef CONFIG_STACKTRACE
unsigned nr_entries = 0;
- int ret = 0;
stack->nr = 0;
- ret = darray_make_room(stack, 32);
+ int ret = darray_make_room_gfp(stack, 32, gfp);
if (ret)
return ret;
@@ -282,9 +287,9 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
return -1;
do {
- nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, 0);
+ nr_entries = stack_trace_save_tsk(task, stack->data, stack->size, skipnr + 1);
} while (nr_entries == stack->size &&
- !(ret = darray_make_room(stack, stack->size * 2)));
+ !(ret = darray_make_room_gfp(stack, stack->size * 2, gfp)));
stack->nr = nr_entries;
up_read(&task->signal->exec_update_lock);
@@ -297,24 +302,74 @@ int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task)
void bch2_prt_backtrace(struct printbuf *out, bch_stacktrace *stack)
{
- unsigned long *i;
-
darray_for_each(*stack, i) {
prt_printf(out, "[<0>] %pB", (void *) *i);
prt_newline(out);
}
}
-int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task)
+int bch2_prt_task_backtrace(struct printbuf *out, struct task_struct *task, unsigned skipnr, gfp_t gfp)
{
bch_stacktrace stack = { 0 };
- int ret = bch2_save_backtrace(&stack, task);
+ int ret = bch2_save_backtrace(&stack, task, skipnr + 1, gfp);
bch2_prt_backtrace(out, &stack);
darray_exit(&stack);
return ret;
}
+#ifndef __KERNEL__
+#include <time.h>
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ time_t t = sec;
+ char buf[64];
+ ctime_r(&t, buf);
+ strim(buf);
+ prt_str(out, buf);
+}
+#else
+void bch2_prt_datetime(struct printbuf *out, time64_t sec)
+{
+ char buf[64];
+ snprintf(buf, sizeof(buf), "%ptT", &sec);
+ prt_u64(out, sec);
+}
+#endif
+
+static const struct time_unit {
+ const char *name;
+ u64 nsecs;
+} time_units[] = {
+ { "ns", 1 },
+ { "us", NSEC_PER_USEC },
+ { "ms", NSEC_PER_MSEC },
+ { "s", NSEC_PER_SEC },
+ { "m", (u64) NSEC_PER_SEC * 60},
+ { "h", (u64) NSEC_PER_SEC * 3600},
+ { "eon", U64_MAX },
+};
+
+static const struct time_unit *pick_time_units(u64 ns)
+{
+ const struct time_unit *u;
+
+ for (u = time_units;
+ u + 1 < time_units + ARRAY_SIZE(time_units) &&
+ ns >= u[1].nsecs << 1;
+ u++)
+ ;
+
+ return u;
+}
+
+void bch2_pr_time_units(struct printbuf *out, u64 ns)
+{
+ const struct time_unit *u = pick_time_units(ns);
+
+ prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
+}
+
/* time stats: */
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
@@ -359,42 +414,48 @@ static inline void bch2_time_stats_update_one(struct bch2_time_stats *stats,
mean_and_variance_weighted_update(&stats->duration_stats_weighted, duration);
stats->max_duration = max(stats->max_duration, duration);
stats->min_duration = min(stats->min_duration, duration);
+ stats->total_duration += duration;
bch2_quantiles_update(&stats->quantiles, duration);
}
- if (time_after64(end, stats->last_event)) {
+ if (stats->last_event && time_after64(end, stats->last_event)) {
freq = end - stats->last_event;
mean_and_variance_update(&stats->freq_stats, freq);
mean_and_variance_weighted_update(&stats->freq_stats_weighted, freq);
stats->max_freq = max(stats->max_freq, freq);
stats->min_freq = min(stats->min_freq, freq);
- stats->last_event = end;
}
+
+ stats->last_event = end;
+}
+
+static void __bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
+ struct bch2_time_stat_buffer *b)
+{
+ for (struct bch2_time_stat_buffer_entry *i = b->entries;
+ i < b->entries + ARRAY_SIZE(b->entries);
+ i++)
+ bch2_time_stats_update_one(stats, i->start, i->end);
+ b->nr = 0;
}
static noinline void bch2_time_stats_clear_buffer(struct bch2_time_stats *stats,
struct bch2_time_stat_buffer *b)
{
- struct bch2_time_stat_buffer_entry *i;
unsigned long flags;
spin_lock_irqsave(&stats->lock, flags);
- for (i = b->entries;
- i < b->entries + ARRAY_SIZE(b->entries);
- i++)
- bch2_time_stats_update_one(stats, i->start, i->end);
+ __bch2_time_stats_clear_buffer(stats, b);
spin_unlock_irqrestore(&stats->lock, flags);
-
- b->nr = 0;
}
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
{
unsigned long flags;
- WARN_RATELIMIT(!stats->min_duration || !stats->min_freq,
- "time_stats: min_duration = %llu, min_freq = %llu",
- stats->min_duration, stats->min_freq);
+ WARN_ONCE(!stats->duration_stats_weighted.weight ||
+ !stats->freq_stats_weighted.weight,
+ "uninitialized time_stats");
if (!stats->buffer) {
spin_lock_irqsave(&stats->lock, flags);
@@ -423,40 +484,6 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end)
preempt_enable();
}
}
-#endif
-
-static const struct time_unit {
- const char *name;
- u64 nsecs;
-} time_units[] = {
- { "ns", 1 },
- { "us", NSEC_PER_USEC },
- { "ms", NSEC_PER_MSEC },
- { "s", NSEC_PER_SEC },
- { "m", (u64) NSEC_PER_SEC * 60},
- { "h", (u64) NSEC_PER_SEC * 3600},
- { "eon", U64_MAX },
-};
-
-static const struct time_unit *pick_time_units(u64 ns)
-{
- const struct time_unit *u;
-
- for (u = time_units;
- u + 1 < time_units + ARRAY_SIZE(time_units) &&
- ns >= u[1].nsecs << 1;
- u++)
- ;
-
- return u;
-}
-
-void bch2_pr_time_units(struct printbuf *out, u64 ns)
-{
- const struct time_unit *u = pick_time_units(ns);
-
- prt_printf(out, "%llu %s", div_u64(ns, u->nsecs), u->name);
-}
static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
{
@@ -467,26 +494,6 @@ static void bch2_pr_time_units_aligned(struct printbuf *out, u64 ns)
prt_printf(out, "%s", u->name);
}
-#ifndef __KERNEL__
-#include <time.h>
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
- time_t t = sec;
- char buf[64];
- ctime_r(&t, buf);
- prt_str(out, buf);
-}
-#else
-void bch2_prt_datetime(struct printbuf *out, time64_t sec)
-{
- char buf[64];
- snprintf(buf, sizeof(buf), "%ptT", &sec);
- prt_u64(out, sec);
-}
-#endif
-
-#define TABSTOP_SIZE 12
-
static inline void pr_name_and_units(struct printbuf *out, const char *name, u64 ns)
{
prt_str(out, name);
@@ -495,12 +502,24 @@ static inline void pr_name_and_units(struct printbuf *out, const char *name, u64
prt_newline(out);
}
+#define TABSTOP_SIZE 12
+
void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats)
{
const struct time_unit *u;
s64 f_mean = 0, d_mean = 0;
u64 q, last_q = 0, f_stddev = 0, d_stddev = 0;
int i;
+
+ if (stats->buffer) {
+ int cpu;
+
+ spin_lock_irq(&stats->lock);
+ for_each_possible_cpu(cpu)
+ __bch2_time_stats_clear_buffer(stats, per_cpu_ptr(stats->buffer, cpu));
+ spin_unlock_irq(&stats->lock);
+ }
+
/*
* avoid divide by zero
*/
@@ -546,6 +565,7 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
pr_name_and_units(out, "min:", stats->min_duration);
pr_name_and_units(out, "max:", stats->max_duration);
+ pr_name_and_units(out, "total:", stats->total_duration);
prt_printf(out, "mean:");
prt_tab(out);
@@ -603,6 +623,9 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats
last_q = q;
}
}
+#else
+void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats) {}
+#endif
void bch2_time_stats_exit(struct bch2_time_stats *stats)
{
@@ -1157,3 +1180,39 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr)
return ret;
}
+
+void bch2_darray_str_exit(darray_str *d)
+{
+ darray_for_each(*d, i)
+ kfree(*i);
+ darray_exit(d);
+}
+
+int bch2_split_devs(const char *_dev_name, darray_str *ret)
+{
+ darray_init(ret);
+
+ char *dev_name, *s, *orig;
+
+ dev_name = orig = kstrdup(_dev_name, GFP_KERNEL);
+ if (!dev_name)
+ return -ENOMEM;
+
+ while ((s = strsep(&dev_name, ":"))) {
+ char *p = kstrdup(s, GFP_KERNEL);
+ if (!p)
+ goto err;
+
+ if (darray_push(ret, p)) {
+ kfree(p);
+ goto err;
+ }
+ }
+
+ kfree(orig);
+ return 0;
+err:
+ bch2_darray_str_exit(ret);
+ kfree(orig);
+ return -ENOMEM;
+}
diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h
index b701f7fe0784..b414736d59a5 100644
--- a/fs/bcachefs/util.h
+++ b/fs/bcachefs/util.h
@@ -342,14 +342,24 @@ bool bch2_is_zero(const void *, size_t);
u64 bch2_read_flag_list(char *, const char * const[]);
-void bch2_prt_u64_binary(struct printbuf *, u64, unsigned);
+void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned);
+void bch2_prt_u64_base2(struct printbuf *, u64);
void bch2_print_string_as_lines(const char *prefix, const char *lines);
typedef DARRAY(unsigned long) bch_stacktrace;
-int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *);
+int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t);
void bch2_prt_backtrace(struct printbuf *, bch_stacktrace *);
-int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *);
+int bch2_prt_task_backtrace(struct printbuf *, struct task_struct *, unsigned, gfp_t);
+
+static inline void prt_bdevname(struct printbuf *out, struct block_device *bdev)
+{
+#ifdef __KERNEL__
+ prt_printf(out, "%pg", bdev);
+#else
+ prt_str(out, bdev->name);
+#endif
+}
#define NR_QUANTILES 15
#define QUANTILE_IDX(i) inorder_to_eytzinger0(i, NR_QUANTILES)
@@ -374,8 +384,9 @@ struct bch2_time_stat_buffer {
struct bch2_time_stats {
spinlock_t lock;
/* all fields are in nanoseconds */
- u64 max_duration;
u64 min_duration;
+ u64 max_duration;
+ u64 total_duration;
u64 max_freq;
u64 min_freq;
u64 last_event;
@@ -390,15 +401,39 @@ struct bch2_time_stats {
#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT
void __bch2_time_stats_update(struct bch2_time_stats *stats, u64, u64);
-#else
-static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
-#endif
static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start)
{
__bch2_time_stats_update(stats, start, local_clock());
}
+static inline bool track_event_change(struct bch2_time_stats *stats,
+ u64 *start, bool v)
+{
+ if (v != !!*start) {
+ if (!v) {
+ bch2_time_stats_update(stats, *start);
+ *start = 0;
+ } else {
+ *start = local_clock() ?: 1;
+ return true;
+ }
+ }
+
+ return false;
+}
+#else
+static inline void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) {}
+static inline void bch2_time_stats_update(struct bch2_time_stats *stats, u64 start) {}
+static inline bool track_event_change(struct bch2_time_stats *stats,
+ u64 *start, bool v)
+{
+ bool ret = v && !*start;
+ *start = v;
+ return ret;
+}
+#endif
+
void bch2_time_stats_to_text(struct printbuf *, struct bch2_time_stats *);
void bch2_time_stats_exit(struct bch2_time_stats *);
@@ -831,4 +866,14 @@ static inline int cmp_le32(__le32 l, __le32 r)
#include <linux/uuid.h>
+#define QSTR(n) { { { .len = strlen(n) } }, .name = n }
+
+static inline bool qstr_eq(const struct qstr l, const struct qstr r)
+{
+ return l.len == r.len && !memcmp(l.name, r.name, l.len);
+}
+
+void bch2_darray_str_exit(darray_str *);
+int bch2_split_devs(const char *, darray_str *);
+
#endif /* _BCACHEFS_UTIL_H */
diff --git a/fs/bcachefs/vstructs.h b/fs/bcachefs/vstructs.h
index a6561b4b36a6..2ad338e282da 100644
--- a/fs/bcachefs/vstructs.h
+++ b/fs/bcachefs/vstructs.h
@@ -48,14 +48,14 @@
((void *) ((u64 *) (_s)->_data + __vstruct_u64s(_s)))
#define vstruct_for_each(_s, _i) \
- for (_i = (_s)->start; \
+ for (typeof(&(_s)->start[0]) _i = (_s)->start; \
_i < vstruct_last(_s); \
_i = vstruct_next(_i))
-#define vstruct_for_each_safe(_s, _i, _t) \
- for (_i = (_s)->start; \
- _i < vstruct_last(_s) && (_t = vstruct_next(_i), true); \
- _i = _t)
+#define vstruct_for_each_safe(_s, _i) \
+ for (typeof(&(_s)->start[0]) _next, _i = (_s)->start; \
+ _i < vstruct_last(_s) && (_next = vstruct_next(_i), true); \
+ _i = _next)
#define vstruct_idx(_s, _idx) \
((typeof(&(_s)->start[0])) ((_s)->_data + (_idx)))
diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c
index 5a1858fb9879..9c0d2316031b 100644
--- a/fs/bcachefs/xattr.c
+++ b/fs/bcachefs/xattr.c
@@ -590,8 +590,9 @@ err:
mutex_unlock(&inode->ei_update_lock);
if (value &&
- (opt_id == Opt_background_compression ||
- opt_id == Opt_background_target))
+ (opt_id == Opt_background_target ||
+ opt_id == Opt_background_compression ||
+ (opt_id == Opt_compression && !inode_opt_get(c, &inode->ei_inode, background_compression))))
bch2_set_rebalance_needs_scan(c, inode->ei_inode.bi_inum);
return bch2_err_class(ret);
diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h
new file mode 100644
index 000000000000..e9f810539552
--- /dev/null
+++ b/fs/bcachefs/xattr_format.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BCACHEFS_XATTR_FORMAT_H
+#define _BCACHEFS_XATTR_FORMAT_H
+
+#define KEY_TYPE_XATTR_INDEX_USER 0
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS 1
+#define KEY_TYPE_XATTR_INDEX_POSIX_ACL_DEFAULT 2
+#define KEY_TYPE_XATTR_INDEX_TRUSTED 3
+#define KEY_TYPE_XATTR_INDEX_SECURITY 4
+
+struct bch_xattr {
+ struct bch_val v;
+ __u8 x_type;
+ __u8 x_name_len;
+ __le16 x_val_len;
+ __u8 x_name[];
+} __packed __aligned(8);
+
+#endif /* _BCACHEFS_XATTR_FORMAT_H */
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index a93d76df8ed8..2b4dda047450 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -671,9 +671,6 @@ static struct dentry *befs_get_parent(struct dentry *child)
parent = befs_iget(child->d_sb,
(unsigned long)befs_ino->i_parent.start);
- if (IS_ERR(parent))
- return ERR_CAST(parent);
-
return d_obtain_alias(parent);
}
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index fbc4ae80a4b2..c375e22c4c0c 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -275,11 +275,6 @@ static int bfs_add_entry(struct inode *dir, const struct qstr *child, int ino)
dprintf("name=%s, namelen=%d\n", name, namelen);
- if (!namelen)
- return -ENOENT;
- if (namelen > BFS_NAMELEN)
- return -ENAMETOOLONG;
-
sblock = BFS_I(dir)->i_sblock;
eblock = BFS_I(dir)->i_eblock;
for (block = sblock; block <= eblock; block++) {
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index adc2230079c6..a778411574a9 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -11,6 +11,7 @@
*/
#include <linux/fs.h>
+#include <linux/mpage.h>
#include <linux/buffer_head.h>
#include "bfs.h"
@@ -150,9 +151,10 @@ out:
return err;
}
-static int bfs_writepage(struct page *page, struct writeback_control *wbc)
+static int bfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page, bfs_get_block, wbc);
+ return mpage_writepages(mapping, wbc, bfs_get_block);
}
static int bfs_read_folio(struct file *file, struct folio *folio)
@@ -190,9 +192,10 @@ const struct address_space_operations bfs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = bfs_read_folio,
- .writepage = bfs_writepage,
+ .writepages = bfs_writepages,
.write_begin = bfs_write_begin,
.write_end = generic_write_end,
+ .migrate_folio = buffer_migrate_folio,
.bmap = bfs_bmap,
};
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 206cf1612c1d..1925a0919ca6 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -27,7 +27,7 @@ static bool check_setget_bounds(const struct extent_buffer *eb,
void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
{
token->eb = eb;
- token->kaddr = page_address(eb->pages[0]);
+ token->kaddr = folio_address(eb->folios[0]);
token->offset = 0;
}
@@ -50,7 +50,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
* an offset into the extent buffer page array, cast to a specific type. This
* gives us all the type checking.
*
- * The extent buffer pages stored in the array pages do not form a contiguous
+ * The extent buffer pages stored in the array folios may not form a contiguous
* phyusical range, but the API functions assume the linear offset to the range
* from 0 to metadata node size.
*/
@@ -60,28 +60,30 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_page_index(member_offset); \
- const unsigned long oip = get_eb_offset_in_page(token->eb, \
- member_offset); \
+ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+ const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ member_offset);\
+ const int unit_size = folio_size(token->eb->folios[0]); \
+ const int unit_shift = folio_shift(token->eb->folios[0]); \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
\
ASSERT(token); \
ASSERT(token->kaddr); \
ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
if (token->offset <= member_offset && \
- member_offset + size <= token->offset + PAGE_SIZE) { \
- return get_unaligned_le##bits(token->kaddr + oip); \
+ member_offset + size <= token->offset + unit_size) { \
+ return get_unaligned_le##bits(token->kaddr + oil); \
} \
- token->kaddr = page_address(token->eb->pages[idx]); \
- token->offset = idx << PAGE_SHIFT; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \
- return get_unaligned_le##bits(token->kaddr + oip); \
+ token->kaddr = folio_address(token->eb->folios[idx]); \
+ token->offset = idx << unit_shift; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
+ return get_unaligned_le##bits(token->kaddr + oil); \
\
- memcpy(lebytes, token->kaddr + oip, part); \
- token->kaddr = page_address(token->eb->pages[idx + 1]); \
- token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(lebytes, token->kaddr + oil, part); \
+ token->kaddr = folio_address(token->eb->folios[idx + 1]); \
+ token->offset = (idx + 1) << unit_shift; \
memcpy(lebytes + part, token->kaddr, size - part); \
return get_unaligned_le##bits(lebytes); \
} \
@@ -89,19 +91,21 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb, \
const void *ptr, unsigned long off) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
- const unsigned long idx = get_eb_page_index(member_offset); \
- char *kaddr = page_address(eb->pages[idx]); \
+ const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+ const unsigned long oil = get_eb_offset_in_folio(eb, \
+ member_offset);\
+ const int unit_size = folio_size(eb->folios[0]); \
+ char *kaddr = folio_address(eb->folios[idx]); \
const int size = sizeof(u##bits); \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
- return get_unaligned_le##bits(kaddr + oip); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
+ return get_unaligned_le##bits(kaddr + oil); \
\
- memcpy(lebytes, kaddr + oip, part); \
- kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(lebytes, kaddr + oil, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
memcpy(lebytes + part, kaddr, size - part); \
return get_unaligned_le##bits(lebytes); \
} \
@@ -110,53 +114,59 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token, \
u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long idx = get_eb_page_index(member_offset); \
- const unsigned long oip = get_eb_offset_in_page(token->eb, \
- member_offset); \
+ const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+ const unsigned long oil = get_eb_offset_in_folio(token->eb, \
+ member_offset);\
+ const int unit_size = folio_size(token->eb->folios[0]); \
+ const int unit_shift = folio_shift(token->eb->folios[0]); \
const int size = sizeof(u##bits); \
u8 lebytes[sizeof(u##bits)]; \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
\
ASSERT(token); \
ASSERT(token->kaddr); \
ASSERT(check_setget_bounds(token->eb, ptr, off, size)); \
if (token->offset <= member_offset && \
- member_offset + size <= token->offset + PAGE_SIZE) { \
- put_unaligned_le##bits(val, token->kaddr + oip); \
+ member_offset + size <= token->offset + unit_size) { \
+ put_unaligned_le##bits(val, token->kaddr + oil); \
return; \
} \
- token->kaddr = page_address(token->eb->pages[idx]); \
- token->offset = idx << PAGE_SHIFT; \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
- put_unaligned_le##bits(val, token->kaddr + oip); \
+ token->kaddr = folio_address(token->eb->folios[idx]); \
+ token->offset = idx << unit_shift; \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
+ oil + size <= unit_size) { \
+ put_unaligned_le##bits(val, token->kaddr + oil); \
return; \
} \
put_unaligned_le##bits(val, lebytes); \
- memcpy(token->kaddr + oip, lebytes, part); \
- token->kaddr = page_address(token->eb->pages[idx + 1]); \
- token->offset = (idx + 1) << PAGE_SHIFT; \
+ memcpy(token->kaddr + oil, lebytes, part); \
+ token->kaddr = folio_address(token->eb->folios[idx + 1]); \
+ token->offset = (idx + 1) << unit_shift; \
memcpy(token->kaddr, lebytes + part, size - part); \
} \
void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr, \
unsigned long off, u##bits val) \
{ \
const unsigned long member_offset = (unsigned long)ptr + off; \
- const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
- const unsigned long idx = get_eb_page_index(member_offset); \
- char *kaddr = page_address(eb->pages[idx]); \
+ const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+ const unsigned long oil = get_eb_offset_in_folio(eb, \
+ member_offset);\
+ const int unit_size = folio_size(eb->folios[0]); \
+ char *kaddr = folio_address(eb->folios[idx]); \
const int size = sizeof(u##bits); \
- const int part = PAGE_SIZE - oip; \
+ const int part = unit_size - oil; \
u8 lebytes[sizeof(u##bits)]; \
\
ASSERT(check_setget_bounds(eb, ptr, off, size)); \
- if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
- put_unaligned_le##bits(val, kaddr + oip); \
+ if (INLINE_EXTENT_BUFFER_PAGES == 1 || \
+ oil + size <= unit_size) { \
+ put_unaligned_le##bits(val, kaddr + oil); \
return; \
} \
\
put_unaligned_le##bits(val, lebytes); \
- memcpy(kaddr + oip, lebytes, part); \
- kaddr = page_address(eb->pages[idx + 1]); \
+ memcpy(kaddr + oil, lebytes, part); \
+ kaddr = folio_address(eb->folios[idx + 1]); \
memcpy(kaddr, lebytes + part, size - part); \
}
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index aa0844535644..ed7aa32972ad 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -90,14 +90,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \
static inline u##bits btrfs_##name(const struct extent_buffer *eb) \
{ \
- const type *p = page_address(eb->pages[0]) + \
+ const type *p = folio_address(eb->folios[0]) + \
offset_in_page(eb->start); \
return get_unaligned_le##bits(&p->member); \
} \
static inline void btrfs_set_##name(const struct extent_buffer *eb, \
u##bits val) \
{ \
- type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
+ type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \
put_unaligned_le##bits(val, &p->member); \
}
diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 4f3b693a16b1..928f512cdb4a 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -194,6 +194,12 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
int mirror = repair_bbio->mirror_num;
+ /*
+ * We can only trigger this for data bio, which doesn't support larger
+ * folios yet.
+ */
+ ASSERT(folio_order(page_folio(bv->bv_page)) == 0);
+
if (repair_bbio->bio.bi_status ||
!btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
@@ -215,7 +221,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
repair_bbio->file_offset, fs_info->sectorsize,
repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
- bv->bv_page, bv->bv_offset, mirror);
+ page_folio(bv->bv_page), bv->bv_offset, mirror);
} while (mirror != fbio->bbio->mirror_num);
done:
@@ -626,7 +632,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
/*
* Submit bio to an async queue.
*
- * Return true if the work has been succesfuly submitted, else false.
+ * Return true if the work has been successfully submitted, else false.
*/
static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
struct btrfs_io_context *bioc,
@@ -767,8 +773,8 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
* freeing the bio.
*/
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num)
+ u64 length, u64 logical, struct folio *folio,
+ unsigned int folio_offset, int mirror_num)
{
struct btrfs_io_stripe smap = { 0 };
struct bio_vec bvec;
@@ -799,7 +805,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
- __bio_add_page(&bio, page, length, pg_offset);
+ ret = bio_add_folio(&bio, folio, length, folio_offset);
+ ASSERT(ret);
ret = submit_bio_wait(&bio);
if (ret) {
/* try to remap that extent elsewhere? */
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index ca79decee060..bbaed317161a 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -105,7 +105,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
- u64 length, u64 logical, struct page *page,
- unsigned int pg_offset, int mirror_num);
+ u64 length, u64 logical, struct folio *folio,
+ unsigned int folio_offset, int mirror_num);
#endif
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6e5dc68ff661..378d9103a207 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -168,7 +168,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
cache);
kfree(cache->free_space_ctl);
- kfree(cache->physical_map);
+ btrfs_free_chunk_map(cache->physical_map);
kfree(cache);
}
}
@@ -1047,7 +1047,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
}
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em)
+ struct btrfs_chunk_map *map)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_path *path;
@@ -1059,10 +1059,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
int index;
int factor;
struct btrfs_caching_control *caching_ctl = NULL;
- bool remove_em;
+ bool remove_map;
bool remove_rsv = false;
- block_group = btrfs_lookup_block_group(fs_info, group_start);
+ block_group = btrfs_lookup_block_group(fs_info, map->start);
BUG_ON(!block_group);
BUG_ON(!block_group->ro);
@@ -1252,7 +1252,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* entries because we already removed them all when we called
* btrfs_remove_free_space_cache().
*
- * And we must not remove the extent map from the fs_info->mapping_tree
+ * And we must not remove the chunk map from the fs_info->mapping_tree
* to prevent the same logical address range and physical device space
* ranges from being reused for a new block group. This is needed to
* avoid races with trimming and scrub.
@@ -1268,19 +1268,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
* in place until the extents have been discarded completely when
* the transaction commit has completed.
*/
- remove_em = (atomic_read(&block_group->frozen) == 0);
+ remove_map = (atomic_read(&block_group->frozen) == 0);
spin_unlock(&block_group->lock);
- if (remove_em) {
- struct extent_map_tree *em_tree;
-
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
- /* once for the tree */
- free_extent_map(em);
- }
+ if (remove_map)
+ btrfs_remove_chunk_map(fs_info, map);
out:
/* Once for the lookup reference */
@@ -1295,15 +1287,12 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info, const u64 chunk_offset)
{
struct btrfs_root *root = btrfs_block_group_root(fs_info);
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
unsigned int num_items;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
- ASSERT(em && em->start == chunk_offset);
+ map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(map != NULL);
+ ASSERT(map->start == chunk_offset);
/*
* We need to reserve 3 + N units from the metadata space info in order
@@ -1324,9 +1313,8 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
* more device items and remove one chunk item), but this is done at
* btrfs_remove_chunk() through a call to check_system_chunk().
*/
- map = em->map_lookup;
num_items = 3 + map->num_stripes;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return btrfs_start_transaction_fallback_global_rsv(root, num_items);
}
@@ -1467,6 +1455,7 @@ out:
*/
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
{
+ LIST_HEAD(retry_list);
struct btrfs_block_group *block_group;
struct btrfs_space_info *space_info;
struct btrfs_trans_handle *trans;
@@ -1488,6 +1477,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
spin_lock(&fs_info->unused_bgs_lock);
while (!list_empty(&fs_info->unused_bgs)) {
+ u64 used;
int trimming;
block_group = list_first_entry(&fs_info->unused_bgs,
@@ -1523,9 +1513,9 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
+ spin_lock(&space_info->lock);
spin_lock(&block_group->lock);
- if (block_group->reserved || block_group->pinned ||
- block_group->used || block_group->ro ||
+ if (btrfs_is_block_group_used(block_group) || block_group->ro ||
list_is_singular(&block_group->list)) {
/*
* We want to bail if we made new allocations or have
@@ -1535,10 +1525,49 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
*/
trace_btrfs_skip_unused_block_group(block_group);
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+
+ /*
+ * The block group may be unused but there may be space reserved
+ * accounting with the existence of that block group, that is,
+ * space_info->bytes_may_use was incremented by a task but no
+ * space was yet allocated from the block group by the task.
+ * That space may or may not be allocated, as we are generally
+ * pessimistic about space reservation for metadata as well as
+ * for data when using compression (as we reserve space based on
+ * the worst case, when data can't be compressed, and before
+ * actually attempting compression, before starting writeback).
+ *
+ * So check if the total space of the space_info minus the size
+ * of this block group is less than the used space of the
+ * space_info - if that's the case, then it means we have tasks
+ * that might be relying on the block group in order to allocate
+ * extents, and add back the block group to the unused list when
+ * we finish, so that we retry later in case no tasks ended up
+ * needing to allocate extents from the block group.
+ */
+ used = btrfs_space_info_used(space_info, true);
+ if (space_info->total_bytes - block_group->length < used) {
+ /*
+ * Add a reference for the list, compensate for the ref
+ * drop under the "next" label for the
+ * fs_info->unused_bgs list.
+ */
+ btrfs_get_block_group(block_group);
+ list_add_tail(&block_group->bg_list, &retry_list);
+
+ trace_btrfs_skip_unused_block_group(block_group);
+ spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
up_write(&space_info->groups_sem);
goto next;
}
+
spin_unlock(&block_group->lock);
+ spin_unlock(&space_info->lock);
/* We don't want to force the issue, only flip if it's ok. */
ret = inc_block_group_ro(block_group, 0);
@@ -1662,12 +1691,16 @@ next:
btrfs_put_block_group(block_group);
spin_lock(&fs_info->unused_bgs_lock);
}
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
+ spin_lock(&fs_info->unused_bgs_lock);
+ list_splice_tail(&retry_list, &fs_info->unused_bgs);
+ spin_unlock(&fs_info->unused_bgs_lock);
mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
@@ -1927,8 +1960,7 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct btrfs_path *path)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
struct btrfs_block_group_item bg;
struct extent_buffer *leaf;
int slot;
@@ -1938,23 +1970,20 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
slot = path->slots[0];
leaf = path->nodes[0];
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
- read_unlock(&em_tree->lock);
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
+ if (!map) {
btrfs_err(fs_info,
"logical %llu len %llu found bg but no related chunk",
key->objectid, key->offset);
return -ENOENT;
}
- if (em->start != key->objectid || em->len != key->offset) {
+ if (map->start != key->objectid || map->chunk_len != key->offset) {
btrfs_err(fs_info,
"block group %llu len %llu mismatch with chunk %llu len %llu",
- key->objectid, key->offset, em->start, em->len);
+ key->objectid, key->offset, map->start, map->chunk_len);
ret = -EUCLEAN;
- goto out_free_em;
+ goto out_free_map;
}
read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
@@ -1962,16 +1991,16 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
flags = btrfs_stack_block_group_flags(&bg) &
BTRFS_BLOCK_GROUP_TYPE_MASK;
- if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
key->objectid, key->offset, flags,
- (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
+ (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
ret = -EUCLEAN;
}
-out_free_em:
- free_extent_map(em);
+out_free_map:
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2024,8 +2053,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
u64 physical, u64 **logical, int *naddrs, int *stripe_len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 *buf;
u64 bytenr;
u64 data_stripe_length;
@@ -2033,14 +2061,13 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
int i, nr = 0;
int ret = 0;
- em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+ if (IS_ERR(map))
return -EIO;
- map = em->map_lookup;
- data_stripe_length = em->orig_block_len;
+ data_stripe_length = map->stripe_size;
io_stripe_size = BTRFS_STRIPE_LEN;
- chunk_start = em->start;
+ chunk_start = map->start;
/* For RAID5/6 adjust to a full IO stripe length */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -2094,7 +2121,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
*naddrs = nr;
*stripe_len = io_stripe_size;
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2199,49 +2226,47 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
*/
static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct btrfs_block_group *bg;
u64 start = 0;
int ret = 0;
while (1) {
- read_lock(&map_tree->lock);
+ struct btrfs_chunk_map *map;
+ struct btrfs_block_group *bg;
+
/*
- * lookup_extent_mapping will return the first extent map
- * intersecting the range, so setting @len to 1 is enough to
+ * btrfs_find_chunk_map() will return the first chunk map
+ * intersecting the range, so setting @length to 1 is enough to
* get the first chunk.
*/
- em = lookup_extent_mapping(map_tree, start, 1);
- read_unlock(&map_tree->lock);
- if (!em)
+ map = btrfs_find_chunk_map(fs_info, start, 1);
+ if (!map)
break;
- bg = btrfs_lookup_block_group(fs_info, em->start);
+ bg = btrfs_lookup_block_group(fs_info, map->start);
if (!bg) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu doesn't have corresponding block group",
- em->start, em->len);
+ map->start, map->chunk_len);
ret = -EUCLEAN;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
break;
}
- if (bg->start != em->start || bg->length != em->len ||
+ if (bg->start != map->start || bg->length != map->chunk_len ||
(bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
- (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+ (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
- em->start, em->len,
- em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+ map->start, map->chunk_len,
+ map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
bg->start, bg->length,
bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
ret = -EUCLEAN;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
break;
}
- start = em->start + em->len;
- free_extent_map(em);
+ start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
btrfs_put_block_group(bg);
}
return ret;
@@ -2369,28 +2394,25 @@ error:
static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
struct rb_node *node;
int ret = 0;
- for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
- struct extent_map *em;
- struct map_lookup *map;
+ for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+ struct btrfs_chunk_map *map;
struct btrfs_block_group *bg;
- em = rb_entry(node, struct extent_map, rb_node);
- map = em->map_lookup;
- bg = btrfs_create_block_group_cache(fs_info, em->start);
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ bg = btrfs_create_block_group_cache(fs_info, map->start);
if (!bg) {
ret = -ENOMEM;
break;
}
/* Fill dummy cache as FULL */
- bg->length = em->len;
+ bg->length = map->chunk_len;
bg->flags = map->type;
bg->cached = BTRFS_CACHE_FINISHED;
- bg->used = em->len;
+ bg->used = map->chunk_len;
bg->flags = map->type;
ret = btrfs_add_block_group_cache(fs_info, bg);
/*
@@ -2618,19 +2640,14 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_device *device;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 dev_offset;
- u64 stripe_size;
int i;
int ret = 0;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
- if (IS_ERR(em))
- return PTR_ERR(em);
-
- map = em->map_lookup;
- stripe_size = em->orig_block_len;
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
/*
* Take the device list mutex to prevent races with the final phase of
@@ -2647,13 +2664,13 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
dev_offset = map->stripes[i].physical;
ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
- stripe_size);
+ map->stripe_size);
if (ret)
break;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -2712,6 +2729,37 @@ next:
btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info);
list_del_init(&block_group->bg_list);
clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
+
+ /*
+ * If the block group is still unused, add it to the list of
+ * unused block groups. The block group may have been created in
+ * order to satisfy a space reservation, in which case the
+ * extent allocation only happens later. But often we don't
+ * actually need to allocate space that we previously reserved,
+ * so the block group may become unused for a long time. For
+ * example for metadata we generally reserve space for a worst
+ * possible scenario, but then don't end up allocating all that
+ * space or none at all (due to no need to COW, extent buffers
+ * were already COWed in the current transaction and still
+ * unwritten, tree heights lower than the maximum possible
+ * height, etc). For data we generally reserve the axact amount
+ * of space we are going to allocate later, the exception is
+ * when using compression, as we must reserve space based on the
+ * uncompressed data size, because the compression is only done
+ * when writeback triggered and we don't know how much space we
+ * are actually going to need, so we reserve the uncompressed
+ * size because the data may be uncompressible in the worst case.
+ */
+ if (ret == 0) {
+ bool used;
+
+ spin_lock(&block_group->lock);
+ used = btrfs_is_block_group_used(block_group);
+ spin_unlock(&block_group->lock);
+
+ if (!used)
+ btrfs_mark_bg_unused(block_group);
+ }
}
btrfs_trans_release_chunk_metadata(trans);
}
@@ -2910,7 +2958,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
goto unlock_out;
/*
- * Skip chunk alloction if the bg is SYSTEM, this is to avoid system
+ * Skip chunk allocation if the bg is SYSTEM, this is to avoid system
* chunk allocation storm to exhaust the system chunk array. Otherwise
* we still want to try our best to mark the block group read-only.
*/
@@ -4406,8 +4454,6 @@ void btrfs_freeze_block_group(struct btrfs_block_group *cache)
void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct extent_map_tree *em_tree;
- struct extent_map *em;
bool cleanup;
spin_lock(&block_group->lock);
@@ -4416,17 +4462,16 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
spin_unlock(&block_group->lock);
if (cleanup) {
- em_tree = &fs_info->mapping_tree;
- write_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, block_group->start,
- 1);
- BUG_ON(!em); /* logic error, can't happen */
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* once for us and once for the tree */
- free_extent_map(em);
- free_extent_map(em);
+ struct btrfs_chunk_map *map;
+
+ map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
+ /* Logic error, can't happen. */
+ ASSERT(map);
+
+ btrfs_remove_chunk_map(fs_info, map);
+
+ /* Once for our lookup reference. */
+ btrfs_free_chunk_map(map);
/*
* We may have left one free space entry and other possible
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 2bdbcb834f95..962b11983901 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -5,6 +5,8 @@
#include "free-space-cache.h"
+struct btrfs_chunk_map;
+
enum btrfs_disk_cache_state {
BTRFS_DC_WRITTEN,
BTRFS_DC_ERROR,
@@ -243,7 +245,7 @@ struct btrfs_block_group {
u64 zone_unusable;
u64 zone_capacity;
u64 meta_write_pointer;
- struct map_lookup *physical_map;
+ struct btrfs_chunk_map *physical_map;
struct list_head active_bg_list;
struct work_struct zone_finish_work;
struct extent_buffer *last_eb;
@@ -255,6 +257,13 @@ static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
return (block_group->start + block_group->length);
}
+static inline bool btrfs_is_block_group_used(const struct btrfs_block_group *bg)
+{
+ lockdep_assert_held(&bg->lock);
+
+ return (bg->used > 0 || bg->reserved > 0 || bg->pinned > 0);
+}
+
static inline bool btrfs_is_block_group_data_only(
struct btrfs_block_group *block_group)
{
@@ -297,7 +306,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
struct btrfs_fs_info *fs_info,
const u64 chunk_offset);
int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
- u64 group_start, struct extent_map *em);
+ struct btrfs_chunk_map *map);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
void btrfs_reclaim_bgs_work(struct work_struct *work);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index ceb5f586a2d5..1043a8142351 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -494,7 +494,7 @@ struct btrfs_block_rsv *btrfs_use_block_rsv(struct btrfs_trans_handle *trans,
block_rsv = get_block_rsv(trans, root);
- if (unlikely(block_rsv->size == 0))
+ if (unlikely(btrfs_block_rsv_size(block_rsv) == 0))
goto try_reserve;
again:
ret = btrfs_block_rsv_use_bytes(block_rsv, blocksize);
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index b0bd12b8652f..43a9a6b5a79f 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -101,4 +101,36 @@ static inline bool btrfs_block_rsv_full(const struct btrfs_block_rsv *rsv)
return data_race(rsv->full);
}
+/*
+ * Get the reserved mount of a block reserve in a context where getting a stale
+ * value is acceptable, instead of accessing it directly and trigger data race
+ * warning from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_reserved(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->reserved;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
+/*
+ * Get the size of a block reserve in a context where getting a stale value is
+ * acceptable, instead of accessing it directly and trigger data race warning
+ * from KCSAN.
+ */
+static inline u64 btrfs_block_rsv_size(struct btrfs_block_rsv *rsv)
+{
+ u64 ret;
+
+ spin_lock(&rsv->lock);
+ ret = rsv->size;
+ spin_unlock(&rsv->lock);
+
+ return ret;
+}
+
#endif /* BTRFS_BLOCK_RSV_H */
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5572ae52444e..7f7c5a92d2b8 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -69,6 +69,8 @@ enum {
BTRFS_INODE_VERITY_IN_PROGRESS,
/* Set when this inode is a free space inode. */
BTRFS_INODE_FREE_SPACE_INODE,
+ /* Set when there are no capabilities in XATTs for the inode. */
+ BTRFS_INODE_NO_CAP_XATTR,
};
/* in memory btrfs inode */
@@ -107,9 +109,11 @@ struct btrfs_inode {
/*
* Keep track of where the inode has extent items mapped in order to
- * make sure the i_size adjustments are accurate
+ * make sure the i_size adjustments are accurate. Not required when the
+ * filesystem is NO_HOLES, the status can't be set while mounted as
+ * it's a mkfs-time feature.
*/
- struct extent_io_tree file_extent_tree;
+ struct extent_io_tree *file_extent_tree;
/* held while logging the inode in tree-log.c */
struct mutex log_mutex;
@@ -487,7 +491,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
- u64 start, u64 end);
+ u64 start, u64 len);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 19b22b4653c8..68345f73d429 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -20,6 +20,7 @@
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
+#include <linux/shrinker.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
@@ -140,16 +141,16 @@ static int compression_decompress_bio(struct list_head *ws,
}
static int compression_decompress(int type, struct list_head *ws,
- const u8 *data_in, struct page *dest_page,
- unsigned long start_byte, size_t srclen, size_t destlen)
+ const u8 *data_in, struct page *dest_page,
+ unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
switch (type) {
case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_LZO: return lzo_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
case BTRFS_COMPRESS_NONE:
default:
/*
@@ -163,13 +164,107 @@ static int compression_decompress(int type, struct list_head *ws,
static void btrfs_free_compressed_pages(struct compressed_bio *cb)
{
for (unsigned int i = 0; i < cb->nr_pages; i++)
- put_page(cb->compressed_pages[i]);
+ btrfs_free_compr_page(cb->compressed_pages[i]);
kfree(cb->compressed_pages);
}
static int btrfs_decompress_bio(struct compressed_bio *cb);
-static void end_compressed_bio_read(struct btrfs_bio *bbio)
+/*
+ * Global cache of last unused pages for compression/decompression.
+ */
+static struct btrfs_compr_pool {
+ struct shrinker *shrinker;
+ spinlock_t lock;
+ struct list_head list;
+ int count;
+ int thresh;
+} compr_pool;
+
+static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc)
+{
+ int ret;
+
+ /*
+ * We must not read the values more than once if 'ret' gets expanded in
+ * the return statement so we don't accidentally return a negative
+ * number, even if the first condition finds it positive.
+ */
+ ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh);
+
+ return ret > 0 ? ret : 0;
+}
+
+static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc)
+{
+ struct list_head remove;
+ struct list_head *tmp, *next;
+ int freed;
+
+ if (compr_pool.count == 0)
+ return SHRINK_STOP;
+
+ INIT_LIST_HEAD(&remove);
+
+ /* For now, just simply drain the whole list. */
+ spin_lock(&compr_pool.lock);
+ list_splice_init(&compr_pool.list, &remove);
+ freed = compr_pool.count;
+ compr_pool.count = 0;
+ spin_unlock(&compr_pool.lock);
+
+ list_for_each_safe(tmp, next, &remove) {
+ struct page *page = list_entry(tmp, struct page, lru);
+
+ ASSERT(page_ref_count(page) == 1);
+ put_page(page);
+ }
+
+ return freed;
+}
+
+/*
+ * Common wrappers for page allocation from compression wrappers
+ */
+struct page *btrfs_alloc_compr_page(void)
+{
+ struct page *page = NULL;
+
+ spin_lock(&compr_pool.lock);
+ if (compr_pool.count > 0) {
+ page = list_first_entry(&compr_pool.list, struct page, lru);
+ list_del_init(&page->lru);
+ compr_pool.count--;
+ }
+ spin_unlock(&compr_pool.lock);
+
+ if (page)
+ return page;
+
+ return alloc_page(GFP_NOFS);
+}
+
+void btrfs_free_compr_page(struct page *page)
+{
+ bool do_free = false;
+
+ spin_lock(&compr_pool.lock);
+ if (compr_pool.count > compr_pool.thresh) {
+ do_free = true;
+ } else {
+ list_add(&page->lru, &compr_pool.list);
+ compr_pool.count++;
+ }
+ spin_unlock(&compr_pool.lock);
+
+ if (!do_free)
+ return;
+
+ ASSERT(page_ref_count(page) == 1);
+ put_page(page);
+}
+
+static void end_bbio_comprssed_read(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
blk_status_t status = bbio->bio.bi_status;
@@ -211,8 +306,8 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
for (i = 0; i < ret; i++) {
struct folio *folio = fbatch.folios[i];
- btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
- cb->start, cb->len);
+ btrfs_folio_clamp_clear_writeback(fs_info, folio,
+ cb->start, cb->len);
}
folio_batch_release(&fbatch);
}
@@ -242,7 +337,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
* This also calls the writeback end hooks for the file pages so that metadata
* and checksums can be updated in the file.
*/
-static void end_compressed_bio_write(struct btrfs_bio *bbio)
+static void end_bbio_comprssed_write(struct btrfs_bio *bbio)
{
struct compressed_bio *cb = to_compressed_bio(bbio);
struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
@@ -289,7 +384,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,
cb = alloc_compressed_bio(inode, ordered->file_offset,
REQ_OP_WRITE | write_flags,
- end_compressed_bio_write);
+ end_bbio_comprssed_write);
cb->start = ordered->file_offset;
cb->len = ordered->num_bytes;
cb->compressed_pages = compressed_pages;
@@ -446,7 +541,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* subpage::readers and to unlock the page.
*/
if (fs_info->sectorsize < PAGE_SIZE)
- btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+ btrfs_subpage_start_reader(fs_info, page_folio(page),
+ cur, add_size);
put_page(page);
cur += add_size;
}
@@ -489,11 +585,11 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
goto out;
}
- ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
+ ASSERT(extent_map_is_compressed(em));
compressed_len = em->block_len;
cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
- end_compressed_bio_read);
+ end_bbio_comprssed_read);
cb->start = em->orig_start;
em_len = em->len;
@@ -501,7 +597,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
cb->len = bbio->bio.bi_iter.bi_size;
cb->compressed_len = compressed_len;
- cb->compress_type = em->compress_type;
+ cb->compress_type = extent_map_compression(em);
cb->orig_bbio = bbio;
free_extent_map(em);
@@ -513,7 +609,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
goto out_free_bio;
}
- ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
+ ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages, 0);
if (ret2) {
ret = BLK_STS_RESOURCE;
goto out_free_compressed_pages;
@@ -941,14 +1037,23 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
* start_byte tells us the offset into the compressed data we're interested in
*/
int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
- unsigned long start_byte, size_t srclen, size_t destlen)
+ unsigned long dest_pgoff, size_t srclen, size_t destlen)
{
+ struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
struct list_head *workspace;
+ const u32 sectorsize = fs_info->sectorsize;
int ret;
+ /*
+ * The full destination page range should not exceed the page size.
+ * And the @destlen should not exceed sectorsize, as this is only called for
+ * inline file extents, which should not exceed sectorsize.
+ */
+ ASSERT(dest_pgoff + destlen <= PAGE_SIZE && destlen <= sectorsize);
+
workspace = get_workspace(type, 0);
ret = compression_decompress(type, workspace, data_in, dest_page,
- start_byte, srclen, destlen);
+ dest_pgoff, srclen, destlen);
put_workspace(type, workspace);
return ret;
@@ -960,15 +1065,36 @@ int __init btrfs_init_compress(void)
offsetof(struct compressed_bio, bbio.bio),
BIOSET_NEED_BVECS))
return -ENOMEM;
+
+ compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages");
+ if (!compr_pool.shrinker)
+ return -ENOMEM;
+
btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
zstd_init_workspace_manager();
+
+ spin_lock_init(&compr_pool.lock);
+ INIT_LIST_HEAD(&compr_pool.list);
+ compr_pool.count = 0;
+ /* 128K / 4K = 32, for 8 threads is 256 pages. */
+ compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8;
+ compr_pool.shrinker->count_objects = btrfs_compr_pool_count;
+ compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan;
+ compr_pool.shrinker->batch = 32;
+ compr_pool.shrinker->seeks = DEFAULT_SEEKS;
+ shrinker_register(compr_pool.shrinker);
+
return 0;
}
void __cold btrfs_exit_compress(void)
{
+ /* For now scan drains all pages and does not touch the parameters. */
+ btrfs_compr_pool_scan(NULL, NULL);
+ shrinker_free(compr_pool.shrinker);
+
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 03bb9d143fa7..afd7e50d073d 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -32,6 +32,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);
#define BTRFS_ZLIB_DEFAULT_LEVEL 3
+struct page;
+
struct compressed_bio {
/* Number of compressed pages in the array */
unsigned int nr_pages;
@@ -96,6 +98,9 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);
+struct page *btrfs_alloc_compr_page(void);
+void btrfs_free_compr_page(struct page *page);
+
enum btrfs_compression_type {
BTRFS_COMPRESS_NONE = 0,
BTRFS_COMPRESS_ZLIB = 1,
@@ -143,7 +148,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *zlib_alloc_workspace(unsigned int level);
void zlib_free_workspace(struct list_head *ws);
@@ -154,7 +159,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen);
struct list_head *lzo_alloc_workspace(unsigned int level);
void lzo_free_workspace(struct list_head *ws);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35c1d24d4a78..e65e012bac55 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -370,33 +370,41 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
/*
* check if the tree block can be shared by multiple trees
*/
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf)
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf)
{
+ const u64 buf_gen = btrfs_header_generation(buf);
+
/*
* Tree blocks not in shareable trees and tree roots are never shared.
* If a block was allocated after the last snapshot and the block was
* not allocated by tree relocation, we know the block is not shared.
*/
- if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
- buf != root->node &&
- (btrfs_header_generation(buf) <=
- btrfs_root_last_snapshot(&root->root_item) ||
- btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) {
- if (buf != root->commit_root)
- return 1;
- /*
- * An extent buffer that used to be the commit root may still be
- * shared because the tree height may have increased and it
- * became a child of a higher level root. This can happen when
- * snapshotting a subvolume created in the current transaction.
- */
- if (btrfs_header_generation(buf) == trans->transid)
- return 1;
- }
- return 0;
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+ return false;
+
+ if (buf == root->node)
+ return false;
+
+ if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
+ !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
+ return false;
+
+ if (buf != root->commit_root)
+ return true;
+
+ /*
+ * An extent buffer that used to be the commit root may still be shared
+ * because the tree height may have increased and it became a child of a
+ * higher level root. This can happen when snapshotting a subvolume
+ * created in the current transaction.
+ */
+ if (buf_gen == trans->transid)
+ return true;
+
+ return false;
}
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
@@ -812,7 +820,8 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
}
while (low < high) {
- unsigned long oip;
+ const int unit_size = folio_size(eb->folios[0]);
+ unsigned long oil;
unsigned long offset;
struct btrfs_disk_key *tmp;
struct btrfs_disk_key unaligned;
@@ -820,14 +829,14 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
mid = (low + high) / 2;
offset = p + mid * item_size;
- oip = offset_in_page(offset);
+ oil = get_eb_offset_in_folio(eb, offset);
- if (oip + key_size <= PAGE_SIZE) {
- const unsigned long idx = get_eb_page_index(offset);
- char *kaddr = page_address(eb->pages[idx]);
+ if (oil + key_size <= unit_size) {
+ const unsigned long idx = get_eb_folio_index(eb, offset);
+ char *kaddr = folio_address(eb->folios[idx]);
- oip = get_eb_offset_in_page(eb, offset);
- tmp = (struct btrfs_disk_key *)(kaddr + oip);
+ oil = get_eb_offset_in_folio(eb, offset);
+ tmp = (struct btrfs_disk_key *)(kaddr + oil);
} else {
read_extent_buffer(eb, &unaligned, offset, key_size);
tmp = &unaligned;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 196c005c31f6..70e828d33177 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -212,8 +212,6 @@ struct btrfs_root {
u64 last_trans;
- u32 type;
-
u64 free_objectid;
struct btrfs_key defrag_progress;
@@ -224,18 +222,15 @@ struct btrfs_root {
struct list_head root_list;
- spinlock_t log_extents_lock[2];
- struct list_head logged_list[2];
-
spinlock_t inode_lock;
/* red-black tree that keeps track of in-memory inodes */
struct rb_root inode_tree;
/*
- * radix tree that keeps track of delayed nodes of every inode,
- * protected by inode_lock
+ * Xarray that keeps track of delayed nodes of every inode, protected
+ * by @inode_lock.
*/
- struct radix_tree_root delayed_nodes_tree;
+ struct xarray delayed_nodes;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
@@ -561,9 +556,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer **cow_ret, u64 new_root_objectid);
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct extent_buffer *buf);
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct extent_buffer *buf);
int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct btrfs_path *path, int level, int slot);
void btrfs_extend_item(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 5244561e2016..5b0b64571418 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -775,7 +775,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
* this em, as either we don't care about the generation, or the
* merged extent map will be rejected anyway.
*/
- if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+ if (em && (em->flags & EXTENT_FLAG_MERGED) &&
newer_than && em->generation >= newer_than) {
free_extent_map(em);
em = NULL;
@@ -802,7 +802,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
const struct extent_map *em)
{
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+ if (extent_map_is_compressed(em))
return BTRFS_MAX_COMPRESSED;
return fs_info->max_extent_size;
}
@@ -828,7 +828,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
/* No more em or hole */
if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
goto out;
- if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+ if (next->flags & EXTENT_FLAG_PREALLOC)
goto out;
/*
* If the next extent is at its max capacity, defragging current extent
@@ -996,10 +996,9 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
em->len <= inode->root->fs_info->max_inline)
goto next;
- /* Skip hole/delalloc/preallocated extents */
+ /* Skip holes and preallocated extents. */
if (em->block_start == EXTENT_MAP_HOLE ||
- em->block_start == EXTENT_MAP_DELALLOC ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ (em->flags & EXTENT_FLAG_PREALLOC))
goto next;
/* Skip older extent */
@@ -1047,7 +1046,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
goto add;
/* Skip too large extent */
- if (range_len >= extent_thresh)
+ if (em->len >= extent_thresh)
goto next;
/*
@@ -1190,7 +1189,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
/* Update the page status */
for (i = start_index - first_index; i <= last_index - first_index; i++) {
ClearPageChecked(pages[i]);
- btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
+ btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len);
}
btrfs_delalloc_release_extents(inode, len);
extent_changeset_free(data_reserved);
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 2833e8ef4c09..acf9f4b6c044 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -245,7 +245,6 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
u64 reserve_size = 0;
u64 qgroup_rsv_size = 0;
- u64 csum_leaves;
unsigned outstanding_extents;
lockdep_assert_held(&inode->lock);
@@ -260,10 +259,12 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
outstanding_extents);
reserve_size += btrfs_calc_metadata_size(fs_info, 1);
}
- csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
- inode->csum_bytes);
- reserve_size += btrfs_calc_insert_metadata_size(fs_info,
- csum_leaves);
+ if (!(inode->flags & BTRFS_INODE_NODATASUM)) {
+ u64 csum_leaves;
+
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
+ reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
+ }
/*
* For qgroup rsv, the calculation is very simple:
* account one nodesize for each outstanding extent
@@ -278,14 +279,20 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
spin_unlock(&block_rsv->lock);
}
-static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
+static void calc_inode_reservations(struct btrfs_inode *inode,
u64 num_bytes, u64 disk_num_bytes,
u64 *meta_reserve, u64 *qgroup_reserve)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
u64 nr_extents = count_max_extents(fs_info, num_bytes);
- u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+ u64 csum_leaves;
u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ csum_leaves = 0;
+ else
+ csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
+
*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
nr_extents + csum_leaves);
@@ -337,7 +344,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* everything out and try again, which is bad. This way we just
* over-reserve slightly, and clean up the mess when we are done.
*/
- calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
+ calc_inode_reservations(inode, num_bytes, disk_num_bytes,
&meta_reserve, &qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
noflush);
@@ -359,7 +366,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
nr_extents = count_max_extents(fs_info, num_bytes);
spin_lock(&inode->lock);
btrfs_mod_outstanding_extents(inode, nr_extents);
- inode->csum_bytes += disk_num_bytes;
+ if (!(inode->flags & BTRFS_INODE_NODATASUM))
+ inode->csum_bytes += disk_num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
@@ -393,7 +401,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
spin_lock(&inode->lock);
- inode->csum_bytes -= num_bytes;
+ if (!(inode->flags & BTRFS_INODE_NODATASUM))
+ inode->csum_bytes -= num_bytes;
btrfs_calculate_inode_block_rsv_size(fs_info, inode);
spin_unlock(&inode->lock);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 7381241334e8..08102883f560 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -71,7 +71,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
}
spin_lock(&root->inode_lock);
- node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+ node = xa_load(&root->delayed_nodes, ino);
if (node) {
if (btrfs_inode->delayed_node) {
@@ -83,9 +83,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
/*
* It's possible that we're racing into the middle of removing
- * this node from the radix tree. In this case, the refcount
+ * this node from the xarray. In this case, the refcount
* was zero and it should never go back to one. Just return
- * NULL like it was never in the radix at all; our release
+ * NULL like it was never in the xarray at all; our release
* function is in the process of removing it.
*
* Some implementations of refcount_inc refuse to bump the
@@ -93,7 +93,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* here, refcount_inc() may decide to just WARN_ONCE() instead
* of actually bumping the refcount.
*
- * If this node is properly in the radix, we want to bump the
+ * If this node is properly in the xarray, we want to bump the
* refcount twice, once for the inode and once for this get
* operation.
*/
@@ -120,6 +120,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
struct btrfs_root *root = btrfs_inode->root;
u64 ino = btrfs_ino(btrfs_inode);
int ret;
+ void *ptr;
again:
node = btrfs_get_delayed_node(btrfs_inode);
@@ -131,26 +132,30 @@ again:
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
- /* cached in the btrfs inode and can be accessed */
+ /* Cached in the inode and can be accessed. */
refcount_set(&node->refs, 2);
- ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
+ /* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
+ ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
+ if (ret == -ENOMEM) {
kmem_cache_free(delayed_node_cache, node);
- return ERR_PTR(ret);
+ return ERR_PTR(-ENOMEM);
}
-
spin_lock(&root->inode_lock);
- ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
- if (ret == -EEXIST) {
+ ptr = xa_load(&root->delayed_nodes, ino);
+ if (ptr) {
+ /* Somebody inserted it, go back and read it. */
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, node);
- radix_tree_preload_end();
+ node = NULL;
goto again;
}
+ ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
+ ASSERT(xa_err(ptr) != -EINVAL);
+ ASSERT(xa_err(ptr) != -ENOMEM);
+ ASSERT(ptr == NULL);
btrfs_inode->delayed_node = node;
spin_unlock(&root->inode_lock);
- radix_tree_preload_end();
return node;
}
@@ -269,8 +274,7 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
- radix_tree_delete(&root->delayed_nodes_tree,
- delayed_node->inode_id);
+ xa_erase(&root->delayed_nodes, delayed_node->inode_id);
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
@@ -1036,14 +1040,33 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
goto out;
- path->slots[0]++;
- if (path->slots[0] >= btrfs_header_nritems(leaf))
- goto search;
-again:
+ /*
+ * Now we're going to delete the INODE_REF/EXTREF, which should be the
+ * only one ref left. Check if the next item is an INODE_REF/EXTREF.
+ *
+ * But if we're the last item already, release and search for the last
+ * INODE_REF/EXTREF.
+ */
+ if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) {
+ key.objectid = node->inode_id;
+ key.type = BTRFS_INODE_EXTREF_KEY;
+ key.offset = (u64)-1;
+
+ btrfs_release_path(path);
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret < 0)
+ goto err_out;
+ ASSERT(ret > 0);
+ ASSERT(path->slots[0] > 0);
+ ret = 0;
+ path->slots[0]--;
+ leaf = path->nodes[0];
+ } else {
+ path->slots[0]++;
+ }
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != node->inode_id)
goto out;
-
if (key.type != BTRFS_INODE_REF_KEY &&
key.type != BTRFS_INODE_EXTREF_KEY)
goto out;
@@ -1070,22 +1093,6 @@ err_out:
btrfs_abort_transaction(trans, ret);
return ret;
-
-search:
- btrfs_release_path(path);
-
- key.type = BTRFS_INODE_EXTREF_KEY;
- key.offset = -1;
-
- ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
- if (ret < 0)
- goto err_out;
- ASSERT(ret);
-
- ret = 0;
- leaf = path->nodes[0];
- path->slots[0]--;
- goto again;
}
static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
@@ -2035,34 +2042,36 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
- u64 inode_id = 0;
+ unsigned long index = 0;
struct btrfs_delayed_node *delayed_nodes[8];
- int i, n;
while (1) {
+ struct btrfs_delayed_node *node;
+ int count;
+
spin_lock(&root->inode_lock);
- n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
- (void **)delayed_nodes, inode_id,
- ARRAY_SIZE(delayed_nodes));
- if (!n) {
+ if (xa_empty(&root->delayed_nodes)) {
spin_unlock(&root->inode_lock);
- break;
+ return;
}
- inode_id = delayed_nodes[n - 1]->inode_id + 1;
- for (i = 0; i < n; i++) {
+ count = 0;
+ xa_for_each_start(&root->delayed_nodes, index, node, index) {
/*
* Don't increase refs in case the node is dead and
* about to be removed from the tree in the loop below
*/
- if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
- delayed_nodes[i] = NULL;
+ if (refcount_inc_not_zero(&node->refs)) {
+ delayed_nodes[count] = node;
+ count++;
+ }
+ if (count >= ARRAY_SIZE(delayed_nodes))
+ break;
}
spin_unlock(&root->inode_lock);
+ index++;
- for (i = 0; i < n; i++) {
- if (!delayed_nodes[i])
- continue;
+ for (int i = 0; i < count; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f9544fda38e9..79c4293ddf37 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -550,8 +550,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
u64 physical)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 chunk_offset = cache->start;
int num_extents, cur_extent;
int i;
@@ -567,9 +566,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
}
spin_unlock(&cache->lock);
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- ASSERT(!IS_ERR(em));
- map = em->map_lookup;
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ ASSERT(!IS_ERR(map));
num_extents = 0;
cur_extent = 0;
@@ -583,7 +581,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
cur_extent = i;
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
if (num_extents > 1 && cur_extent < num_extents - 1) {
/*
@@ -727,6 +725,23 @@ leave:
return ret;
}
+static int btrfs_check_replace_dev_names(struct btrfs_ioctl_dev_replace_args *args)
+{
+ if (args->start.srcdevid == 0) {
+ if (memchr(args->start.srcdev_name, 0,
+ sizeof(args->start.srcdev_name)) == NULL)
+ return -ENAMETOOLONG;
+ } else {
+ args->start.srcdev_name[0] = 0;
+ }
+
+ if (memchr(args->start.tgtdev_name, 0,
+ sizeof(args->start.tgtdev_name)) == NULL)
+ return -ENAMETOOLONG;
+
+ return 0;
+}
+
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args)
{
@@ -739,10 +754,9 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
default:
return -EINVAL;
}
-
- if ((args->start.srcdevid == 0 && args->start.srcdev_name[0] == '\0') ||
- args->start.tgtdev_name[0] == '\0')
- return -EINVAL;
+ ret = btrfs_check_replace_dev_names(args);
+ if (ret < 0)
+ return ret;
ret = btrfs_dev_replace_start(fs_info, args->start.tgtdev_name,
args->start.srcdevid,
@@ -812,25 +826,23 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
struct btrfs_device *srcdev,
struct btrfs_device *tgtdev)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
u64 start = 0;
int i;
- write_lock(&em_tree->lock);
+ write_lock(&fs_info->mapping_tree_lock);
do {
- em = lookup_extent_mapping(em_tree, start, (u64)-1);
- if (!em)
+ struct btrfs_chunk_map *map;
+
+ map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX);
+ if (!map)
break;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++)
if (srcdev == map->stripes[i].dev)
map->stripes[i].dev = tgtdev;
- start = em->start + em->len;
- free_extent_map(em);
+ start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
} while (start);
- write_unlock(&em_tree->lock);
+ write_unlock(&fs_info->mapping_tree_lock);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 62cb97f7c94f..c843563914ca 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,20 +74,37 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
static void csum_tree_block(struct extent_buffer *buf, u8 *result)
{
struct btrfs_fs_info *fs_info = buf->fs_info;
- const int num_pages = num_extent_pages(buf);
- const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+ int num_pages;
+ u32 first_page_part;
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
char *kaddr;
int i;
shash->tfm = fs_info->csum_shash;
crypto_shash_init(shash);
- kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
+
+ if (buf->addr) {
+ /* Pages are contiguous, handle them as a big one. */
+ kaddr = buf->addr;
+ first_page_part = fs_info->nodesize;
+ num_pages = 1;
+ } else {
+ kaddr = folio_address(buf->folios[0]);
+ first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+ num_pages = num_extent_pages(buf);
+ }
+
crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
first_page_part - BTRFS_CSUM_SIZE);
+ /*
+ * Multiple single-page folios case would reach here.
+ *
+ * nodesize <= PAGE_SIZE and large folio all handled by above
+ * crypto_shash_update() already.
+ */
for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
- kaddr = page_address(buf->pages[i]);
+ kaddr = folio_address(buf->folios[i]);
crypto_shash_update(shash, kaddr, PAGE_SIZE);
}
memset(result, 0, BTRFS_CSUM_SIZE);
@@ -166,20 +183,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i, num_pages = num_extent_pages(eb);
+ int num_folios = num_extent_folios(eb);
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
- u64 start = max_t(u64, eb->start, page_offset(p));
- u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE);
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+ u64 start = max_t(u64, eb->start, folio_pos(folio));
+ u64 end = min_t(u64, eb->start + eb->len,
+ folio_pos(folio) + folio_size(folio));
u32 len = end - start;
ret = btrfs_repair_io_failure(fs_info, 0, start, len,
- start, p, offset_in_page(start), mirror_num);
+ start, folio, offset_in_folio(folio, start),
+ mirror_num);
if (ret)
break;
}
@@ -254,15 +273,20 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
return BLK_STS_IOERR;
- if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
- WARN_ON_ONCE(found_start != 0);
+ /*
+ * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
+ * checksum it but zero-out its content. This is done to preserve
+ * ordering of I/O without unnecessarily writing out data.
+ */
+ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
+ memzero_extent_buffer(eb, 0, eb->len);
return BLK_STS_OK;
}
if (WARN_ON_ONCE(found_start != eb->start))
return BLK_STS_IOERR;
- if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start,
- eb->len)))
+ if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
+ eb->start, eb->len)))
return BLK_STS_IOERR;
ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
@@ -371,8 +395,8 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
}
csum_tree_block(eb, result);
- header_csum = page_address(eb->pages[0]) +
- get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
+ header_csum = folio_address(eb->folios[0]) +
+ get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));
if (memcmp(result, header_csum, csum_size) != 0) {
btrfs_warn_rl(fs_info,
@@ -639,7 +663,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
- INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+ /* GFP flags are compatible with XA_FLAGS_*. */
+ xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
btrfs_init_root_block_rsv(root);
@@ -650,14 +675,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
INIT_LIST_HEAD(&root->ordered_extents);
INIT_LIST_HEAD(&root->ordered_root);
INIT_LIST_HEAD(&root->reloc_dirty_list);
- INIT_LIST_HEAD(&root->logged_list[0]);
- INIT_LIST_HEAD(&root->logged_list[1]);
spin_lock_init(&root->inode_lock);
spin_lock_init(&root->delalloc_lock);
spin_lock_init(&root->ordered_extent_lock);
spin_lock_init(&root->accounting_lock);
- spin_lock_init(&root->log_extents_lock[0]);
- spin_lock_init(&root->log_extents_lock[1]);
spin_lock_init(&root->qgroup_meta_rsv_lock);
mutex_init(&root->objectid_mutex);
mutex_init(&root->log_mutex);
@@ -1286,12 +1307,12 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info)
*
* @objectid: root id
* @anon_dev: preallocated anonymous block device number for new roots,
- * pass 0 for new allocation.
+ * pass NULL for a new allocation.
* @check_ref: whether to check root item references, If true, return -ENOENT
* for orphan roots
*/
static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev,
+ u64 objectid, dev_t *anon_dev,
bool check_ref)
{
struct btrfs_root *root;
@@ -1315,8 +1336,17 @@ static struct btrfs_root *btrfs_get_root_ref(struct btrfs_fs_info *fs_info,
again:
root = btrfs_lookup_fs_root(fs_info, objectid);
if (root) {
- /* Shouldn't get preallocated anon_dev for cached roots */
- ASSERT(!anon_dev);
+ /*
+ * Some other caller may have read out the newly inserted
+ * subvolume already (for things like backref walk etc). Not
+ * that common but still possible. In that case, we just need
+ * to free the anon_dev.
+ */
+ if (unlikely(anon_dev && *anon_dev)) {
+ free_anon_bdev(*anon_dev);
+ *anon_dev = 0;
+ }
+
if (check_ref && btrfs_root_refs(&root->root_item) == 0) {
btrfs_put_root(root);
return ERR_PTR(-ENOENT);
@@ -1336,7 +1366,7 @@ again:
goto fail;
}
- ret = btrfs_init_fs_root(root, anon_dev);
+ ret = btrfs_init_fs_root(root, anon_dev ? *anon_dev : 0);
if (ret)
goto fail;
@@ -1372,7 +1402,7 @@ fail:
* root's anon_dev to 0 to avoid a double free, once by btrfs_put_root()
* and once again by our caller.
*/
- if (anon_dev)
+ if (anon_dev && *anon_dev)
root->anon_dev = 0;
btrfs_put_root(root);
return ERR_PTR(ret);
@@ -1388,7 +1418,7 @@ fail:
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref)
{
- return btrfs_get_root_ref(fs_info, objectid, 0, check_ref);
+ return btrfs_get_root_ref(fs_info, objectid, NULL, check_ref);
}
/*
@@ -1396,11 +1426,11 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
* the anonymous block device id
*
* @objectid: tree objectid
- * @anon_dev: if zero, allocate a new anonymous block device or use the
- * parameter value
+ * @anon_dev: if NULL, allocate a new anonymous block device or use the
+ * parameter value if not NULL
*/
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev)
+ u64 objectid, dev_t *anon_dev)
{
return btrfs_get_root_ref(fs_info, objectid, anon_dev, true);
}
@@ -2618,9 +2648,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
*/
btrfs_set_super_log_root(sb, 0);
- /* We can't trust the free space cache either */
- btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
-
btrfs_warn(fs_info, "try to load backup roots slot %d", i);
ret = read_backup_root(fs_info, i);
backup_index = ret;
@@ -2724,7 +2751,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->allocated_ebs);
spin_lock_init(&fs_info->eb_leak_lock);
#endif
- extent_map_tree_init(&fs_info->mapping_tree);
+ fs_info->mapping_tree = RB_ROOT_CACHED;
+ rwlock_init(&fs_info->mapping_tree_lock);
btrfs_init_block_rsv(&fs_info->global_block_rsv,
BTRFS_BLOCK_RSV_GLOBAL);
btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
@@ -2794,6 +2822,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->sectorsize_bits = ilog2(4096);
fs_info->stripesize = 4096;
+ /* Default compress algorithm when user does -o compress */
+ fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;
spin_lock_init(&fs_info->swapfile_pins_lock);
@@ -2931,17 +2962,6 @@ out:
}
/*
- * Some options only have meaning at mount time and shouldn't persist across
- * remounts, or be displayed. Clear these at the end of mount and remount
- * code paths.
- */
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
-{
- btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
- btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
-}
-
-/*
* Mounting logic specific to read-write file systems. Shared by open_ctree
* and btrfs_remount when remounting from read-only to read-write.
*/
@@ -2953,7 +2973,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- rebuild_free_space_tree = true;
+ if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+ btrfs_warn(fs_info,
+ "'clear_cache' option is ignored with extent tree v2");
+ else
+ rebuild_free_space_tree = true;
} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
btrfs_warn(fs_info, "free space tree is invalid");
@@ -3276,13 +3300,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
WRITE_ONCE(fs_info->fs_error, -EUCLEAN);
- /*
- * In the long term, we'll store the compression type in the super
- * block, and it'll be used for per file compression control.
- */
- fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
-
-
/* Set up fs_info before parsing mount options */
nodesize = btrfs_super_nodesize(disk_super);
sectorsize = btrfs_super_sectorsize(disk_super);
@@ -3296,28 +3313,30 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
fs_info->stripesize = stripesize;
- ret = btrfs_parse_options(fs_info, options, sb->s_flags);
- if (ret)
+ /*
+ * Handle the space caching options appropriately now that we have the
+ * super block loaded and validated.
+ */
+ btrfs_set_free_space_cache_settings(fs_info);
+
+ if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) {
+ ret = -EINVAL;
goto fail_alloc;
+ }
ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
if (ret < 0)
goto fail_alloc;
+ /*
+ * At this point our mount options are validated, if we set ->max_inline
+ * to something non-standard make sure we truncate it to sectorsize.
+ */
+ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
+
if (sectorsize < PAGE_SIZE) {
struct btrfs_subpage_info *subpage_info;
- /*
- * V1 space cache has some hardcoded PAGE_SIZE usage, and is
- * going to be deprecated.
- *
- * Force to use v2 cache for subpage case.
- */
- btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
- btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
- "forcing free space tree for sector size %u with page size %lu",
- sectorsize, PAGE_SIZE);
-
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
@@ -3494,29 +3513,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
goto fail_cleaner;
}
- if (!btrfs_test_opt(fs_info, NOSSD) &&
- !fs_info->fs_devices->rotating) {
- btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
- }
-
- /*
- * For devices supporting discard turn on discard=async automatically,
- * unless it's already set or disabled. This could be turned off by
- * nodiscard for the same mount.
- *
- * The zoned mode piggy backs on the discard functionality for
- * resetting a zone. There is no reason to delay the zone reset as it is
- * fast enough. So, do not enable async discard for zoned mode.
- */
- if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
- btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
- btrfs_test_opt(fs_info, NODISCARD)) &&
- fs_info->fs_devices->discardable &&
- !btrfs_is_zoned(fs_info)) {
- btrfs_set_and_info(fs_info, DISCARD_ASYNC,
- "auto enabling async discard");
- }
-
ret = btrfs_read_qgroup_config(fs_info);
if (ret)
goto fail_trans_kthread;
@@ -3542,7 +3538,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
if (sb_rdonly(sb))
- goto clear_oneshot;
+ return 0;
ret = btrfs_start_pre_rw_mount(fs_info);
if (ret) {
@@ -3570,8 +3566,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
wake_up_process(fs_info->cleaner_kthread);
-clear_oneshot:
- btrfs_clear_oneshot_options(fs_info);
return 0;
fail_qgroup:
@@ -3608,7 +3602,7 @@ fail_sb_buffer:
btrfs_stop_all_workers(fs_info);
btrfs_free_block_groups(fs_info);
fail_alloc:
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
iput(fs_info->btree_inode);
fail:
@@ -4391,7 +4385,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
iput(fs_info->btree_inode);
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
btrfs_close_devices(fs_info->fs_devices);
}
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 50dab8f639dc..eb3473d1c1ac 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -37,9 +37,6 @@ struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root,
int level);
-void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
- struct extent_buffer *buf);
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info);
int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info);
int btrfs_check_super_csum(struct btrfs_fs_info *fs_info,
const struct btrfs_super_block *disk_sb);
@@ -64,7 +61,7 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
u64 objectid, bool check_ref);
struct btrfs_root *btrfs_get_new_fs_root(struct btrfs_fs_info *fs_info,
- u64 objectid, dev_t anon_dev);
+ u64 objectid, dev_t *anon_dev);
struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
u64 objectid);
diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c
index ea149be28dff..e3ee5449cc4a 100644
--- a/fs/btrfs/extent-io-tree.c
+++ b/fs/btrfs/extent-io-tree.c
@@ -58,12 +58,13 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree,
u64 start, u64 end)
{
- struct btrfs_inode *inode = tree->inode;
+ const struct btrfs_inode *inode;
u64 isize;
- if (!inode)
+ if (tree->owner != IO_TREE_INODE_IO)
return;
+ inode = extent_io_tree_to_inode_const(tree);
isize = i_size_read(&inode->vfs_inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
btrfs_debug_rl(inode->root->fs_info,
@@ -78,31 +79,46 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
#define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0)
#endif
+
/*
- * For the file_extent_tree, we want to hold the inode lock when we lookup and
- * update the disk_i_size, but lockdep will complain because our io_tree we hold
- * the tree lock and get the inode lock when setting delalloc. These two things
- * are unrelated, so make a class for the file_extent_tree so we don't get the
- * two locking patterns mixed up.
+ * The only tree allowed to set the inode is IO_TREE_INODE_IO.
*/
-static struct lock_class_key file_extent_tree_class;
+static bool is_inode_io_tree(const struct extent_io_tree *tree)
+{
+ return tree->owner == IO_TREE_INODE_IO;
+}
+
+/* Return the inode if it's valid for the given tree, otherwise NULL. */
+struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode;
+ return NULL;
+}
+
+/* Read-only access to the inode. */
+const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode;
+ return NULL;
+}
-struct tree_entry {
- u64 start;
- u64 end;
- struct rb_node rb_node;
-};
+/* For read-only access to fs_info. */
+const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree)
+{
+ if (tree->owner == IO_TREE_INODE_IO)
+ return tree->inode->root->fs_info;
+ return tree->fs_info;
+}
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner)
{
- tree->fs_info = fs_info;
tree->state = RB_ROOT;
spin_lock_init(&tree->lock);
- tree->inode = NULL;
+ tree->fs_info = fs_info;
tree->owner = owner;
- if (owner == IO_TREE_INODE_FILE_EXTENT)
- lockdep_set_class(&tree->lock, &file_extent_tree_class);
}
/*
@@ -329,10 +345,14 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64
return tree_search_for_insert(tree, offset, NULL, NULL);
}
-static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
+static void extent_io_tree_panic(const struct extent_io_tree *tree,
+ const struct extent_state *state,
+ const char *opname,
+ int err)
{
- btrfs_panic(tree->fs_info, err,
- "locking error: extent tree was modified by another thread while locked");
+ btrfs_panic(extent_io_tree_to_fs_info(tree), err,
+ "extent io tree error on %s state start %llu end %llu",
+ opname, state->start, state->end);
}
static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state)
@@ -341,8 +361,9 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s
prev = prev_state(state);
if (prev && prev->end == state->start - 1 && prev->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, prev);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
+ state, prev);
state->start = prev->start;
rb_erase(&prev->rb_node, &tree->state);
RB_CLEAR_NODE(&prev->rb_node);
@@ -356,8 +377,9 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s
next = next_state(state);
if (next && next->start == state->end + 1 && next->state == state->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode, state, next);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree),
+ state, next);
state->end = next->end;
rb_erase(&next->rb_node, &tree->state);
RB_CLEAR_NODE(&next->rb_node);
@@ -390,8 +412,8 @@ static void set_state_bits(struct extent_io_tree *tree,
u32 bits_to_set = bits & ~EXTENT_CTLBITS;
int ret;
- if (tree->inode)
- btrfs_set_delalloc_extent(tree->inode, state, bits);
+ if (is_inode_io_tree(tree))
+ btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits);
ret = add_extent_changeset(state, bits_to_set, changeset, 1);
BUG_ON(ret < 0);
@@ -436,9 +458,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
if (state->end < entry->start) {
if (try_merge && end == entry->start &&
state->state == entry->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode,
- state, entry);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(
+ extent_io_tree_to_inode(tree),
+ state, entry);
entry->start = state->start;
merge_prev_state(tree, entry);
state->state = 0;
@@ -448,9 +471,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
} else if (state->end > entry->end) {
if (try_merge && entry->end == start &&
state->state == entry->state) {
- if (tree->inode)
- btrfs_merge_delalloc_extent(tree->inode,
- state, entry);
+ if (is_inode_io_tree(tree))
+ btrfs_merge_delalloc_extent(
+ extent_io_tree_to_inode(tree),
+ state, entry);
entry->end = state->end;
merge_next_state(tree, entry);
state->state = 0;
@@ -458,9 +482,6 @@ static struct extent_state *insert_state(struct extent_io_tree *tree,
}
node = &(*node)->rb_right;
} else {
- btrfs_err(tree->fs_info,
- "found node %llu %llu on insert of %llu %llu",
- entry->start, entry->end, state->start, state->end);
return ERR_PTR(-EEXIST);
}
}
@@ -505,8 +526,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct rb_node *parent = NULL;
struct rb_node **node;
- if (tree->inode)
- btrfs_split_delalloc_extent(tree->inode, orig, split);
+ if (is_inode_io_tree(tree))
+ btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig,
+ split);
prealloc->start = orig->start;
prealloc->end = split - 1;
@@ -553,8 +575,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
int ret;
- if (tree->inode)
- btrfs_clear_delalloc_extent(tree->inode, state, bits);
+ if (is_inode_io_tree(tree))
+ btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state,
+ bits);
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0);
@@ -695,7 +718,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
prealloc = NULL;
if (err)
@@ -717,7 +740,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
if (wake)
wake_up(&state->wq);
@@ -939,6 +962,8 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state *state;
int ret = 1;
+ ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES));
+
spin_lock(&tree->lock);
state = find_first_extent_bit_state(tree, start, bits);
if (state) {
@@ -1152,7 +1177,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
prealloc = NULL;
if (err)
@@ -1200,7 +1225,7 @@ hit_next:
inserted_state = insert_state(tree, prealloc, bits, changeset);
if (IS_ERR(inserted_state)) {
err = PTR_ERR(inserted_state);
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, prealloc, "insert", err);
}
cache_state(inserted_state, cached_state);
@@ -1228,7 +1253,7 @@ hit_next:
goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
set_state_bits(tree, prealloc, bits, changeset);
cache_state(prealloc, cached_state);
@@ -1382,7 +1407,7 @@ hit_next:
}
err = split_state(tree, state, prealloc, start);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
prealloc = NULL;
if (err)
goto out;
@@ -1430,7 +1455,7 @@ hit_next:
inserted_state = insert_state(tree, prealloc, bits, NULL);
if (IS_ERR(inserted_state)) {
err = PTR_ERR(inserted_state);
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, prealloc, "insert", err);
}
cache_state(inserted_state, cached_state);
if (inserted_state == prealloc)
@@ -1453,7 +1478,7 @@ hit_next:
err = split_state(tree, state, prealloc, end + 1);
if (err)
- extent_io_tree_panic(tree, err);
+ extent_io_tree_panic(tree, state, "split", err);
set_state_bits(tree, prealloc, bits, NULL);
cache_state(prealloc, cached_state);
diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h
index 5602b0137fcd..ebe6390d65e9 100644
--- a/fs/btrfs/extent-io-tree.h
+++ b/fs/btrfs/extent-io-tree.h
@@ -87,9 +87,17 @@ enum {
struct extent_io_tree {
struct rb_root state;
- struct btrfs_fs_info *fs_info;
- /* Inode associated with this tree, or NULL. */
- struct btrfs_inode *inode;
+ /*
+ * The fs_info is needed for trace points, a tree attached to an inode
+ * needs the inode.
+ *
+ * owner == IO_TREE_INODE_IO - then inode is valid and fs_info can be
+ * accessed as inode->root->fs_info
+ */
+ union {
+ struct btrfs_fs_info *fs_info;
+ struct btrfs_inode *inode;
+ };
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
@@ -112,6 +120,10 @@ struct extent_state {
#endif
};
+struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree);
+const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree);
+const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree);
+
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner);
void extent_io_tree_release(struct extent_io_tree *tree);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 01423670bc8a..8e8cc1111277 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1260,7 +1260,8 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
u64 bytes_left, end;
u64 aligned_start = ALIGN(start, 1 << SECTOR_SHIFT);
- if (WARN_ON(start != aligned_start)) {
+ /* Adjust the range to be aligned to 512B sectors if necessary. */
+ if (start != aligned_start) {
len -= aligned_start - start;
len = round_down(len, 1 << SECTOR_SHIFT);
start = aligned_start;
@@ -3447,6 +3448,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_ref generic_ref = { 0 };
+ struct btrfs_block_group *bg;
int ret;
btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
@@ -3460,67 +3462,64 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(ret); /* -ENOMEM */
}
- if (last_ref && btrfs_header_generation(buf) == trans->transid) {
- struct btrfs_block_group *cache;
- bool must_pin = false;
-
- if (root_id != BTRFS_TREE_LOG_OBJECTID) {
- ret = check_ref_cleanup(trans, buf->start);
- if (!ret) {
- btrfs_redirty_list_add(trans->transaction, buf);
- goto out;
- }
- }
+ if (!last_ref)
+ return;
- cache = btrfs_lookup_block_group(fs_info, buf->start);
+ if (btrfs_header_generation(buf) != trans->transid)
+ goto out;
- if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- pin_down_extent(trans, cache, buf->start, buf->len, 1);
- btrfs_put_block_group(cache);
+ if (root_id != BTRFS_TREE_LOG_OBJECTID) {
+ ret = check_ref_cleanup(trans, buf->start);
+ if (!ret)
goto out;
- }
+ }
- /*
- * If there are tree mod log users we may have recorded mod log
- * operations for this node. If we re-allocate this node we
- * could replay operations on this node that happened when it
- * existed in a completely different root. For example if it
- * was part of root A, then was reallocated to root B, and we
- * are doing a btrfs_old_search_slot(root b), we could replay
- * operations that happened when the block was part of root A,
- * giving us an inconsistent view of the btree.
- *
- * We are safe from races here because at this point no other
- * node or root points to this extent buffer, so if after this
- * check a new tree mod log user joins we will not have an
- * existing log of operations on this node that we have to
- * contend with.
- */
- if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
- must_pin = true;
+ bg = btrfs_lookup_block_group(fs_info, buf->start);
- if (must_pin || btrfs_is_zoned(fs_info)) {
- btrfs_redirty_list_add(trans->transaction, buf);
- pin_down_extent(trans, cache, buf->start, buf->len, 1);
- btrfs_put_block_group(cache);
- goto out;
- }
+ if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+ pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ btrfs_put_block_group(bg);
+ goto out;
+ }
- WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+ /*
+ * If there are tree mod log users we may have recorded mod log
+ * operations for this node. If we re-allocate this node we
+ * could replay operations on this node that happened when it
+ * existed in a completely different root. For example if it
+ * was part of root A, then was reallocated to root B, and we
+ * are doing a btrfs_old_search_slot(root b), we could replay
+ * operations that happened when the block was part of root A,
+ * giving us an inconsistent view of the btree.
+ *
+ * We are safe from races here because at this point no other
+ * node or root points to this extent buffer, so if after this
+ * check a new tree mod log user joins we will not have an
+ * existing log of operations on this node that we have to
+ * contend with.
+ */
- btrfs_add_free_space(cache, buf->start, buf->len);
- btrfs_free_reserved_bytes(cache, buf->len, 0);
- btrfs_put_block_group(cache);
- trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
+ if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)
+ || btrfs_is_zoned(fs_info)) {
+ pin_down_extent(trans, bg, buf->start, buf->len, 1);
+ btrfs_put_block_group(bg);
+ goto out;
}
+
+ WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+
+ btrfs_add_free_space(bg, buf->start, buf->len);
+ btrfs_free_reserved_bytes(bg, buf->len, 0);
+ btrfs_put_block_group(bg);
+ trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
+
out:
- if (last_ref) {
- /*
- * Deleting the buffer, clear the corrupt flag since it doesn't
- * matter anymore.
- */
- clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
- }
+
+ /*
+ * Deleting the buffer, clear the corrupt flag since it doesn't
+ * matter anymore.
+ */
+ clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
}
/* Can return -ENOMEM */
@@ -4300,6 +4299,42 @@ static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info,
return 0;
}
+static int prepare_allocation_zoned(struct btrfs_fs_info *fs_info,
+ struct find_free_extent_ctl *ffe_ctl)
+{
+ if (ffe_ctl->for_treelog) {
+ spin_lock(&fs_info->treelog_bg_lock);
+ if (fs_info->treelog_bg)
+ ffe_ctl->hint_byte = fs_info->treelog_bg;
+ spin_unlock(&fs_info->treelog_bg_lock);
+ } else if (ffe_ctl->for_data_reloc) {
+ spin_lock(&fs_info->relocation_bg_lock);
+ if (fs_info->data_reloc_bg)
+ ffe_ctl->hint_byte = fs_info->data_reloc_bg;
+ spin_unlock(&fs_info->relocation_bg_lock);
+ } else if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) {
+ struct btrfs_block_group *block_group;
+
+ spin_lock(&fs_info->zone_active_bgs_lock);
+ list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
+ /*
+ * No lock is OK here because avail is monotinically
+ * decreasing, and this is just a hint.
+ */
+ u64 avail = block_group->zone_capacity - block_group->alloc_offset;
+
+ if (block_group_bits(block_group, ffe_ctl->flags) &&
+ avail >= ffe_ctl->num_bytes) {
+ ffe_ctl->hint_byte = block_group->start;
+ break;
+ }
+ }
+ spin_unlock(&fs_info->zone_active_bgs_lock);
+ }
+
+ return 0;
+}
+
static int prepare_allocation(struct btrfs_fs_info *fs_info,
struct find_free_extent_ctl *ffe_ctl,
struct btrfs_space_info *space_info,
@@ -4310,19 +4345,7 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info,
return prepare_allocation_clustered(fs_info, ffe_ctl,
space_info, ins);
case BTRFS_EXTENT_ALLOC_ZONED:
- if (ffe_ctl->for_treelog) {
- spin_lock(&fs_info->treelog_bg_lock);
- if (fs_info->treelog_bg)
- ffe_ctl->hint_byte = fs_info->treelog_bg;
- spin_unlock(&fs_info->treelog_bg_lock);
- }
- if (ffe_ctl->for_data_reloc) {
- spin_lock(&fs_info->relocation_bg_lock);
- if (fs_info->data_reloc_bg)
- ffe_ctl->hint_byte = fs_info->data_reloc_bg;
- spin_unlock(&fs_info->relocation_bg_lock);
- }
- return 0;
+ return prepare_allocation_zoned(fs_info, ffe_ctl);
default:
BUG();
}
@@ -5061,7 +5084,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
__btrfs_tree_lock(buf, nest);
btrfs_clear_buffer_dirty(trans, buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
- clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags);
+ clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags);
set_extent_buffer_uptodate(buf);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8f724c54fc8e..8b4bef05e222 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -184,22 +184,23 @@ static void process_one_page(struct btrfs_fs_info *fs_info,
struct page *page, struct page *locked_page,
unsigned long page_ops, u64 start, u64 end)
{
+ struct folio *folio = page_folio(page);
u32 len;
ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
len = end + 1 - start;
if (page_ops & PAGE_SET_ORDERED)
- btrfs_page_clamp_set_ordered(fs_info, page, start, len);
+ btrfs_folio_clamp_set_ordered(fs_info, folio, start, len);
if (page_ops & PAGE_START_WRITEBACK) {
- btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
- btrfs_page_clamp_set_writeback(fs_info, page, start, len);
+ btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len);
+ btrfs_folio_clamp_set_writeback(fs_info, folio, start, len);
}
if (page_ops & PAGE_END_WRITEBACK)
- btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
+ btrfs_folio_clamp_clear_writeback(fs_info, folio, start, len);
if (page != locked_page && (page_ops & PAGE_UNLOCK))
- btrfs_page_end_writer_lock(fs_info, page, start, len);
+ btrfs_folio_end_writer_lock(fs_info, folio, start, len);
}
static void __process_pages_contig(struct address_space *mapping,
@@ -271,19 +272,20 @@ static noinline int lock_delalloc_pages(struct inode *inode,
goto out;
for (i = 0; i < found_folios; i++) {
- struct page *page = &fbatch.folios[i]->page;
+ struct folio *folio = fbatch.folios[i];
+ struct page *page = folio_page(folio, 0);
u32 len = end + 1 - start;
if (page == locked_page)
continue;
- if (btrfs_page_start_writer_lock(fs_info, page, start,
- len))
+ if (btrfs_folio_start_writer_lock(fs_info, folio, start,
+ len))
goto out;
if (!PageDirty(page) || page->mapping != mapping) {
- btrfs_page_end_writer_lock(fs_info, page, start,
- len);
+ btrfs_folio_end_writer_lock(fs_info, folio, start,
+ len);
goto out;
}
@@ -432,60 +434,65 @@ static bool btrfs_verify_page(struct page *page, u64 start)
static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct folio *folio = page_folio(page);
ASSERT(page_offset(page) <= start &&
start + len <= page_offset(page) + PAGE_SIZE);
if (uptodate && btrfs_verify_page(page, start))
- btrfs_page_set_uptodate(fs_info, page, start, len);
+ btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
- btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_folio_clear_uptodate(fs_info, folio, start, len);
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, page->mapping))
unlock_page(page);
else
- btrfs_subpage_end_reader(fs_info, page, start, len);
+ btrfs_subpage_end_reader(fs_info, folio, start, len);
}
/*
- * after a writepage IO is done, we need to:
- * clear the uptodate bits on error
- * clear the writeback bits in the extent tree for this IO
- * end_page_writeback if the page has no more pending IO
+ * After a write IO is done, we need to:
+ *
+ * - clear the uptodate bits on error
+ * - clear the writeback bits in the extent tree for the range
+ * - filio_end_writeback() if there is no more pending io for the folio
*
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_writepage(struct btrfs_bio *bbio)
+static void end_bbio_data_write(struct btrfs_bio *bbio)
{
struct bio *bio = &bbio->bio;
int error = blk_status_to_errno(bio->bi_status);
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
- struct inode *inode = page->mapping->host;
+ bio_for_each_folio_all(fi, bio) {
+ struct folio *folio = fi.folio;
+ struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
- u64 start = page_offset(page) + bvec->bv_offset;
- u32 len = bvec->bv_len;
+ u64 start = folio_pos(folio) + fi.offset;
+ u32 len = fi.length;
+
+ /* Only order 0 (single page) folios are allowed for data. */
+ ASSERT(folio_order(folio) == 0);
/* Our read/write should always be sector aligned. */
- if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ if (!IS_ALIGNED(fi.offset, sectorsize))
btrfs_err(fs_info,
- "partial page write in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
+ "partial page write in btrfs with offset %zu and length %zu",
+ fi.offset, fi.length);
+ else if (!IS_ALIGNED(fi.length, sectorsize))
btrfs_info(fs_info,
- "incomplete page write with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
+ "incomplete page write with offset %zu and length %zu",
+ fi.offset, fi.length);
- btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error);
+ btrfs_finish_ordered_extent(bbio->ordered,
+ folio_page(folio, 0), start, len, !error);
if (error)
- mapping_set_error(page->mapping, error);
- btrfs_page_clear_writeback(fs_info, page, start, len);
+ mapping_set_error(folio->mapping, error);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
}
bio_put(bio);
@@ -562,98 +569,102 @@ update:
static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
{
- ASSERT(PageLocked(page));
- if (!btrfs_is_subpage(fs_info, page))
+ struct folio *folio = page_folio(page);
+
+ ASSERT(folio_test_locked(folio));
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page));
- btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
+ ASSERT(folio_test_private(folio));
+ btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE);
}
/*
- * after a readpage IO is done, we need to:
- * clear the uptodate bits on error
- * set the uptodate bits if things worked
- * set the page up to date if all extents in the tree are uptodate
- * clear the lock bit in the extent tree
- * unlock the page if there are no other extents locked for it
+ * After a data read IO is done, we need to:
+ *
+ * - clear the uptodate bits on error
+ * - set the uptodate bits if things worked
+ * - set the folio up to date if all extents in the tree are uptodate
+ * - clear the lock bit in the extent tree
+ * - unlock the folio if there are no other extents locked for it
*
* Scheduling is not allowed, so the extent state tree is expected
* to have one and only one object corresponding to this IO.
*/
-static void end_bio_extent_readpage(struct btrfs_bio *bbio)
+static void end_bbio_data_read(struct btrfs_bio *bbio)
{
struct bio *bio = &bbio->bio;
- struct bio_vec *bvec;
struct processed_extent processed = { 0 };
+ struct folio_iter fi;
/*
* The offset to the beginning of a bio, since one bio can never be
* larger than UINT_MAX, u32 here is enough.
*/
u32 bio_offset = 0;
- struct bvec_iter_all iter_all;
ASSERT(!bio_flagged(bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
bool uptodate = !bio->bi_status;
- struct page *page = bvec->bv_page;
- struct inode *inode = page->mapping->host;
+ struct folio *folio = fi.folio;
+ struct inode *inode = folio->mapping->host;
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
const u32 sectorsize = fs_info->sectorsize;
u64 start;
u64 end;
u32 len;
+ /* For now only order 0 folios are supported for data. */
+ ASSERT(folio_order(folio) == 0);
btrfs_debug(fs_info,
- "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
- bio->bi_iter.bi_sector, bio->bi_status,
+ "%s: bi_sector=%llu, err=%d, mirror=%u",
+ __func__, bio->bi_iter.bi_sector, bio->bi_status,
bbio->mirror_num);
/*
* We always issue full-sector reads, but if some block in a
- * page fails to read, blk_update_request() will advance
+ * folio fails to read, blk_update_request() will advance
* bv_offset and adjust bv_len to compensate. Print a warning
* for unaligned offsets, and an error if they don't add up to
* a full sector.
*/
- if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
+ if (!IS_ALIGNED(fi.offset, sectorsize))
btrfs_err(fs_info,
- "partial page read in btrfs with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
- else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
- sectorsize))
+ "partial page read in btrfs with offset %zu and length %zu",
+ fi.offset, fi.length);
+ else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize))
btrfs_info(fs_info,
- "incomplete page read with offset %u and length %u",
- bvec->bv_offset, bvec->bv_len);
+ "incomplete page read with offset %zu and length %zu",
+ fi.offset, fi.length);
- start = page_offset(page) + bvec->bv_offset;
- end = start + bvec->bv_len - 1;
- len = bvec->bv_len;
+ start = folio_pos(folio) + fi.offset;
+ end = start + fi.length - 1;
+ len = fi.length;
if (likely(uptodate)) {
loff_t i_size = i_size_read(inode);
- pgoff_t end_index = i_size >> PAGE_SHIFT;
+ pgoff_t end_index = i_size >> folio_shift(folio);
/*
* Zero out the remaining part if this range straddles
* i_size.
*
- * Here we should only zero the range inside the bvec,
+ * Here we should only zero the range inside the folio,
* not touch anything else.
*
* NOTE: i_size is exclusive while end is inclusive.
*/
- if (page->index == end_index && i_size <= end) {
- u32 zero_start = max(offset_in_page(i_size),
- offset_in_page(start));
+ if (folio_index(folio) == end_index && i_size <= end) {
+ u32 zero_start = max(offset_in_folio(folio, i_size),
+ offset_in_folio(folio, start));
+ u32 zero_len = offset_in_folio(folio, end) + 1 -
+ zero_start;
- zero_user_segment(page, zero_start,
- offset_in_page(end) + 1);
+ folio_zero_range(folio, zero_start, zero_len);
}
}
/* Update page status and unlock. */
- end_page_read(page, uptodate, start, len);
+ end_page_read(folio_page(folio, 0), uptodate, start, len);
endio_readpage_release_extent(&processed, BTRFS_I(inode),
start, end, uptodate);
@@ -672,19 +683,22 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio)
* @nr_pages: number of pages to allocate
* @page_array: the array to fill with pages; any existing non-null entries in
* the array will be skipped
+ * @extra_gfp: the extra GFP flags for the allocation.
*
* Return: 0 if all pages were able to be allocated;
* -ENOMEM otherwise, the partially allocated pages would be freed and
* the array slots zeroed
*/
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
+ gfp_t extra_gfp)
{
unsigned int allocated;
for (allocated = 0; allocated < nr_pages;) {
unsigned int last = allocated;
- allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
+ allocated = alloc_pages_bulk_array(GFP_NOFS | extra_gfp,
+ nr_pages, page_array);
if (allocated == nr_pages)
return 0;
@@ -707,6 +721,26 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
return 0;
}
+/*
+ * Populate needed folios for the extent buffer.
+ *
+ * For now, the folios populated are always in order 0 (aka, single page).
+ */
+static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp)
+{
+ struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 };
+ int num_pages = num_extent_pages(eb);
+ int ret;
+
+ ret = btrfs_alloc_page_array(num_pages, page_array, extra_gfp);
+ if (ret < 0)
+ return ret;
+
+ for (int i = 0; i < num_pages; i++)
+ eb->folios[i] = page_folio(page_array[i]);
+ return 0;
+}
+
static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl,
struct page *page, u64 disk_bytenr,
unsigned int pg_offset)
@@ -861,9 +895,9 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl,
} while (size);
}
-static int attach_extent_buffer_page(struct extent_buffer *eb,
- struct page *page,
- struct btrfs_subpage *prealloc)
+static int attach_extent_buffer_folio(struct extent_buffer *eb,
+ struct folio *folio,
+ struct btrfs_subpage *prealloc)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
int ret = 0;
@@ -874,65 +908,66 @@ static int attach_extent_buffer_page(struct extent_buffer *eb,
* For cloned or dummy extent buffers, their pages are not mapped and
* will not race with any other ebs.
*/
- if (page->mapping)
- lockdep_assert_held(&page->mapping->private_lock);
+ if (folio->mapping)
+ lockdep_assert_held(&folio->mapping->i_private_lock);
if (fs_info->nodesize >= PAGE_SIZE) {
- if (!PagePrivate(page))
- attach_page_private(page, eb);
+ if (!folio_test_private(folio))
+ folio_attach_private(folio, eb);
else
- WARN_ON(page->private != (unsigned long)eb);
+ WARN_ON(folio_get_private(folio) != eb);
return 0;
}
/* Already mapped, just free prealloc */
- if (PagePrivate(page)) {
+ if (folio_test_private(folio)) {
btrfs_free_subpage(prealloc);
return 0;
}
if (prealloc)
/* Has preallocated memory for subpage */
- attach_page_private(page, prealloc);
+ folio_attach_private(folio, prealloc);
else
/* Do new allocation to attach subpage */
- ret = btrfs_attach_subpage(fs_info, page,
- BTRFS_SUBPAGE_METADATA);
+ ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA);
return ret;
}
int set_page_extent_mapped(struct page *page)
{
+ struct folio *folio = page_folio(page);
struct btrfs_fs_info *fs_info;
ASSERT(page->mapping);
- if (PagePrivate(page))
+ if (folio_test_private(folio))
return 0;
fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (btrfs_is_subpage(fs_info, page))
- return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
+ if (btrfs_is_subpage(fs_info, page->mapping))
+ return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA);
- attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
+ folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE);
return 0;
}
void clear_page_extent_mapped(struct page *page)
{
+ struct folio *folio = page_folio(page);
struct btrfs_fs_info *fs_info;
ASSERT(page->mapping);
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return;
fs_info = btrfs_sb(page->mapping->host->i_sb);
- if (btrfs_is_subpage(fs_info, page))
- return btrfs_detach_subpage(fs_info, page);
+ if (btrfs_is_subpage(fs_info, page->mapping))
+ return btrfs_detach_subpage(fs_info, folio);
- detach_page_private(page);
+ folio_detach_private(folio);
}
static struct extent_map *
@@ -1001,7 +1036,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
memzero_page(page, zero_offset, iosize);
}
}
- bio_ctrl->end_io_func = end_bio_extent_readpage;
+ bio_ctrl->end_io_func = end_bbio_data_read;
begin_page_read(fs_info, page);
while (cur <= end) {
enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE;
@@ -1027,8 +1062,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
BUG_ON(extent_map_end(em) <= cur);
BUG_ON(end < cur);
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
- compress_type = em->compress_type;
+ compress_type = extent_map_compression(em);
iosize = min(extent_map_end(em) - cur, end - cur + 1);
iosize = ALIGN(iosize, blocksize);
@@ -1037,7 +1071,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
else
disk_bytenr = em->block_start + extent_offset;
block_start = em->block_start;
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
block_start = EXTENT_MAP_HOLE;
/*
@@ -1074,7 +1108,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
* is a corner case so we prioritize correctness over
* non-optimal behavior (submitting 2 bios for the same extent).
*/
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+ if (compress_type != BTRFS_COMPRESS_NONE &&
prev_em_start && *prev_em_start != (u64)-1 &&
*prev_em_start != em->start)
force_bio_submit = true;
@@ -1240,7 +1274,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
struct page *page, u64 *start, u64 *end)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct folio *folio = page_folio(page);
+ struct btrfs_subpage *subpage = folio_get_private(folio);
struct btrfs_subpage_info *spi = fs_info->subpage_info;
u64 orig_start = *start;
/* Declare as unsigned long so we can use bitmap ops */
@@ -1252,7 +1287,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
* For regular sector size == page size case, since one page only
* contains one sector, we return the page offset directly.
*/
- if (!btrfs_is_subpage(fs_info, page)) {
+ if (!btrfs_is_subpage(fs_info, page->mapping)) {
*start = page_offset(page);
*end = page_offset(page) + PAGE_SIZE;
return;
@@ -1305,7 +1340,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
return 1;
}
- bio_ctrl->end_io_func = end_bio_extent_writepage;
+ bio_ctrl->end_io_func = end_bbio_data_write;
while (cur <= end) {
u32 len = end - cur + 1;
u64 disk_bytenr;
@@ -1325,7 +1360,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* writeback the sectors with subpage dirty bits,
* causing writeback without ordered extent.
*/
- btrfs_page_clear_dirty(fs_info, page, cur, len);
+ btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len);
break;
}
@@ -1352,7 +1387,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
block_start = em->block_start;
disk_bytenr = em->block_start + extent_offset;
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(!extent_map_is_compressed(em));
ASSERT(block_start != EXTENT_MAP_HOLE);
ASSERT(block_start != EXTENT_MAP_INLINE);
@@ -1377,7 +1412,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
* So clear subpage dirty bit here so next time we won't submit
* page for range already written to disk.
*/
- btrfs_page_clear_dirty(fs_info, page, cur, iosize);
+ btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize);
submit_extent_page(bio_ctrl, disk_bytenr, page, iosize,
cur - page_offset(page));
@@ -1385,7 +1420,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
nr++;
}
- btrfs_page_assert_not_dirty(fs_info, page);
+ btrfs_folio_assert_not_dirty(fs_info, page_folio(page));
*nr_ret = nr;
return 0;
@@ -1607,24 +1642,23 @@ static struct extent_buffer *find_extent_buffer_nolock(
return NULL;
}
-static void extent_buffer_write_end_io(struct btrfs_bio *bbio)
+static void end_bbio_meta_write(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ struct folio_iter fi;
u32 bio_offset = 0;
if (!uptodate)
set_btree_ioerr(eb);
- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
u64 start = eb->start + bio_offset;
- struct page *page = bvec->bv_page;
- u32 len = bvec->bv_len;
+ struct folio *folio = fi.folio;
+ u32 len = fi.length;
- btrfs_page_clear_writeback(fs_info, page, start, len);
+ btrfs_folio_clear_writeback(fs_info, folio, start, len);
bio_offset += len;
}
@@ -1673,36 +1707,44 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc),
- eb->fs_info, extent_buffer_write_end_io, eb);
+ eb->fs_info, end_bbio_meta_write, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev);
wbc_init_bio(wbc, &bbio->bio);
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
if (fs_info->nodesize < PAGE_SIZE) {
- struct page *p = eb->pages[0];
+ struct folio *folio = eb->folios[0];
+ bool ret;
- lock_page(p);
- btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len);
- if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start,
+ folio_lock(folio);
+ btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len);
+ if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start,
eb->len)) {
- clear_page_dirty_for_io(p);
+ folio_clear_dirty_for_io(folio);
wbc->nr_to_write--;
}
- __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p));
- wbc_account_cgroup_owner(wbc, p, eb->len);
- unlock_page(p);
+ ret = bio_add_folio(&bbio->bio, folio, eb->len,
+ eb->start - folio_pos(folio));
+ ASSERT(ret);
+ wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len);
+ folio_unlock(folio);
} else {
- for (int i = 0; i < num_extent_pages(eb); i++) {
- struct page *p = eb->pages[i];
-
- lock_page(p);
- clear_page_dirty_for_io(p);
- set_page_writeback(p);
- __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0);
- wbc_account_cgroup_owner(wbc, p, PAGE_SIZE);
- wbc->nr_to_write--;
- unlock_page(p);
+ int num_folios = num_extent_folios(eb);
+
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+ bool ret;
+
+ folio_lock(folio);
+ folio_clear_dirty_for_io(folio);
+ folio_start_writeback(folio);
+ ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0);
+ ASSERT(ret);
+ wbc_account_cgroup_owner(wbc, folio_page(folio, 0),
+ folio_size(folio));
+ wbc->nr_to_write -= folio_nr_pages(folio);
+ folio_unlock(folio);
}
}
btrfs_submit_bio(bbio, 0);
@@ -1725,6 +1767,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb,
static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct folio *folio = page_folio(page);
int submitted = 0;
u64 page_start = page_offset(page);
int bit_start = 0;
@@ -1732,7 +1775,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
/* Lock and write each dirty extent buffers in the range */
while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
struct extent_buffer *eb;
unsigned long flags;
u64 start;
@@ -1741,16 +1784,16 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
* Take private lock to ensure the subpage won't be detached
* in the meantime.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&page->mapping->private_lock);
+ spin_lock(&page->mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&page->mapping->i_private_lock);
break;
}
spin_lock_irqsave(&subpage->lock, flags);
if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
subpage->bitmaps)) {
spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
bit_start++;
continue;
}
@@ -1764,7 +1807,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc)
*/
eb = find_extent_buffer_nolock(fs_info, start);
spin_unlock_irqrestore(&subpage->lock, flags);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
/*
* The eb has already reached 0 refs thus find_extent_buffer()
@@ -1807,38 +1850,39 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx)
{
struct writeback_control *wbc = ctx->wbc;
struct address_space *mapping = page->mapping;
+ struct folio *folio = page_folio(page);
struct extent_buffer *eb;
int ret;
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return 0;
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return submit_eb_subpage(page, wbc);
- spin_lock(&mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
/*
* Shouldn't happen and normally this would be a BUG_ON but no point
* crashing the machine for something we can survive anyway.
*/
if (WARN_ON(!eb)) {
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
if (eb == ctx->eb) {
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return 0;
}
ret = atomic_inc_not_zero(&eb->refs);
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
if (!ret)
return 0;
@@ -2201,7 +2245,7 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page,
cur, cur_len, !ret);
mapping_set_error(page->mapping, ret);
}
- btrfs_page_unlock_writer(fs_info, page, cur, cur_len);
+ btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len);
if (ret < 0)
found_error = true;
next_page:
@@ -2352,7 +2396,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
write_unlock(&map->lock);
break;
}
- if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
+ if ((em->flags & EXTENT_FLAG_PINNED) ||
em->start != start) {
write_unlock(&map->lock);
free_extent_map(em);
@@ -2369,7 +2413,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
* extra reference on the em.
*/
if (list_empty(&em->list) ||
- test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ (em->flags & EXTENT_FLAG_LOGGING))
goto remove_em;
/*
* If it's in the list of modified extents, remove it
@@ -2436,6 +2480,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
struct fiemap_cache *cache,
u64 offset, u64 phys, u64 len, u32 flags)
{
+ u64 cache_end;
int ret = 0;
/* Set at the end of extent_fiemap(). */
@@ -2445,15 +2490,102 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
goto assign;
/*
- * Sanity check, extent_fiemap() should have ensured that new
- * fiemap extent won't overlap with cached one.
- * Not recoverable.
+ * When iterating the extents of the inode, at extent_fiemap(), we may
+ * find an extent that starts at an offset behind the end offset of the
+ * previous extent we processed. This happens if fiemap is called
+ * without FIEMAP_FLAG_SYNC and there are ordered extents completing
+ * while we call btrfs_next_leaf() (through fiemap_next_leaf_item()).
*
- * NOTE: Physical address can overlap, due to compression
+ * For example we are in leaf X processing its last item, which is the
+ * file extent item for file range [512K, 1M[, and after
+ * btrfs_next_leaf() releases the path, there's an ordered extent that
+ * completes for the file range [768K, 2M[, and that results in trimming
+ * the file extent item so that it now corresponds to the file range
+ * [512K, 768K[ and a new file extent item is inserted for the file
+ * range [768K, 2M[, which may end up as the last item of leaf X or as
+ * the first item of the next leaf - in either case btrfs_next_leaf()
+ * will leave us with a path pointing to the new extent item, for the
+ * file range [768K, 2M[, since that's the first key that follows the
+ * last one we processed. So in order not to report overlapping extents
+ * to user space, we trim the length of the previously cached extent and
+ * emit it.
+ *
+ * Upon calling btrfs_next_leaf() we may also find an extent with an
+ * offset smaller than or equals to cache->offset, and this happens
+ * when we had a hole or prealloc extent with several delalloc ranges in
+ * it, but after btrfs_next_leaf() released the path, delalloc was
+ * flushed and the resulting ordered extents were completed, so we can
+ * now have found a file extent item for an offset that is smaller than
+ * or equals to what we have in cache->offset. We deal with this as
+ * described below.
*/
- if (cache->offset + cache->len > offset) {
- WARN_ON(1);
- return -EINVAL;
+ cache_end = cache->offset + cache->len;
+ if (cache_end > offset) {
+ if (offset == cache->offset) {
+ /*
+ * We cached a dealloc range (found in the io tree) for
+ * a hole or prealloc extent and we have now found a
+ * file extent item for the same offset. What we have
+ * now is more recent and up to date, so discard what
+ * we had in the cache and use what we have just found.
+ */
+ goto assign;
+ } else if (offset > cache->offset) {
+ /*
+ * The extent range we previously found ends after the
+ * offset of the file extent item we found and that
+ * offset falls somewhere in the middle of that previous
+ * extent range. So adjust the range we previously found
+ * to end at the offset of the file extent item we have
+ * just found, since this extent is more up to date.
+ * Emit that adjusted range and cache the file extent
+ * item we have just found. This corresponds to the case
+ * where a previously found file extent item was split
+ * due to an ordered extent completing.
+ */
+ cache->len = offset - cache->offset;
+ goto emit;
+ } else {
+ const u64 range_end = offset + len;
+
+ /*
+ * The offset of the file extent item we have just found
+ * is behind the cached offset. This means we were
+ * processing a hole or prealloc extent for which we
+ * have found delalloc ranges (in the io tree), so what
+ * we have in the cache is the last delalloc range we
+ * found while the file extent item we found can be
+ * either for a whole delalloc range we previously
+ * emmitted or only a part of that range.
+ *
+ * We have two cases here:
+ *
+ * 1) The file extent item's range ends at or behind the
+ * cached extent's end. In this case just ignore the
+ * current file extent item because we don't want to
+ * overlap with previous ranges that may have been
+ * emmitted already;
+ *
+ * 2) The file extent item starts behind the currently
+ * cached extent but its end offset goes beyond the
+ * end offset of the cached extent. We don't want to
+ * overlap with a previous range that may have been
+ * emmitted already, so we emit the currently cached
+ * extent and then partially store the current file
+ * extent item's range in the cache, for the subrange
+ * going the cached extent's end to the end of the
+ * file extent item.
+ */
+ if (range_end <= cache_end)
+ return 0;
+
+ if (!(flags & (FIEMAP_EXTENT_ENCODED | FIEMAP_EXTENT_DELALLOC)))
+ phys += cache_end - offset;
+
+ offset = cache_end;
+ len = range_end - cache_end;
+ goto emit;
+ }
}
/*
@@ -2473,6 +2605,7 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
return 0;
}
+emit:
/* Not mergeable, need to submit cached one */
ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
cache->len, cache->flags);
@@ -2645,16 +2778,34 @@ static int fiemap_process_hole(struct btrfs_inode *inode,
* it beyond i_size.
*/
while (cur_offset < end && cur_offset < i_size) {
+ struct extent_state *cached_state = NULL;
u64 delalloc_start;
u64 delalloc_end;
u64 prealloc_start;
+ u64 lockstart;
+ u64 lockend;
u64 prealloc_len = 0;
bool delalloc;
+ lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize);
+ lockend = round_up(end, inode->root->fs_info->sectorsize);
+
+ /*
+ * We are only locking for the delalloc range because that's the
+ * only thing that can change here. With fiemap we have a lock
+ * on the inode, so no buffered or direct writes can happen.
+ *
+ * However mmaps and normal page writeback will cause this to
+ * change arbitrarily. We have to lock the extent lock here to
+ * make sure that nobody messes with the tree while we're doing
+ * btrfs_find_delalloc_in_range.
+ */
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end,
delalloc_cached_state,
&delalloc_start,
&delalloc_end);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
if (!delalloc)
break;
@@ -2822,15 +2973,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
const u64 ino = btrfs_ino(inode);
- struct extent_state *cached_state = NULL;
struct extent_state *delalloc_cached_state = NULL;
struct btrfs_path *path;
struct fiemap_cache cache = { 0 };
struct btrfs_backref_share_check_ctx *backref_ctx;
u64 last_extent_end;
u64 prev_extent_end;
- u64 lockstart;
- u64 lockend;
+ u64 range_start;
+ u64 range_end;
+ const u64 sectorsize = inode->root->fs_info->sectorsize;
bool stopped = false;
int ret;
@@ -2841,22 +2992,19 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto out;
}
- lockstart = round_down(start, inode->root->fs_info->sectorsize);
- lockend = round_up(start + len, inode->root->fs_info->sectorsize);
- prev_extent_end = lockstart;
-
- btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
+ range_start = round_down(start, sectorsize);
+ range_end = round_up(start + len, sectorsize);
+ prev_extent_end = range_start;
ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end);
if (ret < 0)
- goto out_unlock;
+ goto out;
btrfs_release_path(path);
path->reada = READA_FORWARD;
- ret = fiemap_search_slot(inode, path, lockstart);
+ ret = fiemap_search_slot(inode, path, range_start);
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/*
* No file extent item found, but we may have delalloc between
@@ -2866,7 +3014,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto check_eof_delalloc;
}
- while (prev_extent_end < lockend) {
+ while (prev_extent_end < range_end) {
struct extent_buffer *leaf = path->nodes[0];
struct btrfs_file_extent_item *ei;
struct btrfs_key key;
@@ -2889,21 +3037,21 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
* The first iteration can leave us at an extent item that ends
* before our range's start. Move to the next item.
*/
- if (extent_end <= lockstart)
+ if (extent_end <= range_start)
goto next_item;
backref_ctx->curr_leaf_bytenr = leaf->start;
/* We have in implicit hole (NO_HOLES feature enabled). */
if (prev_extent_end < key.offset) {
- const u64 range_end = min(key.offset, lockend) - 1;
+ const u64 hole_end = min(key.offset, range_end) - 1;
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state,
backref_ctx, 0, 0, 0,
- prev_extent_end, range_end);
+ prev_extent_end, hole_end);
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/* fiemap_fill_next_extent() told us to stop. */
stopped = true;
@@ -2911,7 +3059,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
}
/* We've reached the end of the fiemap range, stop. */
- if (key.offset >= lockend) {
+ if (key.offset >= range_end) {
stopped = true;
break;
}
@@ -2959,7 +3107,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
extent_gen,
backref_ctx);
if (ret < 0)
- goto out_unlock;
+ goto out;
else if (ret > 0)
flags |= FIEMAP_EXTENT_SHARED;
}
@@ -2970,7 +3118,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
}
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/* fiemap_fill_next_extent() told us to stop. */
stopped = true;
@@ -2981,12 +3129,12 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
next_item:
if (fatal_signal_pending(current)) {
ret = -EINTR;
- goto out_unlock;
+ goto out;
}
ret = fiemap_next_leaf_item(inode, path);
if (ret < 0) {
- goto out_unlock;
+ goto out;
} else if (ret > 0) {
/* No more file extent items for this inode. */
break;
@@ -3005,29 +3153,41 @@ check_eof_delalloc:
btrfs_free_path(path);
path = NULL;
- if (!stopped && prev_extent_end < lockend) {
+ if (!stopped && prev_extent_end < range_end) {
ret = fiemap_process_hole(inode, fieinfo, &cache,
&delalloc_cached_state, backref_ctx,
- 0, 0, 0, prev_extent_end, lockend - 1);
+ 0, 0, 0, prev_extent_end, range_end - 1);
if (ret < 0)
- goto out_unlock;
- prev_extent_end = lockend;
+ goto out;
+ prev_extent_end = range_end;
}
if (cache.cached && cache.offset + cache.len >= last_extent_end) {
const u64 i_size = i_size_read(&inode->vfs_inode);
if (prev_extent_end < i_size) {
+ struct extent_state *cached_state = NULL;
u64 delalloc_start;
u64 delalloc_end;
+ u64 lockstart;
+ u64 lockend;
bool delalloc;
+ lockstart = round_down(prev_extent_end, sectorsize);
+ lockend = round_up(i_size, sectorsize);
+
+ /*
+ * See the comment in fiemap_process_hole as to why
+ * we're doing the locking here.
+ */
+ lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
delalloc = btrfs_find_delalloc_in_range(inode,
prev_extent_end,
i_size - 1,
&delalloc_cached_state,
&delalloc_start,
&delalloc_end);
+ unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
if (!delalloc)
cache.flags |= FIEMAP_EXTENT_LAST;
} else {
@@ -3036,10 +3196,6 @@ check_eof_delalloc:
}
ret = emit_last_fiemap_cache(fieinfo, &cache);
-
-out_unlock:
- unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
- btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
out:
free_extent_state(delalloc_cached_state);
btrfs_free_backref_share_ctx(backref_ctx);
@@ -3058,14 +3214,14 @@ static int extent_buffer_under_io(const struct extent_buffer *eb)
test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
}
-static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
+static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- lockdep_assert_held(&page->mapping->private_lock);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- if (PagePrivate(page)) {
- subpage = (struct btrfs_subpage *)page->private;
+ if (folio_test_private(folio)) {
+ subpage = folio_get_private(folio);
if (atomic_read(&subpage->eb_refs))
return true;
/*
@@ -3078,21 +3234,21 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
return false;
}
-static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
+static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
/*
- * For mapped eb, we're going to change the page private, which should
- * be done under the private_lock.
+ * For mapped eb, we're going to change the folio private, which should
+ * be done under the i_private_lock.
*/
if (mapped)
- spin_lock(&page->mapping->private_lock);
+ spin_lock(&folio->mapping->i_private_lock);
- if (!PagePrivate(page)) {
+ if (!folio_test_private(folio)) {
if (mapped)
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return;
}
@@ -3101,66 +3257,58 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag
* We do this since we'll remove the pages after we've
* removed the eb from the radix tree, so we could race
* and have this page now attached to the new eb. So
- * only clear page_private if it's still connected to
+ * only clear folio if it's still connected to
* this eb.
*/
- if (PagePrivate(page) &&
- page->private == (unsigned long)eb) {
+ if (folio_test_private(folio) && folio_get_private(folio) == eb) {
BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
- BUG_ON(PageDirty(page));
- BUG_ON(PageWriteback(page));
- /*
- * We need to make sure we haven't be attached
- * to a new eb.
- */
- detach_page_private(page);
+ BUG_ON(folio_test_dirty(folio));
+ BUG_ON(folio_test_writeback(folio));
+ /* We need to make sure we haven't be attached to a new eb. */
+ folio_detach_private(folio);
}
if (mapped)
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return;
}
/*
- * For subpage, we can have dummy eb with page private. In this case,
- * we can directly detach the private as such page is only attached to
- * one dummy eb, no sharing.
+ * For subpage, we can have dummy eb with folio private attached. In
+ * this case, we can directly detach the private as such folio is only
+ * attached to one dummy eb, no sharing.
*/
if (!mapped) {
- btrfs_detach_subpage(fs_info, page);
+ btrfs_detach_subpage(fs_info, folio);
return;
}
- btrfs_page_dec_eb_refs(fs_info, page);
+ btrfs_folio_dec_eb_refs(fs_info, folio);
/*
- * We can only detach the page private if there are no other ebs in the
+ * We can only detach the folio private if there are no other ebs in the
* page range and no unfinished IO.
*/
- if (!page_range_has_eb(fs_info, page))
- btrfs_detach_subpage(fs_info, page);
+ if (!folio_range_has_eb(fs_info, folio))
+ btrfs_detach_subpage(fs_info, folio);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
}
/* Release all pages attached to the extent buffer */
static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
{
- int i;
- int num_pages;
-
ASSERT(!extent_buffer_under_io(eb));
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *page = eb->pages[i];
+ for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) {
+ struct folio *folio = eb->folios[i];
- if (!page)
+ if (!folio)
continue;
- detach_extent_buffer_page(eb, page);
+ detach_extent_buffer_folio(eb, folio);
- /* One for when we allocated the page */
- put_page(page);
+ /* One for when we allocated the folio. */
+ folio_put(folio);
}
}
@@ -3198,9 +3346,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
{
- int i;
struct extent_buffer *new;
- int num_pages = num_extent_pages(src);
+ int num_folios = num_extent_folios(src);
int ret;
new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
@@ -3214,22 +3361,22 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
*/
set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
- ret = btrfs_alloc_page_array(num_pages, new->pages);
+ ret = alloc_eb_folio_array(new, 0);
if (ret) {
btrfs_release_extent_buffer(new);
return NULL;
}
- for (i = 0; i < num_pages; i++) {
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = new->folios[i];
int ret;
- struct page *p = new->pages[i];
- ret = attach_extent_buffer_page(new, p, NULL);
+ ret = attach_extent_buffer_folio(new, folio, NULL);
if (ret < 0) {
btrfs_release_extent_buffer(new);
return NULL;
}
- WARN_ON(PageDirty(p));
+ WARN_ON(folio_test_dirty(folio));
}
copy_extent_buffer_full(new, src);
set_extent_buffer_uptodate(new);
@@ -3241,23 +3388,20 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, unsigned long len)
{
struct extent_buffer *eb;
- int num_pages;
- int i;
+ int num_folios = 0;
int ret;
eb = __alloc_extent_buffer(fs_info, start, len);
if (!eb)
return NULL;
- num_pages = num_extent_pages(eb);
- ret = btrfs_alloc_page_array(num_pages, eb->pages);
+ ret = alloc_eb_folio_array(eb, 0);
if (ret)
goto err;
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- ret = attach_extent_buffer_page(eb, p, NULL);
+ num_folios = num_extent_folios(eb);
+ for (int i = 0; i < num_folios; i++) {
+ ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL);
if (ret < 0)
goto err;
}
@@ -3268,10 +3412,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
return eb;
err:
- for (i = 0; i < num_pages; i++) {
- if (eb->pages[i]) {
- detach_extent_buffer_page(eb, eb->pages[i]);
- __free_page(eb->pages[i]);
+ for (int i = 0; i < num_folios; i++) {
+ if (eb->folios[i]) {
+ detach_extent_buffer_folio(eb, eb->folios[i]);
+ __folio_put(eb->folios[i]);
}
}
__free_extent_buffer(eb);
@@ -3320,20 +3464,14 @@ static void check_buffer_tree_ref(struct extent_buffer *eb)
spin_unlock(&eb->refs_lock);
}
-static void mark_extent_buffer_accessed(struct extent_buffer *eb,
- struct page *accessed)
+static void mark_extent_buffer_accessed(struct extent_buffer *eb)
{
- int num_pages, i;
+ int num_folios= num_extent_folios(eb);
check_buffer_tree_ref(eb);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- struct page *p = eb->pages[i];
-
- if (p != accessed)
- mark_page_accessed(p);
- }
+ for (int i = 0; i < num_folios; i++)
+ folio_mark_accessed(eb->folios[i]);
}
struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
@@ -3361,7 +3499,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
spin_lock(&eb->refs_lock);
spin_unlock(&eb->refs_lock);
}
- mark_extent_buffer_accessed(eb, NULL);
+ mark_extent_buffer_accessed(eb);
return eb;
}
@@ -3410,6 +3548,7 @@ free_eb:
static struct extent_buffer *grab_extent_buffer(
struct btrfs_fs_info *fs_info, struct page *page)
{
+ struct folio *folio = page_folio(page);
struct extent_buffer *exists;
/*
@@ -3421,21 +3560,21 @@ static struct extent_buffer *grab_extent_buffer(
return NULL;
/* Page not yet attached to an extent buffer */
- if (!PagePrivate(page))
+ if (!folio_test_private(folio))
return NULL;
/*
* We could have already allocated an eb for this page and attached one
* so lets see if we can get a ref on the existing eb, and if we can we
* know it's good and we can just return that one, else we know we can
- * just overwrite page->private.
+ * just overwrite folio private.
*/
- exists = (struct extent_buffer *)page->private;
+ exists = folio_get_private(folio);
if (atomic_inc_not_zero(&exists->refs))
return exists;
WARN_ON(PageDirty(page));
- detach_page_private(page);
+ folio_detach_private(folio);
return NULL;
}
@@ -3469,19 +3608,88 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
return 0;
}
+
+/*
+ * Return 0 if eb->folios[i] is attached to btree inode successfully.
+ * Return >0 if there is already another extent buffer for the range,
+ * and @found_eb_ret would be updated.
+ * Return -EAGAIN if the filemap has an existing folio but with different size
+ * than @eb.
+ * The caller needs to free the existing folios and retry using the same order.
+ */
+static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i,
+ struct extent_buffer **found_eb_ret)
+{
+
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct address_space *mapping = fs_info->btree_inode->i_mapping;
+ const unsigned long index = eb->start >> PAGE_SHIFT;
+ struct folio *existing_folio;
+ int ret;
+
+ ASSERT(found_eb_ret);
+
+ /* Caller should ensure the folio exists. */
+ ASSERT(eb->folios[i]);
+
+retry:
+ ret = filemap_add_folio(mapping, eb->folios[i], index + i,
+ GFP_NOFS | __GFP_NOFAIL);
+ if (!ret)
+ return 0;
+
+ existing_folio = filemap_lock_folio(mapping, index + i);
+ /* The page cache only exists for a very short time, just retry. */
+ if (IS_ERR(existing_folio))
+ goto retry;
+
+ /* For now, we should only have single-page folios for btree inode. */
+ ASSERT(folio_nr_pages(existing_folio) == 1);
+
+ if (folio_size(existing_folio) != folio_size(eb->folios[0])) {
+ folio_unlock(existing_folio);
+ folio_put(existing_folio);
+ return -EAGAIN;
+ }
+
+ if (fs_info->nodesize < PAGE_SIZE) {
+ /*
+ * We're going to reuse the existing page, can drop our page
+ * and subpage structure now.
+ */
+ __free_page(folio_page(eb->folios[i], 0));
+ eb->folios[i] = existing_folio;
+ } else {
+ struct extent_buffer *existing_eb;
+
+ existing_eb = grab_extent_buffer(fs_info,
+ folio_page(existing_folio, 0));
+ if (existing_eb) {
+ /* The extent buffer still exists, we can use it directly. */
+ *found_eb_ret = existing_eb;
+ folio_unlock(existing_folio);
+ folio_put(existing_folio);
+ return 1;
+ }
+ /* The extent buffer no longer exists, we can reuse the folio. */
+ __free_page(folio_page(eb->folios[i], 0));
+ eb->folios[i] = existing_folio;
+ }
+ return 0;
+}
+
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
u64 start, u64 owner_root, int level)
{
unsigned long len = fs_info->nodesize;
- int num_pages;
- int i;
- unsigned long index = start >> PAGE_SHIFT;
+ int num_folios;
+ int attached = 0;
struct extent_buffer *eb;
- struct extent_buffer *exists = NULL;
- struct page *p;
+ struct extent_buffer *existing_eb = NULL;
struct address_space *mapping = fs_info->btree_inode->i_mapping;
struct btrfs_subpage *prealloc = NULL;
u64 lockdep_owner = owner_root;
+ bool page_contig = true;
int uptodate = 1;
int ret;
@@ -3516,11 +3724,9 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);
- num_pages = num_extent_pages(eb);
-
/*
- * Preallocate page->private for subpage case, so that we won't
- * allocate memory with private_lock nor page lock hold.
+ * Preallocate folio private for subpage case, so that we won't
+ * allocate memory with i_private_lock nor page lock hold.
*
* The memory will be freed by attach_extent_buffer_page() or freed
* manually if we exit earlier.
@@ -3528,47 +3734,89 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
if (fs_info->nodesize < PAGE_SIZE) {
prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
if (IS_ERR(prealloc)) {
- exists = ERR_CAST(prealloc);
- goto free_eb;
+ ret = PTR_ERR(prealloc);
+ goto out;
}
}
- for (i = 0; i < num_pages; i++, index++) {
- p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
- if (!p) {
- exists = ERR_PTR(-ENOMEM);
- btrfs_free_subpage(prealloc);
- goto free_eb;
+reallocate:
+ /* Allocate all pages first. */
+ ret = alloc_eb_folio_array(eb, __GFP_NOFAIL);
+ if (ret < 0) {
+ btrfs_free_subpage(prealloc);
+ goto out;
+ }
+
+ num_folios = num_extent_folios(eb);
+ /* Attach all pages to the filemap. */
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio;
+
+ ret = attach_eb_folio_to_filemap(eb, i, &existing_eb);
+ if (ret > 0) {
+ ASSERT(existing_eb);
+ goto out;
}
- spin_lock(&mapping->private_lock);
- exists = grab_extent_buffer(fs_info, p);
- if (exists) {
- spin_unlock(&mapping->private_lock);
- unlock_page(p);
- put_page(p);
- mark_extent_buffer_accessed(exists, p);
- btrfs_free_subpage(prealloc);
- goto free_eb;
+ /*
+ * TODO: Special handling for a corner case where the order of
+ * folios mismatch between the new eb and filemap.
+ *
+ * This happens when:
+ *
+ * - the new eb is using higher order folio
+ *
+ * - the filemap is still using 0-order folios for the range
+ * This can happen at the previous eb allocation, and we don't
+ * have higher order folio for the call.
+ *
+ * - the existing eb has already been freed
+ *
+ * In this case, we have to free the existing folios first, and
+ * re-allocate using the same order.
+ * Thankfully this is not going to happen yet, as we're still
+ * using 0-order folios.
+ */
+ if (unlikely(ret == -EAGAIN)) {
+ ASSERT(0);
+ goto reallocate;
}
+ attached++;
+
+ /*
+ * Only after attach_eb_folio_to_filemap(), eb->folios[] is
+ * reliable, as we may choose to reuse the existing page cache
+ * and free the allocated page.
+ */
+ folio = eb->folios[i];
+ spin_lock(&mapping->i_private_lock);
/* Should not fail, as we have preallocated the memory */
- ret = attach_extent_buffer_page(eb, p, prealloc);
+ ret = attach_extent_buffer_folio(eb, folio, prealloc);
ASSERT(!ret);
/*
* To inform we have extra eb under allocation, so that
- * detach_extent_buffer_page() won't release the page private
+ * detach_extent_buffer_page() won't release the folio private
* when the eb hasn't yet been inserted into radix tree.
*
* The ref will be decreased when the eb released the page, in
* detach_extent_buffer_page().
* Thus needs no special handling in error path.
*/
- btrfs_page_inc_eb_refs(fs_info, p);
- spin_unlock(&mapping->private_lock);
+ btrfs_folio_inc_eb_refs(fs_info, folio);
+ spin_unlock(&mapping->i_private_lock);
+
+ WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len));
- WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
- eb->pages[i] = p;
- if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len))
+ /*
+ * Check if the current page is physically contiguous with previous eb
+ * page.
+ * At this stage, either we allocated a large folio, thus @i
+ * would only be 0, or we fall back to per-page allocation.
+ */
+ if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0))
+ page_contig = false;
+
+ if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len))
uptodate = 0;
/*
@@ -3581,12 +3829,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
}
if (uptodate)
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+ /* All pages are physically contiguous, can skip cross page handling. */
+ if (page_contig)
+ eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start);
again:
ret = radix_tree_preload(GFP_NOFS);
- if (ret) {
- exists = ERR_PTR(ret);
- goto free_eb;
- }
+ if (ret)
+ goto out;
spin_lock(&fs_info->buffer_lock);
ret = radix_tree_insert(&fs_info->buffer_radix,
@@ -3594,9 +3843,10 @@ again:
spin_unlock(&fs_info->buffer_lock);
radix_tree_preload_end();
if (ret == -EEXIST) {
- exists = find_extent_buffer(fs_info, start);
- if (exists)
- goto free_eb;
+ ret = 0;
+ existing_eb = find_extent_buffer(fs_info, start);
+ if (existing_eb)
+ goto out;
else
goto again;
}
@@ -3609,19 +3859,46 @@ again:
* btree_release_folio will correctly detect that a page belongs to a
* live buffer and won't free them prematurely.
*/
- for (i = 0; i < num_pages; i++)
- unlock_page(eb->pages[i]);
+ for (int i = 0; i < num_folios; i++)
+ unlock_page(folio_page(eb->folios[i], 0));
return eb;
-free_eb:
+out:
WARN_ON(!atomic_dec_and_test(&eb->refs));
- for (i = 0; i < num_pages; i++) {
- if (eb->pages[i])
- unlock_page(eb->pages[i]);
+
+ /*
+ * Any attached folios need to be detached before we unlock them. This
+ * is because when we're inserting our new folios into the mapping, and
+ * then attaching our eb to that folio. If we fail to insert our folio
+ * we'll lookup the folio for that index, and grab that EB. We do not
+ * want that to grab this eb, as we're getting ready to free it. So we
+ * have to detach it first and then unlock it.
+ *
+ * We have to drop our reference and NULL it out here because in the
+ * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb.
+ * Below when we call btrfs_release_extent_buffer() we will call
+ * detach_extent_buffer_folio() on our remaining pages in the !subpage
+ * case. If we left eb->folios[i] populated in the subpage case we'd
+ * double put our reference and be super sad.
+ */
+ for (int i = 0; i < attached; i++) {
+ ASSERT(eb->folios[i]);
+ detach_extent_buffer_folio(eb, eb->folios[i]);
+ unlock_page(folio_page(eb->folios[i], 0));
+ folio_put(eb->folios[i]);
+ eb->folios[i] = NULL;
}
+ /*
+ * Now all pages of that extent buffer is unmapped, set UNMAPPED flag,
+ * so it can be cleaned up without utlizing page->mapping.
+ */
+ set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
btrfs_release_extent_buffer(eb);
- return exists;
+ if (ret < 0)
+ return ERR_PTR(ret);
+ ASSERT(existing_eb);
+ return existing_eb;
}
static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
@@ -3713,31 +3990,30 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
release_extent_buffer(eb);
}
-static void btree_clear_page_dirty(struct page *page)
+static void btree_clear_folio_dirty(struct folio *folio)
{
- ASSERT(PageDirty(page));
- ASSERT(PageLocked(page));
- clear_page_dirty_for_io(page);
- xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page))
- __xa_clear_mark(&page->mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(&page->mapping->i_pages);
+ ASSERT(folio_test_dirty(folio));
+ ASSERT(folio_test_locked(folio));
+ folio_clear_dirty_for_io(folio);
+ xa_lock_irq(&folio->mapping->i_pages);
+ if (!folio_test_dirty(folio))
+ __xa_clear_mark(&folio->mapping->i_pages,
+ folio_index(folio), PAGECACHE_TAG_DIRTY);
+ xa_unlock_irq(&folio->mapping->i_pages);
}
static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page = eb->pages[0];
+ struct folio *folio = eb->folios[0];
bool last;
- /* btree_clear_page_dirty() needs page locked */
- lock_page(page);
- last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
- eb->len);
+ /* btree_clear_folio_dirty() needs page locked. */
+ folio_lock(folio);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len);
if (last)
- btree_clear_page_dirty(page);
- unlock_page(page);
+ btree_clear_folio_dirty(folio);
+ folio_unlock(folio);
WARN_ON(atomic_read(&eb->refs) == 0);
}
@@ -3745,15 +4021,27 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- int i;
- int num_pages;
- struct page *page;
+ int num_folios;
btrfs_assert_tree_write_locked(eb);
if (trans && btrfs_header_generation(eb) != trans->transid)
return;
+ /*
+ * Instead of clearing the dirty flag off of the buffer, mark it as
+ * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve
+ * write-ordering in zoned mode, without the need to later re-dirty
+ * the extent_buffer.
+ *
+ * The actual zeroout of the buffer will happen later in
+ * btree_csum_one_bio.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags);
+ return;
+ }
+
if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
return;
@@ -3763,30 +4051,29 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
if (eb->fs_info->nodesize < PAGE_SIZE)
return clear_subpage_extent_buffer_dirty(eb);
- num_pages = num_extent_pages(eb);
+ num_folios = num_extent_folios(eb);
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
- if (!PageDirty(page))
+ if (!folio_test_dirty(folio))
continue;
- lock_page(page);
- btree_clear_page_dirty(page);
- unlock_page(page);
+ folio_lock(folio);
+ btree_clear_folio_dirty(folio);
+ folio_unlock(folio);
}
WARN_ON(atomic_read(&eb->refs) == 0);
}
void set_extent_buffer_dirty(struct extent_buffer *eb)
{
- int i;
- int num_pages;
+ int num_folios;
bool was_dirty;
check_buffer_tree_ref(eb);
was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
- num_pages = num_extent_pages(eb);
+ num_folios = num_extent_folios(eb);
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
@@ -3805,34 +4092,32 @@ void set_extent_buffer_dirty(struct extent_buffer *eb)
* the above race.
*/
if (subpage)
- lock_page(eb->pages[0]);
- for (i = 0; i < num_pages; i++)
- btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
- eb->start, eb->len);
+ lock_page(folio_page(eb->folios[0], 0));
+ for (int i = 0; i < num_folios; i++)
+ btrfs_folio_set_dirty(eb->fs_info, eb->folios[i],
+ eb->start, eb->len);
if (subpage)
- unlock_page(eb->pages[0]);
+ unlock_page(folio_page(eb->folios[0], 0));
percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes,
eb->len,
eb->fs_info->dirty_metadata_batch);
}
#ifdef CONFIG_BTRFS_DEBUG
- for (i = 0; i < num_pages; i++)
- ASSERT(PageDirty(eb->pages[i]));
+ for (int i = 0; i < num_folios; i++)
+ ASSERT(folio_test_dirty(eb->folios[i]));
#endif
}
void clear_extent_buffer_uptodate(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page;
- int num_pages;
- int i;
+ int num_folios = num_extent_folios(eb);
clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
- if (!page)
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+
+ if (!folio)
continue;
/*
@@ -3840,44 +4125,40 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb)
* btrfs_is_subpage() can not handle cloned/dummy metadata.
*/
if (fs_info->nodesize >= PAGE_SIZE)
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
else
- btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
- eb->len);
+ btrfs_subpage_clear_uptodate(fs_info, folio,
+ eb->start, eb->len);
}
}
void set_extent_buffer_uptodate(struct extent_buffer *eb)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
- struct page *page;
- int num_pages;
- int i;
+ int num_folios = num_extent_folios(eb);
set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
- num_pages = num_extent_pages(eb);
- for (i = 0; i < num_pages; i++) {
- page = eb->pages[i];
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
/*
* This is special handling for metadata subpage, as regular
* btrfs_is_subpage() can not handle cloned/dummy metadata.
*/
if (fs_info->nodesize >= PAGE_SIZE)
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
else
- btrfs_subpage_set_uptodate(fs_info, page, eb->start,
- eb->len);
+ btrfs_subpage_set_uptodate(fs_info, folio,
+ eb->start, eb->len);
}
}
-static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
+static void end_bbio_meta_read(struct btrfs_bio *bbio)
{
struct extent_buffer *eb = bbio->private;
struct btrfs_fs_info *fs_info = eb->fs_info;
bool uptodate = !bbio->bio.bi_status;
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ struct folio_iter fi;
u32 bio_offset = 0;
eb->read_mirror = bbio->mirror_num;
@@ -3893,15 +4174,15 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
}
- bio_for_each_segment_all(bvec, &bbio->bio, iter_all) {
+ bio_for_each_folio_all(fi, &bbio->bio) {
+ struct folio *folio = fi.folio;
u64 start = eb->start + bio_offset;
- struct page *page = bvec->bv_page;
- u32 len = bvec->bv_len;
+ u32 len = fi.length;
if (uptodate)
- btrfs_page_set_uptodate(fs_info, page, start, len);
+ btrfs_folio_set_uptodate(fs_info, folio, start, len);
else
- btrfs_page_clear_uptodate(fs_info, page, start, len);
+ btrfs_folio_clear_uptodate(fs_info, folio, start, len);
bio_offset += len;
}
@@ -3917,8 +4198,8 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio)
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
struct btrfs_tree_parent_check *check)
{
- int num_pages = num_extent_pages(eb), i;
struct btrfs_bio *bbio;
+ bool ret;
if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
return 0;
@@ -3942,17 +4223,24 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES,
REQ_OP_READ | REQ_META, eb->fs_info,
- extent_buffer_read_end_io, eb);
+ end_bbio_meta_read, eb);
bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT;
bbio->inode = BTRFS_I(eb->fs_info->btree_inode);
bbio->file_offset = eb->start;
memcpy(&bbio->parent_check, check, sizeof(*check));
if (eb->fs_info->nodesize < PAGE_SIZE) {
- __bio_add_page(&bbio->bio, eb->pages[0], eb->len,
- eb->start - page_offset(eb->pages[0]));
+ ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len,
+ eb->start - folio_pos(eb->folios[0]));
+ ASSERT(ret);
} else {
- for (i = 0; i < num_pages; i++)
- __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0);
+ int num_folios = num_extent_folios(eb);
+
+ for (int i = 0; i < num_folios; i++) {
+ struct folio *folio = eb->folios[i];
+
+ ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0);
+ ASSERT(ret);
+ }
}
btrfs_submit_bio(bbio, mirror_num);
@@ -3999,29 +4287,33 @@ static inline int check_eb_range(const struct extent_buffer *eb,
void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
unsigned long start, unsigned long len)
{
+ const int unit_size = folio_size(eb->folios[0]);
size_t cur;
size_t offset;
- struct page *page;
- char *kaddr;
char *dst = (char *)dstv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
if (check_eb_range(eb, start, len)) {
/*
* Invalid range hit, reset the memory, so callers won't get
- * some random garbage for their uninitialzed memory.
+ * some random garbage for their uninitialized memory.
*/
memset(dstv, 0, len);
return;
}
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ memcpy(dstv, eb->addr + start, len);
+ return;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
+ char *kaddr;
- cur = min(len, (PAGE_SIZE - offset));
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
memcpy(dst, kaddr + offset, cur);
dst += cur;
@@ -4035,24 +4327,29 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
void __user *dstv,
unsigned long start, unsigned long len)
{
+ const int unit_size = folio_size(eb->folios[0]);
size_t cur;
size_t offset;
- struct page *page;
- char *kaddr;
char __user *dst = (char __user *)dstv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
int ret = 0;
WARN_ON(start > eb->len);
WARN_ON(start + len > eb->start + eb->len);
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ if (copy_to_user_nofault(dstv, eb->addr + start, len))
+ ret = -EFAULT;
+ return ret;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
+ char *kaddr;
- cur = min(len, (PAGE_SIZE - offset));
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
@@ -4070,25 +4367,25 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
unsigned long start, unsigned long len)
{
+ const int unit_size = folio_size(eb->folios[0]);
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
char *ptr = (char *)ptrv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
int ret = 0;
if (check_eb_range(eb, start, len))
return -EINVAL;
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr)
+ return memcmp(ptrv, eb->addr + start, len);
- while (len > 0) {
- page = eb->pages[i];
-
- cur = min(len, (PAGE_SIZE - offset));
+ offset = get_eb_offset_in_folio(eb, start);
- kaddr = page_address(page);
+ while (len > 0) {
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
ret = memcmp(ptr, kaddr + offset, cur);
if (ret)
break;
@@ -4107,10 +4404,12 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
* For regular sector size == PAGE_SIZE case, check if @page is uptodate.
* For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
*/
-static void assert_eb_page_uptodate(const struct extent_buffer *eb,
- struct page *page)
+static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct folio *folio = eb->folios[i];
+
+ ASSERT(folio);
/*
* If we are using the commit root we could potentially clear a page
@@ -4124,11 +4423,14 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb,
return;
if (fs_info->nodesize < PAGE_SIZE) {
- if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page,
+ struct folio *folio = eb->folios[0];
+
+ ASSERT(i == 0);
+ if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio,
eb->start, eb->len)))
- btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len);
+ btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len);
} else {
- WARN_ON(!PageUptodate(page));
+ WARN_ON(!folio_test_uptodate(folio));
}
}
@@ -4136,29 +4438,34 @@ static void __write_extent_buffer(const struct extent_buffer *eb,
const void *srcv, unsigned long start,
unsigned long len, bool use_memmove)
{
+ const int unit_size = folio_size(eb->folios[0]);
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
char *src = (char *)srcv;
- unsigned long i = get_eb_page_index(start);
+ unsigned long i = get_eb_folio_index(eb, start);
/* For unmapped (dummy) ebs, no need to check their uptodate status. */
const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
- WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
-
if (check_eb_range(eb, start, len))
return;
- offset = get_eb_offset_in_page(eb, start);
+ if (eb->addr) {
+ if (use_memmove)
+ memmove(eb->addr + start, srcv, len);
+ else
+ memcpy(eb->addr + start, srcv, len);
+ return;
+ }
+
+ offset = get_eb_offset_in_folio(eb, start);
while (len > 0) {
- page = eb->pages[i];
if (check_uptodate)
- assert_eb_page_uptodate(eb, page);
+ assert_eb_folio_uptodate(eb, i);
- cur = min(len, PAGE_SIZE - offset);
- kaddr = page_address(page);
+ cur = min(len, unit_size - offset);
+ kaddr = folio_address(eb->folios[i]);
if (use_memmove)
memmove(kaddr + offset, src, cur);
else
@@ -4180,16 +4487,21 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
static void memset_extent_buffer(const struct extent_buffer *eb, int c,
unsigned long start, unsigned long len)
{
+ const int unit_size = folio_size(eb->folios[0]);
unsigned long cur = start;
+ if (eb->addr) {
+ memset(eb->addr + start, c, len);
+ return;
+ }
+
while (cur < start + len) {
- unsigned long index = get_eb_page_index(cur);
- unsigned int offset = get_eb_offset_in_page(eb, cur);
- unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset);
- struct page *page = eb->pages[index];
+ unsigned long index = get_eb_folio_index(eb, cur);
+ unsigned int offset = get_eb_offset_in_folio(eb, cur);
+ unsigned int cur_len = min(start + len - cur, unit_size - offset);
- assert_eb_page_uptodate(eb, page);
- memset(page_address(page) + offset, c, cur_len);
+ assert_eb_folio_uptodate(eb, index);
+ memset(folio_address(eb->folios[index]) + offset, c, cur_len);
cur += cur_len;
}
@@ -4206,15 +4518,16 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
void copy_extent_buffer_full(const struct extent_buffer *dst,
const struct extent_buffer *src)
{
+ const int unit_size = folio_size(src->folios[0]);
unsigned long cur = 0;
ASSERT(dst->len == src->len);
while (cur < src->len) {
- unsigned long index = get_eb_page_index(cur);
- unsigned long offset = get_eb_offset_in_page(src, cur);
- unsigned long cur_len = min(src->len, PAGE_SIZE - offset);
- void *addr = page_address(src->pages[index]) + offset;
+ unsigned long index = get_eb_folio_index(src, cur);
+ unsigned long offset = get_eb_offset_in_folio(src, cur);
+ unsigned long cur_len = min(src->len, unit_size - offset);
+ void *addr = folio_address(src->folios[index]) + offset;
write_extent_buffer(dst, addr, cur, cur_len);
@@ -4227,12 +4540,12 @@ void copy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
+ const int unit_size = folio_size(dst->folios[0]);
u64 dst_len = dst->len;
size_t cur;
size_t offset;
- struct page *page;
char *kaddr;
- unsigned long i = get_eb_page_index(dst_offset);
+ unsigned long i = get_eb_folio_index(dst, dst_offset);
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(src, src_offset, len))
@@ -4240,15 +4553,14 @@ void copy_extent_buffer(const struct extent_buffer *dst,
WARN_ON(src->len != dst_len);
- offset = get_eb_offset_in_page(dst, dst_offset);
+ offset = get_eb_offset_in_folio(dst, dst_offset);
while (len > 0) {
- page = dst->pages[i];
- assert_eb_page_uptodate(dst, page);
+ assert_eb_folio_uptodate(dst, i);
- cur = min(len, (unsigned long)(PAGE_SIZE - offset));
+ cur = min(len, (unsigned long)(unit_size - offset));
- kaddr = page_address(page);
+ kaddr = folio_address(dst->folios[i]);
read_extent_buffer(src, kaddr + offset, src_offset, cur);
src_offset += cur;
@@ -4259,22 +4571,22 @@ void copy_extent_buffer(const struct extent_buffer *dst,
}
/*
- * Calculate the page and offset of the byte containing the given bit number.
+ * Calculate the folio and offset of the byte containing the given bit number.
*
* @eb: the extent buffer
* @start: offset of the bitmap item in the extent buffer
* @nr: bit number
- * @page_index: return index of the page in the extent buffer that contains
+ * @folio_index: return index of the folio in the extent buffer that contains
* the given bit number
- * @page_offset: return offset into the page given by page_index
+ * @folio_offset: return offset into the folio given by folio_index
*
* This helper hides the ugliness of finding the byte in an extent buffer which
* contains a given bit.
*/
static inline void eb_bitmap_offset(const struct extent_buffer *eb,
unsigned long start, unsigned long nr,
- unsigned long *page_index,
- size_t *page_offset)
+ unsigned long *folio_index,
+ size_t *folio_offset)
{
size_t byte_offset = BIT_BYTE(nr);
size_t offset;
@@ -4284,10 +4596,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
* the bitmap item in the extent buffer + the offset of the byte in the
* bitmap item.
*/
- offset = start + offset_in_page(eb->start) + byte_offset;
+ offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset;
- *page_index = offset >> PAGE_SHIFT;
- *page_offset = offset_in_page(offset);
+ *folio_index = offset >> folio_shift(eb->folios[0]);
+ *folio_offset = offset_in_folio(eb->folios[0], offset);
}
/*
@@ -4300,25 +4612,23 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb,
int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
unsigned long nr)
{
- u8 *kaddr;
- struct page *page;
unsigned long i;
size_t offset;
+ u8 *kaddr;
eb_bitmap_offset(eb, start, nr, &i, &offset);
- page = eb->pages[i];
- assert_eb_page_uptodate(eb, page);
- kaddr = page_address(page);
+ assert_eb_folio_uptodate(eb, i);
+ kaddr = folio_address(eb->folios[i]);
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr)
{
- unsigned long index = get_eb_page_index(bytenr);
+ unsigned long index = get_eb_folio_index(eb, bytenr);
if (check_eb_range(eb, bytenr, 1))
return NULL;
- return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr);
+ return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr);
}
/*
@@ -4403,19 +4713,30 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
unsigned long dst_offset, unsigned long src_offset,
unsigned long len)
{
+ const int unit_size = folio_size(dst->folios[0]);
unsigned long cur_off = 0;
if (check_eb_range(dst, dst_offset, len) ||
check_eb_range(dst, src_offset, len))
return;
+ if (dst->addr) {
+ const bool use_memmove = areas_overlap(src_offset, dst_offset, len);
+
+ if (use_memmove)
+ memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+ else
+ memcpy(dst->addr + dst_offset, dst->addr + src_offset, len);
+ return;
+ }
+
while (cur_off < len) {
unsigned long cur_src = cur_off + src_offset;
- unsigned long pg_index = get_eb_page_index(cur_src);
- unsigned long pg_off = get_eb_offset_in_page(dst, cur_src);
+ unsigned long folio_index = get_eb_folio_index(dst, cur_src);
+ unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src);
unsigned long cur_len = min(src_offset + len - cur_src,
- PAGE_SIZE - pg_off);
- void *src_addr = page_address(dst->pages[pg_index]) + pg_off;
+ unit_size - folio_off);
+ void *src_addr = folio_address(dst->folios[folio_index]) + folio_off;
const bool use_memmove = areas_overlap(src_offset + cur_off,
dst_offset + cur_off, cur_len);
@@ -4441,24 +4762,29 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
return;
}
+ if (dst->addr) {
+ memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+ return;
+ }
+
while (len > 0) {
unsigned long src_i;
size_t cur;
- size_t dst_off_in_page;
- size_t src_off_in_page;
+ size_t dst_off_in_folio;
+ size_t src_off_in_folio;
void *src_addr;
bool use_memmove;
- src_i = get_eb_page_index(src_end);
+ src_i = get_eb_folio_index(dst, src_end);
- dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
- src_off_in_page = get_eb_offset_in_page(dst, src_end);
+ dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end);
+ src_off_in_folio = get_eb_offset_in_folio(dst, src_end);
- cur = min_t(unsigned long, len, src_off_in_page + 1);
- cur = min(cur, dst_off_in_page + 1);
+ cur = min_t(unsigned long, len, src_off_in_folio + 1);
+ cur = min(cur, dst_off_in_folio + 1);
- src_addr = page_address(dst->pages[src_i]) + src_off_in_page -
- cur + 1;
+ src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio -
+ cur + 1;
use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1,
cur);
@@ -4520,7 +4846,7 @@ static int try_release_subpage_extent_buffer(struct page *page)
struct extent_buffer *eb = NULL;
/*
- * Unlike try_release_extent_buffer() which uses page->private
+ * Unlike try_release_extent_buffer() which uses folio private
* to grab buffer, for subpage case we rely on radix tree, thus
* we need to ensure radix tree consistency.
*
@@ -4560,43 +4886,44 @@ static int try_release_subpage_extent_buffer(struct page *page)
/*
* Here we don't care about the return value, we will always
- * check the page private at the end. And
+ * check the folio private at the end. And
* release_extent_buffer() will release the refs_lock.
*/
release_extent_buffer(eb);
}
/*
- * Finally to check if we have cleared page private, as if we have
- * released all ebs in the page, the page private should be cleared now.
+ * Finally to check if we have cleared folio private, as if we have
+ * released all ebs in the page, the folio private should be cleared now.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page))
+ spin_lock(&page->mapping->i_private_lock);
+ if (!folio_test_private(page_folio(page)))
ret = 1;
else
ret = 0;
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
return ret;
}
int try_release_extent_buffer(struct page *page)
{
+ struct folio *folio = page_folio(page);
struct extent_buffer *eb;
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return try_release_subpage_extent_buffer(page);
/*
- * We need to make sure nobody is changing page->private, as we rely on
- * page->private as the pointer to extent buffer.
+ * We need to make sure nobody is changing folio private, as we rely on
+ * folio private as the pointer to extent buffer.
*/
- spin_lock(&page->mapping->private_lock);
- if (!PagePrivate(page)) {
- spin_unlock(&page->mapping->private_lock);
+ spin_lock(&page->mapping->i_private_lock);
+ if (!folio_test_private(folio)) {
+ spin_unlock(&page->mapping->i_private_lock);
return 1;
}
- eb = (struct extent_buffer *)page->private;
+ eb = folio_get_private(folio);
BUG_ON(!eb);
/*
@@ -4607,10 +4934,10 @@ int try_release_extent_buffer(struct page *page)
spin_lock(&eb->refs_lock);
if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
spin_unlock(&eb->refs_lock);
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
return 0;
}
- spin_unlock(&page->mapping->private_lock);
+ spin_unlock(&page->mapping->i_private_lock);
/*
* If tree ref isn't set then we know the ref on this eb is a real ref,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2171057a4477..46050500529b 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -28,7 +28,8 @@ enum {
EXTENT_BUFFER_IN_TREE,
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
- EXTENT_BUFFER_NO_CHECK,
+ /* Indicate the extent buffer is written zeroed out (for zoned) */
+ EXTENT_BUFFER_ZONED_ZEROOUT,
/* Indicate that extent buffer pages a being read */
EXTENT_BUFFER_READING,
};
@@ -43,10 +44,10 @@ enum {
};
/*
- * page->private values. Every page that is controlled by the extent
- * map has page->private set to one.
+ * Folio private values. Every page that is controlled by the extent map has
+ * folio private set to this value.
*/
-#define EXTENT_PAGE_PRIVATE 1
+#define EXTENT_FOLIO_PRIVATE 1
/*
* The extent buffer bitmap operations are done with byte granularity instead of
@@ -77,6 +78,13 @@ struct extent_buffer {
unsigned long len;
unsigned long bflags;
struct btrfs_fs_info *fs_info;
+
+ /*
+ * The address where the eb can be accessed without any cross-page handling.
+ * This can be NULL if not possible.
+ */
+ void *addr;
+
spinlock_t refs_lock;
atomic_t refs;
int read_mirror;
@@ -86,7 +94,12 @@ struct extent_buffer {
struct rw_semaphore lock;
- struct page *pages[INLINE_EXTENT_BUFFER_PAGES];
+ /*
+ * Pointers to all the folios of the extent buffer.
+ *
+ * For now the folio is always order 0 (aka, a single page).
+ */
+ struct folio *folios[INLINE_EXTENT_BUFFER_PAGES];
#ifdef CONFIG_BTRFS_DEBUG
struct list_head leak_list;
pid_t lock_owner;
@@ -108,29 +121,43 @@ struct btrfs_eb_write_context {
*
* Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
*/
-static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
- unsigned long offset)
+static inline size_t get_eb_offset_in_folio(const struct extent_buffer *eb,
+ unsigned long offset)
{
/*
- * For sectorsize == PAGE_SIZE case, eb->start will always be aligned
- * to PAGE_SIZE, thus adding it won't cause any difference.
+ * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+ * 1.1) One large folio covering the whole eb
+ * The eb->start is aligned to folio size, thus adding it
+ * won't cause any difference.
+ * 1.2) Several page sized folios
+ * The eb->start is aligned to folio (page) size, thus
+ * adding it won't cause any difference.
*
- * For sectorsize < PAGE_SIZE, we must only read the data that belongs
- * to the eb, thus we have to take the eb->start into consideration.
+ * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+ * In this case there would only be one page sized folio, and there
+ * may be several different extent buffers in the page/folio.
+ * We need to add eb->start to properly access the offset inside
+ * that eb.
*/
- return offset_in_page(offset + eb->start);
+ return offset_in_folio(eb->folios[0], offset + eb->start);
}
-static inline unsigned long get_eb_page_index(unsigned long offset)
+static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb,
+ unsigned long offset)
{
/*
- * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
+ * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case
+ * 1.1) One large folio covering the whole eb.
+ * the folio_shift would be large enough to always make us
+ * return 0 as index.
+ * 1.2) Several page sized folios
+ * The folio_shift() would be PAGE_SHIFT, giving us the correct
+ * index.
*
- * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
- * and have ensured that all tree blocks are contained in one page,
- * thus we always get index == 0.
+ * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case
+ * The folio would only be page sized, and always give us 0 as index.
*/
- return offset >> PAGE_SHIFT;
+ return offset >> folio_shift(eb->folios[0]);
}
/*
@@ -230,6 +257,20 @@ static inline int num_extent_pages(const struct extent_buffer *eb)
return (eb->len >> PAGE_SHIFT) ?: 1;
}
+/*
+ * This can only be determined at runtime by checking eb::folios[0].
+ *
+ * As we can have either one large folio covering the whole eb
+ * (either nodesize <= PAGE_SIZE, or high order folio), or multiple
+ * single-paged folios.
+ */
+static inline int num_extent_folios(const struct extent_buffer *eb)
+{
+ if (folio_order(eb->folios[0]))
+ return 1;
+ return num_extent_pages(eb);
+}
+
static inline int extent_buffer_uptodate(const struct extent_buffer *eb)
{
return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
@@ -294,7 +335,8 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans,
struct extent_buffer *buf);
-int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
+int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array,
+ gfp_t extra_gfp);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
bool find_lock_delalloc_range(struct inode *inode,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index a6d8368ed0ed..b61099bf97a8 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -50,7 +50,6 @@ struct extent_map *alloc_extent_map(void)
if (!em)
return NULL;
RB_CLEAR_NODE(&em->rb_node);
- em->compress_type = BTRFS_COMPRESS_NONE;
refcount_set(&em->refs, 1);
INIT_LIST_HEAD(&em->list);
return em;
@@ -67,8 +66,6 @@ void free_extent_map(struct extent_map *em)
if (refcount_dec_and_test(&em->refs)) {
WARN_ON(extent_map_in_tree(em));
WARN_ON(!list_empty(&em->list));
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- kfree(em->map_lookup);
kmem_cache_free(extent_map_cache, em);
}
}
@@ -182,50 +179,50 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return NULL;
}
-/* Check to see if two extent_map structs are adjacent and safe to merge. */
-static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+static inline u64 extent_map_block_end(const struct extent_map *em)
{
- if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
- return 0;
+ if (em->block_start + em->block_len < em->block_start)
+ return (u64)-1;
+ return em->block_start + em->block_len;
+}
- /*
- * don't merge compressed extents, we need to know their
- * actual size
- */
- if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
- return 0;
+static bool can_merge_extent_map(const struct extent_map *em)
+{
+ if (em->flags & EXTENT_FLAG_PINNED)
+ return false;
+
+ /* Don't merge compressed extents, we need to know their actual size. */
+ if (extent_map_is_compressed(em))
+ return false;
- if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
- test_bit(EXTENT_FLAG_LOGGING, &next->flags))
- return 0;
+ if (em->flags & EXTENT_FLAG_LOGGING)
+ return false;
/*
* We don't want to merge stuff that hasn't been written to the log yet
* since it may not reflect exactly what is on disk, and that would be
* bad.
*/
- if (!list_empty(&prev->list) || !list_empty(&next->list))
- return 0;
-
- ASSERT(next->block_start != EXTENT_MAP_DELALLOC &&
- prev->block_start != EXTENT_MAP_DELALLOC);
-
- if (prev->map_lookup || next->map_lookup)
- ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) &&
- test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags));
-
- if (extent_map_end(prev) == next->start &&
- prev->flags == next->flags &&
- prev->map_lookup == next->map_lookup &&
- ((next->block_start == EXTENT_MAP_HOLE &&
- prev->block_start == EXTENT_MAP_HOLE) ||
- (next->block_start == EXTENT_MAP_INLINE &&
- prev->block_start == EXTENT_MAP_INLINE) ||
- (next->block_start < EXTENT_MAP_LAST_BYTE - 1 &&
- next->block_start == extent_map_block_end(prev)))) {
- return 1;
- }
- return 0;
+ if (!list_empty(&em->list))
+ return false;
+
+ return true;
+}
+
+/* Check to see if two extent_map structs are adjacent and safe to merge. */
+static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next)
+{
+ if (extent_map_end(prev) != next->start)
+ return false;
+
+ if (prev->flags != next->flags)
+ return false;
+
+ if (next->block_start < EXTENT_MAP_LAST_BYTE - 1)
+ return next->block_start == extent_map_block_end(prev);
+
+ /* HOLES and INLINE extents. */
+ return next->block_start == prev->block_start;
}
static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
@@ -244,11 +241,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
if (refcount_read(&em->refs) > 2)
return;
+ if (!can_merge_extent_map(em))
+ return;
+
if (em->start != 0) {
rb = rb_prev(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(merge, em)) {
+ if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) {
em->start = merge->start;
em->orig_start = merge->orig_start;
em->len += merge->len;
@@ -257,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start;
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
- set_bit(EXTENT_FLAG_MERGED, &em->flags);
+ em->flags |= EXTENT_FLAG_MERGED;
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
@@ -268,14 +268,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
rb = rb_next(&em->rb_node);
if (rb)
merge = rb_entry(rb, struct extent_map, rb_node);
- if (rb && mergable_maps(em, merge)) {
+ if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) {
em->len += merge->len;
em->block_len += merge->block_len;
rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
- set_bit(EXTENT_FLAG_MERGED, &em->flags);
+ em->flags |= EXTENT_FLAG_MERGED;
free_extent_map(merge);
}
}
@@ -283,7 +283,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
/*
* Unpin an extent from the cache.
*
- * @tree: tree to unpin the extent in
+ * @inode: the inode from which we are unpinning an extent range
* @start: logical offset in the file
* @len: length of the extent
* @gen: generation that this extent has been modified in
@@ -292,9 +292,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
* to the generation that actually added the file item to the inode so we know
* we need to sync this extent when we call fsync().
*/
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
- u64 gen)
+int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen)
{
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+ struct extent_map_tree *tree = &inode->extent_tree;
int ret = 0;
struct extent_map *em;
bool prealloc = false;
@@ -302,19 +303,28 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
write_lock(&tree->lock);
em = lookup_extent_mapping(tree, start, len);
- WARN_ON(!em || em->start != start);
-
- if (!em)
+ if (WARN_ON(!em)) {
+ btrfs_warn(fs_info,
+"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu",
+ btrfs_ino(inode), btrfs_root_id(inode->root),
+ start, len, gen);
goto out;
+ }
+
+ if (WARN_ON(em->start != start))
+ btrfs_warn(fs_info,
+"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu",
+ btrfs_ino(inode), btrfs_root_id(inode->root),
+ em->start, start, len, gen);
em->generation = gen;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags &= ~EXTENT_FLAG_PINNED;
em->mod_start = em->start;
em->mod_len = em->len;
- if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) {
+ if (em->flags & EXTENT_FLAG_FILLING) {
prealloc = true;
- clear_bit(EXTENT_FLAG_FILLING, &em->flags);
+ em->flags &= ~EXTENT_FLAG_FILLING;
}
try_merge_map(tree, em);
@@ -335,7 +345,7 @@ void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
{
lockdep_assert_held_write(&tree->lock);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags &= ~EXTENT_FLAG_LOGGING;
if (extent_map_in_tree(em))
try_merge_map(tree, em);
}
@@ -348,45 +358,14 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
em->mod_start = em->start;
em->mod_len = em->len;
+ ASSERT(list_empty(&em->list));
+
if (modified)
- list_move(&em->list, &tree->modified_extents);
+ list_add(&em->list, &tree->modified_extents);
else
try_merge_map(tree, em);
}
-static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
-{
- struct map_lookup *map = em->map_lookup;
- u64 stripe_size = em->orig_block_len;
- int i;
-
- for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_io_stripe *stripe = &map->stripes[i];
- struct btrfs_device *device = stripe->dev;
-
- set_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + stripe_size - 1,
- bits | EXTENT_NOWAIT, NULL);
- }
-}
-
-static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
-{
- struct map_lookup *map = em->map_lookup;
- u64 stripe_size = em->orig_block_len;
- int i;
-
- for (i = 0; i < map->num_stripes; i++) {
- struct btrfs_io_stripe *stripe = &map->stripes[i];
- struct btrfs_device *device = stripe->dev;
-
- __clear_extent_bit(&device->alloc_state, stripe->physical,
- stripe->physical + stripe_size - 1,
- bits | EXTENT_NOWAIT,
- NULL, NULL);
- }
-}
-
/*
* Add new extent map to the extent tree
*
@@ -400,8 +379,8 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
* into the tree directly, with an additional reference taken, or a
* reference dropped if the merge attempt was successful.
*/
-int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em, int modified)
+static int add_extent_mapping(struct extent_map_tree *tree,
+ struct extent_map *em, int modified)
{
int ret = 0;
@@ -412,10 +391,6 @@ int add_extent_mapping(struct extent_map_tree *tree,
goto out;
setup_extent_mapping(tree, em, modified);
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) {
- extent_map_device_set_bits(em, CHUNK_ALLOCATED);
- extent_map_device_clear_bits(em, CHUNK_TRIMMED);
- }
out:
return ret;
}
@@ -495,12 +470,10 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
lockdep_assert_held_write(&tree->lock);
- WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
+ WARN_ON(em->flags & EXTENT_FLAG_PINNED);
rb_erase_cached(&em->rb_node, &tree->map);
- if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+ if (!(em->flags & EXTENT_FLAG_LOGGING))
list_del_init(&em->list);
- if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
- extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
RB_CLEAR_NODE(&em->rb_node);
}
@@ -511,9 +484,9 @@ static void replace_extent_mapping(struct extent_map_tree *tree,
{
lockdep_assert_held_write(&tree->lock);
- WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags));
+ WARN_ON(cur->flags & EXTENT_FLAG_PINNED);
ASSERT(extent_map_in_tree(cur));
- if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
+ if (!(cur->flags & EXTENT_FLAG_LOGGING))
list_del_init(&cur->list);
rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
RB_CLEAR_NODE(&cur->rb_node);
@@ -576,7 +549,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
em->start = start;
em->len = end - start;
if (em->block_start < EXTENT_MAP_LAST_BYTE &&
- !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ !extent_map_is_compressed(em)) {
em->block_start += start_diff;
em->block_len = em->len;
}
@@ -626,8 +599,6 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
if (ret == -EEXIST) {
struct extent_map *existing;
- ret = 0;
-
existing = search_extent_mapping(em_tree, start, len);
trace_btrfs_handle_em_exist(fs_info, existing, em, start, len);
@@ -681,8 +652,7 @@ static void drop_all_extent_maps_fast(struct extent_map_tree *tree)
node = rb_first_cached(&tree->map);
em = rb_entry(node, struct extent_map, rb_node);
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
- clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
remove_extent_mapping(tree, em);
free_extent_map(em);
cond_resched_rwlock_write(&tree->lock);
@@ -758,19 +728,18 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
}
}
- if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
+ if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) {
start = em_end;
goto next;
}
flags = em->flags;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
/*
* In case we split the extent map, we want to preserve the
* EXTENT_FLAG_LOGGING flag on our extent map, but we don't want
* it on the new extent maps.
*/
- clear_bit(EXTENT_FLAG_LOGGING, &flags);
+ em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING);
modified = !list_empty(&em->list);
/*
@@ -781,7 +750,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
goto remove_em;
gen = em->generation;
- compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ compressed = extent_map_is_compressed(em);
if (em->start < start) {
if (!split) {
@@ -814,7 +783,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->generation = gen;
split->flags = flags;
- split->compress_type = em->compress_type;
replace_extent_mapping(em_tree, em, split, modified);
free_extent_map(split);
split = split2;
@@ -831,7 +799,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end,
split->len = em_end - end;
split->block_start = em->block_start;
split->flags = flags;
- split->compress_type = em->compress_type;
split->generation = gen;
if (em->block_start < EXTENT_MAP_LAST_BYTE) {
@@ -997,14 +964,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
}
ASSERT(em->len == len);
- ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags));
+ ASSERT(!extent_map_is_compressed(em));
ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE);
- ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags));
+ ASSERT(em->flags & EXTENT_FLAG_PINNED);
+ ASSERT(!(em->flags & EXTENT_FLAG_LOGGING));
ASSERT(!list_empty(&em->list));
flags = em->flags;
- clear_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags &= ~EXTENT_FLAG_PINNED;
/* First, replace the em with a new extent_map starting from * em->start */
split_pre->start = em->start;
@@ -1015,7 +982,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_pre->orig_block_len = split_pre->block_len;
split_pre->ram_bytes = split_pre->len;
split_pre->flags = flags;
- split_pre->compress_type = em->compress_type;
split_pre->generation = em->generation;
replace_extent_mapping(em_tree, em, split_pre, 1);
@@ -1034,7 +1000,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
split_mid->orig_block_len = split_mid->block_len;
split_mid->ram_bytes = split_mid->len;
split_mid->flags = flags;
- split_mid->compress_type = em->compress_type;
split_mid->generation = em->generation;
add_extent_mapping(em_tree, split_mid, 1);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 35d27c756e08..e380fc08bbe4 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -5,30 +5,33 @@
#include <linux/rbtree.h>
#include <linux/refcount.h>
+#include "compression.h"
#define EXTENT_MAP_LAST_BYTE ((u64)-4)
#define EXTENT_MAP_HOLE ((u64)-3)
#define EXTENT_MAP_INLINE ((u64)-2)
-/* used only during fiemap calls */
-#define EXTENT_MAP_DELALLOC ((u64)-1)
/* bits for the extent_map::flags field */
enum {
/* this entry not yet on disk, don't free it */
- EXTENT_FLAG_PINNED,
- EXTENT_FLAG_COMPRESSED,
+ ENUM_BIT(EXTENT_FLAG_PINNED),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_ZLIB),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_LZO),
+ ENUM_BIT(EXTENT_FLAG_COMPRESS_ZSTD),
/* pre-allocated extent */
- EXTENT_FLAG_PREALLOC,
+ ENUM_BIT(EXTENT_FLAG_PREALLOC),
/* Logging this extent */
- EXTENT_FLAG_LOGGING,
+ ENUM_BIT(EXTENT_FLAG_LOGGING),
/* Filling in a preallocated extent */
- EXTENT_FLAG_FILLING,
- /* filesystem extent mapping type */
- EXTENT_FLAG_FS_MAPPING,
+ ENUM_BIT(EXTENT_FLAG_FILLING),
/* This em is merged from two or more physically adjacent ems */
- EXTENT_FLAG_MERGED,
+ ENUM_BIT(EXTENT_FLAG_MERGED),
};
+/*
+ * Keep this structure as compact as possible, as we can have really large
+ * amounts of allocated extent maps at any time.
+ */
struct extent_map {
struct rb_node rb_node;
@@ -49,11 +52,8 @@ struct extent_map {
* For non-merged extents, it's from btrfs_file_extent_item::generation.
*/
u64 generation;
- unsigned long flags;
- /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */
- struct map_lookup *map_lookup;
+ u32 flags;
refcount_t refs;
- unsigned int compress_type;
struct list_head list;
};
@@ -65,30 +65,57 @@ struct extent_map_tree {
struct btrfs_inode;
+static inline void extent_map_set_compression(struct extent_map *em,
+ enum btrfs_compression_type type)
+{
+ if (type == BTRFS_COMPRESS_ZLIB)
+ em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
+ else if (type == BTRFS_COMPRESS_LZO)
+ em->flags |= EXTENT_FLAG_COMPRESS_LZO;
+ else if (type == BTRFS_COMPRESS_ZSTD)
+ em->flags |= EXTENT_FLAG_COMPRESS_ZSTD;
+}
+
+static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em)
+{
+ if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB)
+ return BTRFS_COMPRESS_ZLIB;
+
+ if (em->flags & EXTENT_FLAG_COMPRESS_LZO)
+ return BTRFS_COMPRESS_LZO;
+
+ if (em->flags & EXTENT_FLAG_COMPRESS_ZSTD)
+ return BTRFS_COMPRESS_ZSTD;
+
+ return BTRFS_COMPRESS_NONE;
+}
+
+/*
+ * More efficient way to determine if extent is compressed, instead of using
+ * 'extent_map_compression() != BTRFS_COMPRESS_NONE'.
+ */
+static inline bool extent_map_is_compressed(const struct extent_map *em)
+{
+ return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB |
+ EXTENT_FLAG_COMPRESS_LZO |
+ EXTENT_FLAG_COMPRESS_ZSTD)) != 0;
+}
+
static inline int extent_map_in_tree(const struct extent_map *em)
{
return !RB_EMPTY_NODE(&em->rb_node);
}
-static inline u64 extent_map_end(struct extent_map *em)
+static inline u64 extent_map_end(const struct extent_map *em)
{
if (em->start + em->len < em->start)
return (u64)-1;
return em->start + em->len;
}
-static inline u64 extent_map_block_end(struct extent_map *em)
-{
- if (em->block_start + em->block_len < em->block_start)
- return (u64)-1;
- return em->block_start + em->block_len;
-}
-
void extent_map_tree_init(struct extent_map_tree *tree);
struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
-int add_extent_mapping(struct extent_map_tree *tree,
- struct extent_map *em, int modified);
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre,
u64 new_logical);
@@ -97,7 +124,7 @@ struct extent_map *alloc_extent_map(void);
void free_extent_map(struct extent_map *em);
int __init extent_map_init(void);
void __cold extent_map_exit(void);
-int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen);
void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 45cae356e89b..81ac1d474bf1 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -59,7 +59,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
goto out_unlock;
}
- ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start,
+ ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start,
&end, EXTENT_DIRTY);
if (!ret && start == 0)
i_size = min(i_size, end + 1);
@@ -94,7 +94,7 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
- return set_extent_bit(&inode->file_extent_tree, start, start + len - 1,
+ return set_extent_bit(inode->file_extent_tree, start, start + len - 1,
EXTENT_DIRTY, NULL);
}
@@ -123,7 +123,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES))
return 0;
- return clear_extent_bit(&inode->file_extent_tree, start,
+ return clear_extent_bit(inode->file_extent_tree, start,
start + len - 1, EXTENT_DIRTY, NULL);
}
@@ -1294,8 +1294,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
return;
}
if (compress_type != BTRFS_COMPRESS_NONE) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
+ extent_map_set_compression(em, compress_type);
em->block_start = bytenr;
em->block_len = em->orig_block_len;
} else {
@@ -1303,7 +1302,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
em->block_start = bytenr;
em->block_len = em->len;
if (type == BTRFS_FILE_EXTENT_PREALLOC)
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->flags |= EXTENT_FLAG_PREALLOC;
}
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
em->block_start = EXTENT_MAP_INLINE;
@@ -1315,9 +1314,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
*/
em->orig_start = EXTENT_MAP_HOLE;
em->block_len = (u64)-1;
- em->compress_type = compress_type;
- if (compress_type != BTRFS_COMPRESS_NONE)
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ extent_map_set_compression(em, compress_type);
} else {
btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, "
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 32611a4edd6b..38dfcac47609 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -111,8 +111,8 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
* accessed as prepare_pages should have marked them accessed
* in prepare_pages via find_or_create_page()
*/
- btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
- block_len);
+ btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]),
+ block_start, block_len);
unlock_page(pages[i]);
put_page(pages[i]);
}
@@ -168,9 +168,12 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
- btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
- btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
- btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
+ btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p),
+ start_pos, num_bytes);
+ btrfs_folio_clamp_clear_checked(fs_info, page_folio(p),
+ start_pos, num_bytes);
+ btrfs_folio_clamp_set_dirty(fs_info, page_folio(p),
+ start_pos, num_bytes);
}
/*
@@ -869,9 +872,9 @@ static int prepare_uptodate_page(struct inode *inode,
* released.
*
* The private flag check is essential for subpage as we need
- * to store extra bitmap using page->private.
+ * to store extra bitmap using folio private.
*/
- if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
+ if (page->mapping != inode->i_mapping || !folio_test_private(folio)) {
unlock_page(page);
return -EAGAIN;
}
@@ -2150,7 +2153,6 @@ out:
hole_em->block_start = EXTENT_MAP_HOLE;
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
- hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -2839,7 +2841,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
if (em->block_start == EXTENT_MAP_HOLE)
ret = RANGE_BOUNDARY_HOLE;
- else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ else if (em->flags & EXTENT_FLAG_PREALLOC)
ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
else
ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
@@ -2879,8 +2881,7 @@ static int btrfs_zero_range(struct inode *inode,
* extents and holes, we drop all the existing extents and allocate a
* new prealloc extent, so that we get a larger contiguous disk extent.
*/
- if (em->start <= alloc_start &&
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) {
const u64 em_end = em->start + em->len;
if (em_end >= offset + len) {
@@ -2915,7 +2916,7 @@ static int btrfs_zero_range(struct inode *inode,
goto out;
}
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (em->flags & EXTENT_FLAG_PREALLOC) {
free_extent_map(em);
ret = btrfs_fallocate_update_isize(inode, offset + len,
mode);
@@ -3136,7 +3137,7 @@ static long btrfs_fallocate(struct file *file, int mode,
last_byte = ALIGN(last_byte, blocksize);
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
- !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+ !(em->flags & EXTENT_FLAG_PREALLOC))) {
const u64 range_len = last_byte - cur_offset;
ret = add_falloc_range(&reserve_list, cur_offset, range_len);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 6f93c9a2c3e3..d372c7ce0e6b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -439,8 +439,8 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl)
for (i = 0; i < io_ctl->num_pages; i++) {
if (io_ctl->pages[i]) {
- btrfs_page_clear_checked(io_ctl->fs_info,
- io_ctl->pages[i],
+ btrfs_folio_clear_checked(io_ctl->fs_info,
+ page_folio(io_ctl->pages[i]),
page_offset(io_ctl->pages[i]),
PAGE_SIZE);
unlock_page(io_ctl->pages[i]);
diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h
index 318df6f9d9cb..f8bb73d6ab68 100644
--- a/fs/btrfs/fs.h
+++ b/fs/btrfs/fs.h
@@ -188,6 +188,7 @@ enum {
BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27),
BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28),
BTRFS_MOUNT_NODISCARD = (1UL << 29),
+ BTRFS_MOUNT_NOSPACECACHE = (1UL << 30),
};
/*
@@ -398,7 +399,8 @@ struct btrfs_fs_info {
struct extent_io_tree excluded_extents;
/* logical->physical extent mapping */
- struct extent_map_tree mapping_tree;
+ struct rb_root_cached mapping_tree;
+ rwlock_t mapping_tree_lock;
/*
* Block reservation for extent, checksum, root tree and delayed dir
@@ -960,20 +962,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
BTRFS_MOUNT_##opt)
-#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (!btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_set_opt(fs_info->mount_opt, opt); \
-} while (0)
-
-#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
-do { \
- if (btrfs_test_opt(fs_info, opt)) \
- btrfs_info(fs_info, fmt, ##args); \
- btrfs_clear_opt(fs_info->mount_opt, opt); \
-} while (0)
-
static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
{
/* Do it this way so we only ever do one test_bit in the normal case. */
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fb3c3f43c3fa..4795738d5785 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -114,6 +114,15 @@ struct data_reloc_warn {
int mirror_num;
};
+/*
+ * For the file_extent_tree, we want to hold the inode lock when we lookup and
+ * update the disk_i_size, but lockdep will complain because our io_tree we hold
+ * the tree lock and get the inode lock when setting delalloc. These two things
+ * are unrelated, so make a class for the file_extent_tree so we don't get the
+ * two locking patterns mixed up.
+ */
+static struct lock_class_key file_extent_tree_class;
+
static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
@@ -447,8 +456,8 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
* range, then btrfs_mark_ordered_io_finished() will handle
* the ordered extent accounting for the range.
*/
- btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
- offset, bytes);
+ btrfs_folio_clamp_clear_ordered(inode->root->fs_info,
+ page_folio(page), offset, bytes);
put_page(page);
}
@@ -1037,7 +1046,7 @@ free_pages:
if (pages) {
for (i = 0; i < nr_pages; i++) {
WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
+ btrfs_free_compr_page(pages[i]);
}
kfree(pages);
}
@@ -1052,7 +1061,7 @@ static void free_async_extent_pages(struct async_extent *async_extent)
for (i = 0; i < async_extent->nr_pages; i++) {
WARN_ON(async_extent->pages[i]->mapping);
- put_page(async_extent->pages[i]);
+ btrfs_free_compr_page(async_extent->pages[i]);
}
kfree(async_extent->pages);
async_extent->nr_pages = 0;
@@ -2793,7 +2802,7 @@ out_page:
PAGE_SIZE, !ret);
clear_page_dirty_for_io(page);
}
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE);
unlock_page(page);
put_page(page);
kfree(fixup);
@@ -2848,7 +2857,7 @@ int btrfs_writepage_cow_fixup(struct page *page)
* page->mapping outside of the page lock.
*/
ihold(inode);
- btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
+ btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE);
get_page(page);
btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
fixup->page = page;
@@ -3118,7 +3127,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
ordered_extent->disk_num_bytes);
}
}
- unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
+ unpin_extent_cache(inode, ordered_extent->file_offset,
ordered_extent->num_bytes, trans->transid);
if (ret < 0) {
btrfs_abort_transaction(trans, ret);
@@ -3175,8 +3184,23 @@ out:
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
- /* Drop extent maps for the part of the extent we didn't write. */
- btrfs_drop_extent_map_range(inode, unwritten_start, end, false);
+ /*
+ * Drop extent maps for the part of the extent we didn't write.
+ *
+ * We have an exception here for the free_space_inode, this is
+ * because when we do btrfs_get_extent() on the free space inode
+ * we will search the commit root. If this is a new block group
+ * we won't find anything, and we will trip over the assert in
+ * writepage where we do ASSERT(em->block_start !=
+ * EXTENT_MAP_HOLE).
+ *
+ * Theoretically we could also skip this for any NOCOW extent as
+ * we don't mess with the extent map tree in the NOCOW case, but
+ * for now simply skip this if we are the free space inode.
+ */
+ if (!btrfs_is_free_space_inode(inode))
+ btrfs_drop_extent_map_range(inode, unwritten_start,
+ end, false);
/*
* If the ordered extent had an IOERR or something else went
@@ -3796,7 +3820,7 @@ cache_index:
* cache.
*
* This is required for both inode re-read from disk and delayed inode
- * in delayed_nodes_tree.
+ * in the delayed_nodes xarray.
*/
if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info))
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
@@ -4449,6 +4473,8 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
u64 root_flags;
int ret;
+ down_write(&fs_info->subvol_sem);
+
/*
* Don't allow to delete a subvolume with send in progress. This is
* inside the inode lock so the error handling that has to drop the bit
@@ -4460,25 +4486,25 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
dest->root_key.objectid);
- return -EPERM;
+ ret = -EPERM;
+ goto out_up_write;
}
if (atomic_read(&dest->nr_swapfiles)) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu with active swapfile",
root->root_key.objectid);
- return -EPERM;
+ ret = -EPERM;
+ goto out_up_write;
}
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- down_write(&fs_info->subvol_sem);
-
ret = may_destroy_subvol(dest);
if (ret)
- goto out_up_write;
+ goto out_undead;
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
@@ -4488,7 +4514,7 @@ int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
*/
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
if (ret)
- goto out_up_write;
+ goto out_undead;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
@@ -4554,15 +4580,17 @@ out_end_trans:
inode->i_flags |= S_DEAD;
out_release:
btrfs_subvolume_release_metadata(root, &block_rsv);
-out_up_write:
- up_write(&fs_info->subvol_sem);
+out_undead:
if (ret) {
spin_lock(&dest->root_item_lock);
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
spin_unlock(&dest->root_item_lock);
- } else {
+ }
+out_up_write:
+ up_write(&fs_info->subvol_sem);
+ if (!ret) {
d_invalidate(dentry);
btrfs_prune_dentries(dest);
ASSERT(dest->send_in_progress == 0);
@@ -4725,7 +4753,7 @@ again:
/*
* We unlock the page after the io is completed and then re-lock it
* above. release_folio() could have come in between that and cleared
- * PagePrivate(), but left the page in the mapping. Set the page mapped
+ * folio private, but left the page in the mapping. Set the page mapped
* here to make sure it's properly set for the subpage stuff.
*/
ret = set_page_extent_mapped(page);
@@ -4767,9 +4795,10 @@ again:
memzero_page(page, (block_start - page_offset(page)) + offset,
len);
}
- btrfs_page_clear_checked(fs_info, page, block_start,
- block_end + 1 - block_start);
- btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), block_start,
+ block_end + 1 - block_start);
+ btrfs_folio_set_dirty(fs_info, page_folio(page), block_start,
+ block_end + 1 - block_start);
unlock_extent(io_tree, block_start, block_end, &cached_state);
if (only_release_metadata)
@@ -4889,7 +4918,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
last_byte = ALIGN(last_byte, fs_info->sectorsize);
hole_size = last_byte - cur_offset;
- if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
struct extent_map *hole_em;
err = maybe_insert_hole(inode, cur_offset, hole_size);
@@ -4917,7 +4946,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
hole_em->block_len = 0;
hole_em->orig_block_len = 0;
hole_em->ram_bytes = hole_size;
- hole_em->compress_type = BTRFS_COMPRESS_NONE;
hole_em->generation = btrfs_get_fs_generation(fs_info);
err = btrfs_replace_extent_map_range(inode, hole_em, true);
@@ -6217,6 +6245,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
inode->i_generation = BTRFS_I(inode)->generation;
/*
+ * We don't have any capability xattrs set here yet, shortcut any
+ * queries for the xattrs here. If we add them later via the inode
+ * security init path or any other path this flag will be cleared.
+ */
+ set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ /*
* Subvolumes don't inherit flags from their parent directory.
* Originally this was probably by accident, but we probably can't
* change it now without compatibility issues.
@@ -7258,13 +7293,11 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
em->orig_block_len = orig_block_len;
em->ram_bytes = ram_bytes;
em->generation = -1;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
- if (type == BTRFS_ORDERED_PREALLOC) {
- set_bit(EXTENT_FLAG_FILLING, &em->flags);
- } else if (type == BTRFS_ORDERED_COMPRESSED) {
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
- em->compress_type = compress_type;
- }
+ em->flags |= EXTENT_FLAG_PINNED;
+ if (type == BTRFS_ORDERED_PREALLOC)
+ em->flags |= EXTENT_FLAG_FILLING;
+ else if (type == BTRFS_ORDERED_COMPRESSED)
+ extent_map_set_compression(em, compress_type);
ret = btrfs_replace_extent_map_range(inode, em, true);
if (ret) {
@@ -7304,10 +7337,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
* just use the extent.
*
*/
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ if ((em->flags & EXTENT_FLAG_PREALLOC) ||
((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
em->block_start != EXTENT_MAP_HOLE)) {
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
type = BTRFS_ORDERED_PREALLOC;
else
type = BTRFS_ORDERED_NOCOW;
@@ -7542,7 +7575,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* to buffered IO. Don't blame me, this is the price we pay for using
* the generic code.
*/
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+ if (extent_map_is_compressed(em) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
/*
@@ -7638,7 +7671,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
* that, since we have locked only the parts we are performing I/O in.
*/
if ((em->block_start == EXTENT_MAP_HOLE) ||
- (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
+ ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) {
iomap->addr = IOMAP_NULL_ADDR;
iomap->type = IOMAP_HOLE;
} else {
@@ -7802,6 +7835,7 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
+ struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
int ret;
ret = fiemap_prep(inode, fieinfo, start, &len, 0);
@@ -7827,7 +7861,26 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return ret;
}
- return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
+ btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ /*
+ * We did an initial flush to avoid holding the inode's lock while
+ * triggering writeback and waiting for the completion of IO and ordered
+ * extents. Now after we locked the inode we do it again, because it's
+ * possible a new write may have happened in between those two steps.
+ */
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
+ ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
+ if (ret) {
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+ return ret;
+ }
+ }
+
+ ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
+ btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
+
+ return ret;
}
static int btrfs_writepages(struct address_space *mapping,
@@ -7851,13 +7904,14 @@ static void btrfs_readahead(struct readahead_control *rac)
static void wait_subpage_spinlock(struct page *page)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct folio *folio = page_folio(page);
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, page->mapping))
return;
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ subpage = folio_get_private(folio);
/*
* This may look insane as we just acquire the spinlock and release it,
@@ -7995,7 +8049,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
page_end);
ASSERT(range_end + 1 - cur < U32_MAX);
range_len = range_end + 1 - cur;
- if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
+ if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
/*
* If Ordered (Private2) is cleared, it means endio has
* already been executed for the range.
@@ -8004,7 +8058,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
*/
goto next;
}
- btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
+ btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
/*
* IO on this page will never be started, so we need to account
@@ -8074,7 +8128,7 @@ next:
* did something wrong.
*/
ASSERT(!folio_test_ordered(folio));
- btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
+ btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
if (!inode_evicting)
__btrfs_release_folio(folio, GFP_NOFS);
clear_page_extent_mapped(&folio->page);
@@ -8098,6 +8152,7 @@ next:
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
{
struct page *page = vmf->page;
+ struct folio *folio = page_folio(page);
struct inode *inode = file_inode(vmf->vma->vm_file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -8114,6 +8169,8 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
u64 page_end;
u64 end;
+ ASSERT(folio_order(folio) == 0);
+
reserved_space = PAGE_SIZE;
sb_start_pagefault(inode->i_sb);
@@ -8217,9 +8274,9 @@ again:
if (zero_start != PAGE_SIZE)
memzero_page(page, zero_start, PAGE_SIZE - zero_start);
- btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
- btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
- btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
+ btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
+ btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start);
+ btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start);
btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
@@ -8462,10 +8519,20 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_inode *ei;
struct inode *inode;
+ struct extent_io_tree *file_extent_tree = NULL;
+
+ /* Self tests may pass a NULL fs_info. */
+ if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) {
+ file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
+ if (!file_extent_tree)
+ return NULL;
+ }
ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
- if (!ei)
+ if (!ei) {
+ kfree(file_extent_tree);
return NULL;
+ }
ei->root = NULL;
ei->generation = 0;
@@ -8501,10 +8568,18 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
inode = &ei->vfs_inode;
extent_map_tree_init(&ei->extent_tree);
+
+ /* This io tree sets the valid inode. */
extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
ei->io_tree.inode = ei;
- extent_io_tree_init(fs_info, &ei->file_extent_tree,
- IO_TREE_INODE_FILE_EXTENT);
+
+ ei->file_extent_tree = file_extent_tree;
+ if (file_extent_tree) {
+ extent_io_tree_init(fs_info, ei->file_extent_tree,
+ IO_TREE_INODE_FILE_EXTENT);
+ /* Lockdep class is set only for the file extent tree. */
+ lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class);
+ }
mutex_init(&ei->log_mutex);
spin_lock_init(&ei->ordered_tree_lock);
ei->ordered_tree = RB_ROOT;
@@ -8521,12 +8596,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
void btrfs_test_destroy_inode(struct inode *inode)
{
btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
#endif
void btrfs_free_inode(struct inode *inode)
{
+ kfree(BTRFS_I(inode)->file_extent_tree);
kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
}
@@ -9632,7 +9709,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
em->block_len = ins.offset;
em->orig_block_len = ins.offset;
em->ram_bytes = ins.offset;
- set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
+ em->flags |= EXTENT_FLAG_PREALLOC;
em->generation = trans->transid;
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
@@ -9785,7 +9862,9 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
page = find_get_page(inode->vfs_inode.i_mapping, index);
ASSERT(page); /* Pages should be in the extent_io_tree */
- btrfs_page_set_writeback(fs_info, page, start, len);
+ /* This is for data, which doesn't yet support larger folio. */
+ ASSERT(folio_order(page_folio(page)) == 0);
+ btrfs_folio_set_writeback(fs_info, page_folio(page), start, len);
put_page(page);
index++;
}
@@ -9994,7 +10073,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
if (!pages)
return -ENOMEM;
- ret = btrfs_alloc_page_array(nr_pages, pages);
+ ret = btrfs_alloc_page_array(nr_pages, pages, 0);
if (ret) {
ret = -ENOMEM;
goto out;
@@ -10113,12 +10192,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->len = min_t(u64, extent_map_end(em),
inode->vfs_inode.i_size) - iocb->ki_pos;
if (em->block_start == EXTENT_MAP_HOLE ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
+ (em->flags & EXTENT_FLAG_PREALLOC)) {
disk_bytenr = EXTENT_MAP_HOLE;
count = min_t(u64, count, encoded->len);
encoded->len = count;
encoded->unencoded_len = count;
- } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ } else if (extent_map_is_compressed(em)) {
disk_bytenr = em->block_start;
/*
* Bail if the buffer isn't large enough to return the whole
@@ -10133,7 +10212,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
encoded->unencoded_len = em->ram_bytes;
encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
ret = btrfs_encoded_io_compression_from_extent(fs_info,
- em->compress_type);
+ extent_map_compression(em));
if (ret < 0)
goto out_em;
encoded->compression = ret;
@@ -10229,6 +10308,13 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
return -EINVAL;
+ /*
+ * Compressed extents should always have checksums, so error out if we
+ * have a NOCOW file or inode was created while mounted with NODATASUM.
+ */
+ if (inode->flags & BTRFS_INODE_NODATASUM)
+ return -EINVAL;
+
orig_count = iov_iter_count(from);
/* The extent size must be sane. */
@@ -10564,6 +10650,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_state *cached_state = NULL;
struct extent_map *em = NULL;
+ struct btrfs_chunk_map *map = NULL;
struct btrfs_device *device = NULL;
struct btrfs_swap_info bsi = {
.lowest_ppage = (sector_t)-1ULL,
@@ -10680,7 +10767,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
ret = -EINVAL;
goto out;
}
- if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+ if (extent_map_is_compressed(em)) {
btrfs_warn(fs_info, "swapfile must not be compressed");
ret = -EINVAL;
goto out;
@@ -10703,13 +10790,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
goto out;
}
- em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
goto out;
}
- if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
btrfs_warn(fs_info,
"swapfile must have single data profile");
ret = -EINVAL;
@@ -10717,23 +10804,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
}
if (device == NULL) {
- device = em->map_lookup->stripes[0].dev;
+ device = map->stripes[0].dev;
ret = btrfs_add_swapfile_pin(inode, device, false);
if (ret == 1)
ret = 0;
else if (ret)
goto out;
- } else if (device != em->map_lookup->stripes[0].dev) {
+ } else if (device != map->stripes[0].dev) {
btrfs_warn(fs_info, "swapfile must be on one device");
ret = -EINVAL;
goto out;
}
- physical_block_start = (em->map_lookup->stripes[0].physical +
- (logical_block_start - em->start));
- len = min(len, em->len - (logical_block_start - em->start));
- free_extent_map(em);
- em = NULL;
+ physical_block_start = (map->stripes[0].physical +
+ (logical_block_start - map->start));
+ len = min(len, map->chunk_len - (logical_block_start - map->start));
+ btrfs_free_chunk_map(map);
+ map = NULL;
bg = btrfs_lookup_block_group(fs_info, logical_block_start);
if (!bg) {
@@ -10786,6 +10873,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
out:
if (!IS_ERR_OR_NULL(em))
free_extent_map(em);
+ if (!IS_ERR_OR_NULL(map))
+ btrfs_free_chunk_map(map);
unlock_extent(io_tree, 0, isize - 1, &cached_state);
@@ -10930,7 +11019,7 @@ static const struct address_space_operations btrfs_aops = {
.release_folio = btrfs_release_folio,
.migrate_folio = btrfs_migrate_folio,
.dirty_folio = filemap_dirty_folio,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = btrfs_swap_activate,
.swap_deactivate = btrfs_swap_deactivate,
};
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a1743904202b..9d1eac15e09e 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -721,7 +721,7 @@ static noinline int create_subvol(struct mnt_idmap *idmap,
free_extent_buffer(leaf);
leaf = NULL;
- new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
+ new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev);
if (IS_ERR(new_root)) {
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
@@ -790,6 +790,9 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
return -EOPNOTSUPP;
}
+ if (btrfs_root_refs(&root->root_item) == 0)
+ return -ENOENT;
+
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return -EINVAL;
@@ -2608,6 +2611,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
ret = -EFAULT;
goto out;
}
+ if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
/* compression requires us to start the IO */
if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
@@ -3808,6 +3815,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
goto out;
}
+ if (sa->create && is_fstree(sa->qgroupid)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
trans = btrfs_join_transaction(root);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
@@ -4533,29 +4545,29 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool
if (ret < 0)
goto out_acct;
- file_start_write(file);
-
if (iov_iter_count(&iter) == 0) {
ret = 0;
- goto out_end_write;
+ goto out_iov;
}
pos = args.offset;
ret = rw_verify_area(WRITE, file, &pos, args.len);
if (ret < 0)
- goto out_end_write;
+ goto out_iov;
init_sync_kiocb(&kiocb, file);
ret = kiocb_set_rw_flags(&kiocb, 0);
if (ret)
- goto out_end_write;
+ goto out_iov;
kiocb.ki_pos = pos;
+ file_start_write(file);
+
ret = btrfs_do_write_iter(&kiocb, &iter, &args);
if (ret > 0)
fsnotify_modify(file);
-out_end_write:
file_end_write(file);
+out_iov:
kfree(iov);
out_acct:
if (ret > 0)
diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c
index 0fe0ae54ac67..fd88af17d8d9 100644
--- a/fs/btrfs/lru_cache.c
+++ b/fs/btrfs/lru_cache.c
@@ -9,7 +9,7 @@
*
* @cache: The cache.
* @max_size: Maximum size (number of entries) for the cache.
- * Use 0 for unlimited size, it's the user's responsability to
+ * Use 0 for unlimited size, it's the user's responsibility to
* trim the cache in that case.
*/
void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size)
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index d3fcfc628a4f..e43bc0fdc74e 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -152,7 +152,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
cur_page = out_pages[*cur_out / PAGE_SIZE];
/* Allocate a new page */
if (!cur_page) {
- cur_page = alloc_page(GFP_NOFS);
+ cur_page = btrfs_alloc_compr_page();
if (!cur_page)
return -ENOMEM;
out_pages[*cur_out / PAGE_SIZE] = cur_page;
@@ -178,7 +178,7 @@ static int copy_compressed_data_to_page(char *compressed_data,
cur_page = out_pages[*cur_out / PAGE_SIZE];
/* Allocate a new page */
if (!cur_page) {
- cur_page = alloc_page(GFP_NOFS);
+ cur_page = btrfs_alloc_compr_page();
if (!cur_page)
return -ENOMEM;
out_pages[*cur_out / PAGE_SIZE] = cur_page;
@@ -425,16 +425,16 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb)
}
int lzo_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
+ struct btrfs_fs_info *fs_info = btrfs_sb(dest_page->mapping->host->i_sb);
+ const u32 sectorsize = fs_info->sectorsize;
size_t in_len;
size_t out_len;
size_t max_segment_len = WORKSPACE_BUF_LENGTH;
int ret = 0;
- char *kaddr;
- unsigned long bytes;
if (srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)
return -EUCLEAN;
@@ -451,7 +451,7 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
}
data_in += LZO_LEN;
- out_len = PAGE_SIZE;
+ out_len = sectorsize;
ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
if (ret != LZO_E_OK) {
pr_warn("BTRFS: decompress failed!\n");
@@ -459,29 +459,13 @@ int lzo_decompress(struct list_head *ws, const u8 *data_in,
goto out;
}
- if (out_len < start_byte) {
+ ASSERT(out_len <= sectorsize);
+ memcpy_to_page(dest_page, dest_pgoff, workspace->buf, out_len);
+ /* Early end, considered as an error. */
+ if (unlikely(out_len < destlen)) {
ret = -EIO;
- goto out;
+ memzero_page(dest_page, dest_pgoff + out_len, destlen - out_len);
}
-
- /*
- * the caller is already checking against PAGE_SIZE, but lets
- * move this check closer to the memcpy/memset
- */
- destlen = min_t(unsigned long, destlen, PAGE_SIZE);
- bytes = min_t(unsigned long, destlen, out_len - start_byte);
-
- kaddr = kmap_local_page(dest_page);
- memcpy(kaddr, workspace->buf + start_byte, bytes);
-
- /*
- * btrfs_getblock is doing a zero on the tail of the page too,
- * but this will cover anything missing from the decompressed
- * data.
- */
- if (bytes < destlen)
- memset(kaddr+bytes, 0, destlen-bytes);
- kunmap_local(kaddr);
out:
return ret;
}
diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c
index b8f9c9e56c8c..cdada4865837 100644
--- a/fs/btrfs/messages.c
+++ b/fs/btrfs/messages.c
@@ -287,7 +287,7 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
* panic or BUGs, depending on mount options.
*/
__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int error, const char *fmt, ...)
{
char *s_id = "<unknown>";
diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h
index 4d04c1fa5899..08a9272399d2 100644
--- a/fs/btrfs/messages.h
+++ b/fs/btrfs/messages.h
@@ -194,7 +194,7 @@ const char * __attribute_const__ btrfs_decode_error(int error);
__printf(5, 6)
__cold
-void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
+void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int error, const char *fmt, ...);
/*
* If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a82e1417c4d2..59850dc17b22 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -323,9 +323,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered,
*
* If there's no such bit, we need to skip to next range.
*/
- if (!btrfs_page_test_ordered(fs_info, page, file_offset, len))
+ if (!btrfs_folio_test_ordered(fs_info, page_folio(page),
+ file_offset, len))
return false;
- btrfs_page_clear_ordered(fs_info, page, file_offset, len);
+ btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len);
}
/* Now we're fine to update the accounting. */
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 567a6d3d4712..127ef8bf0ffd 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -97,13 +97,6 @@ struct btrfs_ordered_extent {
u64 bytes_left;
/*
- * the end of the ordered extent which is behind it but
- * didn't update disk_i_size. Please see the comment of
- * btrfs_ordered_update_i_size();
- */
- u64 outstanding_isize;
-
- /*
* If we get truncated we need to adjust the file extent we enter for
* this ordered extent so that we do not expose stale data.
*/
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index e46774e8f49f..5470e1cdf10c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -194,7 +194,7 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info,
*
* Must be called with qgroup_lock held and @prealloc preallocated.
*
- * The control on the lifespan of @prealloc would be transfered to this
+ * The control on the lifespan of @prealloc would be transferred to this
* function, thus caller should no longer touch @prealloc.
*/
static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
@@ -1736,6 +1736,15 @@ out:
return ret;
}
+static bool qgroup_has_usage(struct btrfs_qgroup *qgroup)
+{
+ return (qgroup->rfer > 0 || qgroup->rfer_cmpr > 0 ||
+ qgroup->excl > 0 || qgroup->excl_cmpr > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] > 0 ||
+ qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS] > 0);
+}
+
int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@@ -1755,6 +1764,11 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
goto out;
}
+ if (is_fstree(qgroupid) && qgroup_has_usage(qgroup)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
/* Check if there are no children of this qgroup */
if (!list_empty(&qgroup->members)) {
ret = -EBUSY;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 3e014b9370a3..792c8e17c31d 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -964,7 +964,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
{
int ret;
- ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
+ ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0);
if (ret < 0)
return ret;
/* Mapping all sectors */
@@ -979,7 +979,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
int ret;
ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
- rbio->stripe_pages + data_pages);
+ rbio->stripe_pages + data_pages, 0);
if (ret < 0)
return ret;
@@ -1530,7 +1530,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
const int data_pages = rbio->nr_data * rbio->stripe_npages;
int ret;
- ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages);
+ ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0);
if (ret < 0)
return ret;
@@ -1549,7 +1549,6 @@ struct btrfs_plug_cb {
struct blk_plug_cb cb;
struct btrfs_fs_info *info;
struct list_head rbio_list;
- struct work_struct work;
};
/*
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 45e6ff78316f..470213688872 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -164,7 +164,7 @@ struct raid56_bio_trace_info {
u8 stripe_nr;
};
-static inline int nr_data_stripes(const struct map_lookup *map)
+static inline int nr_data_stripes(const struct btrfs_chunk_map *map)
{
return map->num_stripes - btrfs_nr_parity_stripes(map->type);
}
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 6486f0d7e993..8c4fc98ca9ce 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -889,8 +889,10 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info,
out_unlock:
spin_unlock(&fs_info->ref_verify_lock);
out:
- if (ret)
+ if (ret) {
+ btrfs_free_ref_cache(fs_info);
btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
+ }
return ret;
}
@@ -1021,8 +1023,8 @@ int btrfs_build_ref_tree(struct btrfs_fs_info *fs_info)
}
}
if (ret) {
- btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
btrfs_free_ref_cache(fs_info);
+ btrfs_clear_opt(fs_info->mount_opt, REF_VERIFY);
}
btrfs_free_path(path);
return ret;
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index f88b0c2ac3fe..ae90894dc7dc 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -141,9 +141,9 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
if (datal < block_size)
memzero_page(page, datal, block_size - datal);
- btrfs_page_set_uptodate(fs_info, page, file_offset, block_size);
- btrfs_page_clear_checked(fs_info, page, file_offset, block_size);
- btrfs_page_set_dirty(fs_info, page, file_offset, block_size);
+ btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size);
+ btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size);
+ btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size);
out_unlock:
if (page) {
unlock_page(page);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index f5d9e5f74a52..abe594f77f99 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2895,7 +2895,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
* will re-read the whole page anyway.
*/
if (page) {
- btrfs_subpage_clear_uptodate(fs_info, page, i_size,
+ btrfs_subpage_clear_uptodate(fs_info, page_folio(page), i_size,
round_up(i_size, PAGE_SIZE) - i_size);
unlock_page(page);
put_page(page);
@@ -2951,7 +2951,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
em->len = end + 1 - start;
em->block_len = em->len;
em->block_start = block_start;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags |= EXTENT_FLAG_PINNED;
lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
@@ -3070,7 +3070,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
clamped_len);
goto release_page;
}
- btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len);
+ btrfs_folio_set_dirty(fs_info, page_folio(page),
+ clamped_start, clamped_len);
/*
* Set the boundary if it's inside the page.
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f62a408671cb..0123d2728923 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -43,7 +43,7 @@ struct scrub_ctx;
/*
* The following value only influences the performance.
*
- * This detemines how many stripes would be submitted in one go,
+ * This determines how many stripes would be submitted in one go,
* which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP).
*/
#define SCRUB_STRIPES_PER_GROUP 8
@@ -192,7 +192,6 @@ struct scrub_ctx {
int cur_stripe;
atomic_t cancel_req;
int readonly;
- int sectors_per_bio;
/* State of IO submission throttling affecting the associated device */
ktime_t throttle_deadline;
@@ -262,7 +261,7 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info,
atomic_set(&stripe->pending_io, 0);
spin_lock_init(&stripe->write_error_lock);
- ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages);
+ ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, 0);
if (ret < 0)
goto error;
@@ -710,7 +709,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr)
/* Metadata, verify the full tree block. */
if (sector->is_metadata) {
/*
- * Check if the tree block crosses the stripe boudary. If
+ * Check if the tree block crosses the stripe boundary. If
* crossed the boundary, we cannot verify it but only give a
* warning.
*
@@ -884,7 +883,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx,
/*
* Init needed infos for error reporting.
*
- * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio()
+ * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio()
* thus no need for dev/physical, error reporting still needs dev and physical.
*/
if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) {
@@ -1099,12 +1098,22 @@ out:
static void scrub_read_endio(struct btrfs_bio *bbio)
{
struct scrub_stripe *stripe = bbio->private;
+ struct bio_vec *bvec;
+ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+ int num_sectors;
+ u32 bio_size = 0;
+ int i;
+
+ ASSERT(sector_nr < stripe->nr_sectors);
+ bio_for_each_bvec_all(bvec, &bbio->bio, i)
+ bio_size += bvec->bv_len;
+ num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
if (bbio->bio.bi_status) {
- bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
- bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
+ bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
+ bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
} else {
- bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+ bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
}
bio_put(&bbio->bio);
if (atomic_dec_and_test(&stripe->pending_io)) {
@@ -1280,7 +1289,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d
* return 0 if it is a data stripe, 1 means parity stripe.
*/
static int get_raid56_logic_offset(u64 physical, int num,
- struct map_lookup *map, u64 *offset,
+ struct btrfs_chunk_map *map, u64 *offset,
u64 *stripe_start)
{
int i;
@@ -1409,14 +1418,11 @@ search_forward:
if (ret > 0)
break;
next:
- path->slots[0]++;
- if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
- ret = btrfs_next_leaf(extent_root, path);
- if (ret) {
- /* Either no more item or fatal error */
- btrfs_release_path(path);
- return ret;
- }
+ ret = btrfs_next_item(extent_root, path);
+ if (ret) {
+ /* Either no more items or a fatal error. */
+ btrfs_release_path(path);
+ return ret;
}
}
btrfs_release_path(path);
@@ -1640,6 +1646,9 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = stripe->bg->fs_info;
struct btrfs_bio *bbio = NULL;
+ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+ stripe->bg->length - stripe->logical) >>
+ fs_info->sectorsize_bits;
u64 stripe_len = BTRFS_STRIPE_LEN;
int mirror = stripe->mirror_num;
int i;
@@ -1650,6 +1659,10 @@ static void scrub_submit_extent_sector_read(struct scrub_ctx *sctx,
struct page *page = scrub_stripe_get_page(stripe, i);
unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i);
+ /* We're beyond the chunk boundary, no need to read anymore. */
+ if (i >= nr_sectors)
+ break;
+
/* The current sector cannot be merged, submit the bio. */
if (bbio &&
((i > 0 &&
@@ -1705,6 +1718,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_bio *bbio;
+ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+ stripe->bg->length - stripe->logical) >>
+ fs_info->sectorsize_bits;
int mirror = stripe->mirror_num;
ASSERT(stripe->bg);
@@ -1719,14 +1735,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
scrub_read_endio, stripe);
- /* Read the whole stripe. */
bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
- for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
+ /* Read the whole range inside the chunk boundary. */
+ for (unsigned int cur = 0; cur < nr_sectors; cur++) {
+ struct page *page = scrub_stripe_get_page(stripe, cur);
+ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
int ret;
- ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
+ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
/* We should have allocated enough bio vectors. */
- ASSERT(ret == PAGE_SIZE);
+ ASSERT(ret == fs_info->sectorsize);
}
atomic_inc(&stripe->pending_io);
@@ -1816,7 +1834,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx)
if (sctx->is_dev_replace) {
/*
* For dev-replace, if we know there is something wrong with
- * metadata, we should immedately abort.
+ * metadata, we should immediately abort.
*/
for (int i = 0; i < nr_stripes; i++) {
if (stripe_has_metadata_error(&sctx->stripes[i])) {
@@ -1898,7 +1916,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *
static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
u64 full_stripe_start)
{
DECLARE_COMPLETION_ONSTACK(io_done);
@@ -2067,7 +2085,7 @@ out:
*/
static int scrub_simple_mirror(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
u64 logical_start, u64 logical_length,
struct btrfs_device *device,
u64 physical, int mirror_num)
@@ -2128,7 +2146,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
}
/* Calculate the full stripe length for simple stripe based profiles */
-static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
+static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map)
{
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));
@@ -2137,7 +2155,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
}
/* Get the logical bytenr for the stripe */
-static u64 simple_stripe_get_logical(struct map_lookup *map,
+static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map,
struct btrfs_block_group *bg,
int stripe_index)
{
@@ -2154,7 +2172,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map,
}
/* Get the mirror number for the stripe */
-static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
+static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index)
{
ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
BTRFS_BLOCK_GROUP_RAID10));
@@ -2166,7 +2184,7 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
static int scrub_simple_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct btrfs_device *device,
int stripe_index)
{
@@ -2199,18 +2217,17 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx,
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct btrfs_block_group *bg,
- struct extent_map *em,
+ struct btrfs_chunk_map *map,
struct btrfs_device *scrub_dev,
int stripe_index)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct map_lookup *map = em->map_lookup;
const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
const u64 chunk_logical = bg->start;
int ret;
int ret2;
u64 physical = map->stripes[stripe_index].physical;
- const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
+ const u64 dev_stripe_len = btrfs_calc_stripe_length(map);
const u64 physical_end = physical + dev_stripe_len;
u64 logical;
u64 logic_end;
@@ -2373,17 +2390,12 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
u64 dev_extent_len)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct map_lookup *map;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
int i;
int ret = 0;
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, bg->start, bg->length);
- read_unlock(&map_tree->lock);
-
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, bg->start, bg->length);
+ if (!map) {
/*
* Might have been an unused block group deleted by the cleaner
* kthread or relocation.
@@ -2395,22 +2407,21 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
return ret;
}
- if (em->start != bg->start)
+ if (map->start != bg->start)
goto out;
- if (em->len < dev_extent_len)
+ if (map->chunk_len < dev_extent_len)
goto out;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; ++i) {
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
- ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
+ ret = scrub_stripe(sctx, bg, map, scrub_dev, i);
if (ret)
goto out;
}
}
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 4e36550618e5..e48a063ef085 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6705,11 +6705,20 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
if (ret)
goto out;
}
- if (sctx->cur_inode_last_extent <
- sctx->cur_inode_size) {
- ret = send_hole(sctx, sctx->cur_inode_size);
- if (ret)
+ if (sctx->cur_inode_last_extent < sctx->cur_inode_size) {
+ ret = range_is_hole_in_parent(sctx,
+ sctx->cur_inode_last_extent,
+ sctx->cur_inode_size);
+ if (ret < 0) {
goto out;
+ } else if (ret == 0) {
+ ret = send_hole(sctx, sctx->cur_inode_size);
+ if (ret < 0)
+ goto out;
+ } else {
+ /* Range is already a hole, skip. */
+ ret = 0;
+ }
}
}
if (need_truncate) {
@@ -8111,7 +8120,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
}
if (arg->flags & ~BTRFS_SEND_FLAG_MASK) {
- ret = -EINVAL;
+ ret = -EOPNOTSUPP;
goto out;
}
@@ -8205,8 +8214,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
goto out;
}
- sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots),
- arg->clone_sources_count + 1,
+ sctx->clone_roots = kvcalloc(arg->clone_sources_count + 1,
+ sizeof(*sctx->clone_roots),
GFP_KERNEL);
if (!sctx->clone_roots) {
ret = -ENOMEM;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 571bb13587d5..3b54eb583474 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -856,7 +856,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info)
{
- u64 global_rsv_size = fs_info->global_block_rsv.reserved;
+ const u64 global_rsv_size = btrfs_block_rsv_reserved(&fs_info->global_block_rsv);
u64 ordered, delalloc;
u64 thresh;
u64 used;
@@ -956,8 +956,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
if (ordered >= delalloc)
- used += fs_info->delayed_refs_rsv.reserved +
- fs_info->delayed_block_rsv.reserved;
+ used += btrfs_block_rsv_reserved(&fs_info->delayed_refs_rsv) +
+ btrfs_block_rsv_reserved(&fs_info->delayed_block_rsv);
else
used += space_info->bytes_may_use - global_rsv_size;
@@ -1173,7 +1173,7 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
enum btrfs_flush_state flush;
u64 delalloc_size = 0;
u64 to_reclaim, block_rsv_size;
- u64 global_rsv_size = global_rsv->reserved;
+ const u64 global_rsv_size = btrfs_block_rsv_reserved(global_rsv);
loops++;
@@ -1185,9 +1185,9 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
* assume it's tied up in delalloc reservations.
*/
block_rsv_size = global_rsv_size +
- delayed_block_rsv->reserved +
- delayed_refs_rsv->reserved +
- trans_rsv->reserved;
+ btrfs_block_rsv_reserved(delayed_block_rsv) +
+ btrfs_block_rsv_reserved(delayed_refs_rsv) +
+ btrfs_block_rsv_reserved(trans_rsv);
if (block_rsv_size < space_info->bytes_may_use)
delalloc_size = space_info->bytes_may_use - block_rsv_size;
@@ -1207,16 +1207,16 @@ static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
to_reclaim = delalloc_size;
flush = FLUSH_DELALLOC;
} else if (space_info->bytes_pinned >
- (delayed_block_rsv->reserved +
- delayed_refs_rsv->reserved)) {
+ (btrfs_block_rsv_reserved(delayed_block_rsv) +
+ btrfs_block_rsv_reserved(delayed_refs_rsv))) {
to_reclaim = space_info->bytes_pinned;
flush = COMMIT_TRANS;
- } else if (delayed_block_rsv->reserved >
- delayed_refs_rsv->reserved) {
- to_reclaim = delayed_block_rsv->reserved;
+ } else if (btrfs_block_rsv_reserved(delayed_block_rsv) >
+ btrfs_block_rsv_reserved(delayed_refs_rsv)) {
+ to_reclaim = btrfs_block_rsv_reserved(delayed_block_rsv);
flush = FLUSH_DELAYED_ITEMS_NR;
} else {
- to_reclaim = delayed_refs_rsv->reserved;
+ to_reclaim = btrfs_block_rsv_reserved(delayed_refs_rsv);
flush = FLUSH_DELAYED_REFS_NR;
}
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index 1b999c6e4193..0e49dab8dad2 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -64,7 +64,7 @@
* This means a slightly higher tree locking latency.
*/
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping)
{
if (fs_info->sectorsize >= PAGE_SIZE)
return false;
@@ -74,8 +74,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
* mapping. And if page->mapping->host is data inode, it's subpage.
* As we have ruled our sectorsize >= PAGE_SIZE case already.
*/
- if (!page->mapping || !page->mapping->host ||
- is_data_inode(page->mapping->host))
+ if (!mapping || !mapping->host || is_data_inode(mapping->host))
return true;
/*
@@ -116,7 +115,7 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector
}
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page, enum btrfs_subpage_type type)
+ struct folio *folio, enum btrfs_subpage_type type)
{
struct btrfs_subpage *subpage;
@@ -124,31 +123,30 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
* We have cases like a dummy extent buffer page, which is not mapped
* and doesn't need to be locked.
*/
- if (page->mapping)
- ASSERT(PageLocked(page));
+ if (folio->mapping)
+ ASSERT(folio_test_locked(folio));
- /* Either not subpage, or the page already has private attached */
- if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page))
+ /* Either not subpage, or the folio already has private attached. */
+ if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio))
return 0;
subpage = btrfs_alloc_subpage(fs_info, type);
if (IS_ERR(subpage))
return PTR_ERR(subpage);
- attach_page_private(page, subpage);
+ folio_attach_private(folio, subpage);
return 0;
}
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- /* Either not subpage, or already detached */
- if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page))
+ /* Either not subpage, or the folio already has private attached. */
+ if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio))
return;
- subpage = detach_page_private(page);
+ subpage = folio_detach_private(folio);
ASSERT(subpage);
btrfs_free_subpage(subpage);
}
@@ -188,77 +186,78 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage)
* This is important for eb allocation, to prevent race with last eb freeing
* of the same page.
* With the eb_refs increased before the eb inserted into radix tree,
- * detach_extent_buffer_page() won't detach the page private while we're still
+ * detach_extent_buffer_page() won't detach the folio private while we're still
* allocating the extent buffer.
*/
-void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->mapping);
- lockdep_assert_held(&page->mapping->private_lock);
+ ASSERT(folio_test_private(folio) && folio->mapping);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
atomic_inc(&subpage->eb_refs);
}
-void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
struct btrfs_subpage *subpage;
- if (!btrfs_is_subpage(fs_info, page))
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->mapping);
- lockdep_assert_held(&page->mapping->private_lock);
+ ASSERT(folio_test_private(folio) && folio->mapping);
+ lockdep_assert_held(&folio->mapping->i_private_lock);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
ASSERT(atomic_read(&subpage->eb_refs));
atomic_dec(&subpage->eb_refs);
}
static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
+ /* For subpage support, the folio must be single page. */
+ ASSERT(folio_order(folio) == 0);
+
/* Basic checks */
- ASSERT(PagePrivate(page) && page->private);
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(len, fs_info->sectorsize));
/*
* The range check only works for mapped page, we can still have
* unmapped page like dummy extent buffer pages.
*/
- if (page->mapping)
- ASSERT(page_offset(page) <= start &&
- start + len <= page_offset(page) + PAGE_SIZE);
+ if (folio->mapping)
+ ASSERT(folio_pos(folio) <= start &&
+ start + len <= folio_pos(folio) + PAGE_SIZE);
}
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
const int nbits = len >> fs_info->sectorsize_bits;
- btrfs_subpage_assert(fs_info, page, start, len);
+ btrfs_subpage_assert(fs_info, folio, start, len);
atomic_add(nbits, &subpage->readers);
}
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
const int nbits = len >> fs_info->sectorsize_bits;
bool is_data;
bool last;
- btrfs_subpage_assert(fs_info, page, start, len);
- is_data = is_data_inode(page->mapping->host);
+ btrfs_subpage_assert(fs_info, folio, start, len);
+ is_data = is_data_inode(folio->mapping->host);
ASSERT(atomic_read(&subpage->readers) >= nbits);
last = atomic_sub_and_test(nbits, &subpage->readers);
@@ -270,35 +269,35 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
* As we want the atomic_sub_and_test() to be always executed.
*/
if (is_data && last)
- unlock_page(page);
+ folio_unlock(folio);
}
-static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len)
+static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len)
{
u64 orig_start = *start;
u32 orig_len = *len;
- *start = max_t(u64, page_offset(page), orig_start);
+ *start = max_t(u64, folio_pos(folio), orig_start);
/*
* For certain call sites like btrfs_drop_pages(), we may have pages
* beyond the target range. In that case, just set @len to 0, subpage
* helpers can handle @len == 0 without any problem.
*/
- if (page_offset(page) >= orig_start + orig_len)
+ if (folio_pos(folio) >= orig_start + orig_len)
*len = 0;
else
- *len = min_t(u64, page_offset(page) + PAGE_SIZE,
+ *len = min_t(u64, folio_pos(folio) + PAGE_SIZE,
orig_start + orig_len) - *start;
}
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
const int nbits = (len >> fs_info->sectorsize_bits);
int ret;
- btrfs_subpage_assert(fs_info, page, start, len);
+ btrfs_subpage_assert(fs_info, folio, start, len);
ASSERT(atomic_read(&subpage->readers) == 0);
ret = atomic_add_return(nbits, &subpage->writers);
@@ -306,12 +305,12 @@ void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
}
bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
const int nbits = (len >> fs_info->sectorsize_bits);
- btrfs_subpage_assert(fs_info, page, start, len);
+ btrfs_subpage_assert(fs_info, folio, start, len);
/*
* We have call sites passing @lock_page into
@@ -328,7 +327,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
}
/*
- * Lock a page for delalloc page writeback.
+ * Lock a folio for delalloc page writeback.
*
* Return -EAGAIN if the page is not properly initialized.
* Return 0 with the page locked, and writer counter updated.
@@ -337,38 +336,40 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
* it's really the correct page, as the caller is using
* filemap_get_folios_contig(), which can race with page invalidating.
*/
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {
- lock_page(page);
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_lock(folio);
return 0;
}
- lock_page(page);
- if (!PagePrivate(page) || !page->private) {
- unlock_page(page);
+ folio_lock(folio);
+ if (!folio_test_private(folio) || !folio_get_private(folio)) {
+ folio_unlock(folio);
return -EAGAIN;
}
- btrfs_subpage_clamp_range(page, &start, &len);
- btrfs_subpage_start_writer(fs_info, page, start, len);
+ btrfs_subpage_clamp_range(folio, &start, &len);
+ btrfs_subpage_start_writer(fs_info, folio, start, len);
return 0;
}
-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))
- return unlock_page(page);
- btrfs_subpage_clamp_range(page, &start, &len);
- if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
- unlock_page(page);
+ if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_unlock(folio);
+ return;
+ }
+ btrfs_subpage_clamp_range(folio, &start, &len);
+ if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len))
+ folio_unlock(folio);
}
-#define subpage_calc_start_bit(fs_info, page, name, start, len) \
+#define subpage_calc_start_bit(fs_info, folio, name, start, len) \
({ \
unsigned int start_bit; \
\
- btrfs_subpage_assert(fs_info, page, start, len); \
+ btrfs_subpage_assert(fs_info, folio, start, len); \
start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \
start_bit += fs_info->subpage_info->name##_offset; \
start_bit; \
@@ -385,46 +386,46 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
fs_info->subpage_info->bitmap_nr_bits)
void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate))
- SetPageUptodate(page);
+ folio_mark_uptodate(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
uptodate, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- ClearPageUptodate(page);
+ folio_clear_uptodate(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
spin_unlock_irqrestore(&subpage->lock, flags);
- set_page_dirty(page);
+ folio_mark_dirty(folio);
}
/*
@@ -438,10 +439,10 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
* extra handling for tree blocks.
*/
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
dirty, start, len);
unsigned long flags;
bool last = false;
@@ -455,101 +456,102 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
}
void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
bool last;
- last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, start, len);
if (last)
- clear_page_dirty_for_io(page);
+ folio_clear_dirty_for_io(folio);
}
void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- set_page_writeback(page);
+ if (!folio_test_writeback(folio))
+ folio_start_writeback(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
writeback, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) {
- ASSERT(PageWriteback(page));
- end_page_writeback(page);
+ ASSERT(folio_test_writeback(folio));
+ folio_end_writeback(folio);
}
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- SetPageOrdered(page);
+ folio_set_ordered(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
ordered, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered))
- ClearPageOrdered(page);
+ folio_clear_ordered(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
if (subpage_test_bitmap_all_set(fs_info, subpage, checked))
- SetPageChecked(page);
+ folio_set_checked(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page,
+ struct btrfs_subpage *subpage = folio_get_private(folio);
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio,
checked, start, len);
unsigned long flags;
spin_lock_irqsave(&subpage->lock, flags);
bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
- ClearPageChecked(page);
+ folio_clear_checked(folio);
spin_unlock_irqrestore(&subpage->lock, flags);
}
@@ -559,10 +561,10 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info,
*/
#define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+ struct folio *folio, u64 start, u32 len) \
{ \
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \
- unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \
+ struct btrfs_subpage *subpage = folio_get_private(folio); \
+ unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \
name, start, len); \
unsigned long flags; \
bool ret; \
@@ -584,88 +586,94 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
* in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall
* back to regular sectorsize branch.
*/
-#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \
- test_page_func) \
-void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+#define IMPLEMENT_BTRFS_PAGE_OPS(name, folio_set_func, \
+ folio_clear_func, folio_test_func) \
+void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- set_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_set_func(folio); \
return; \
} \
- btrfs_subpage_set_##name(fs_info, page, start, len); \
+ btrfs_subpage_set_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- clear_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_clear_func(folio); \
return; \
} \
- btrfs_subpage_clear_##name(fs_info, page, start, len); \
+ btrfs_subpage_clear_##name(fs_info, folio, start, len); \
} \
-bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
- return test_page_func(page); \
- return btrfs_subpage_test_##name(fs_info, page, start, len); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) \
+ return folio_test_func(folio); \
+ return btrfs_subpage_test_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- set_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_set_func(folio); \
return; \
} \
- btrfs_subpage_clamp_range(page, &start, &len); \
- btrfs_subpage_set_##name(fs_info, page, start, len); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ btrfs_subpage_set_##name(fs_info, folio, start, len); \
} \
-void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
- clear_page_func(page); \
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) { \
+ folio_clear_func(folio); \
return; \
} \
- btrfs_subpage_clamp_range(page, &start, &len); \
- btrfs_subpage_clear_##name(fs_info, page, start, len); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ btrfs_subpage_clear_##name(fs_info, folio, start, len); \
} \
-bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len) \
+bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len) \
{ \
- if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
- return test_page_func(page); \
- btrfs_subpage_clamp_range(page, &start, &len); \
- return btrfs_subpage_test_##name(fs_info, page, start, len); \
-}
-IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
- PageUptodate);
-IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
- PageDirty);
-IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
- PageWriteback);
-IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered,
- PageOrdered);
-IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked);
+ if (unlikely(!fs_info) || \
+ !btrfs_is_subpage(fs_info, folio->mapping)) \
+ return folio_test_func(folio); \
+ btrfs_subpage_clamp_range(folio, &start, &len); \
+ return btrfs_subpage_test_##name(fs_info, folio, start, len); \
+}
+IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate,
+ folio_test_uptodate);
+IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io,
+ folio_test_dirty);
+IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback,
+ folio_test_writeback);
+IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered,
+ folio_test_ordered);
+IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked,
+ folio_test_checked);
/*
* Make sure not only the page dirty bit is cleared, but also subpage dirty bit
* is cleared.
*/
-void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page)
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio)
{
- struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct btrfs_subpage *subpage = folio_get_private(folio);
if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
return;
- ASSERT(!PageDirty(page));
- if (!btrfs_is_subpage(fs_info, page))
+ ASSERT(!folio_test_dirty(folio));
+ if (!btrfs_is_subpage(fs_info, folio->mapping))
return;
- ASSERT(PagePrivate(page) && page->private);
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty));
}
@@ -684,18 +692,20 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
* extent_write_locked_range().
* In this case, we have to call subpage helper to handle the case.
*/
-void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
- u64 start, u32 len)
+void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage *subpage;
- ASSERT(PageLocked(page));
+ ASSERT(folio_test_locked(folio));
/* For non-subpage case, we just unlock the page */
- if (!btrfs_is_subpage(fs_info, page))
- return unlock_page(page);
+ if (!btrfs_is_subpage(fs_info, folio->mapping)) {
+ folio_unlock(folio);
+ return;
+ }
- ASSERT(PagePrivate(page) && page->private);
- subpage = (struct btrfs_subpage *)page->private;
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
+ subpage = folio_get_private(folio);
/*
* For subpage case, there are two types of locked page. With or
@@ -704,12 +714,14 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
* Since we own the page lock, no one else could touch subpage::writers
* and we are safe to do several atomic operations without spinlock.
*/
- if (atomic_read(&subpage->writers) == 0)
+ if (atomic_read(&subpage->writers) == 0) {
/* No writers, locked by plain lock_page() */
- return unlock_page(page);
+ folio_unlock(folio);
+ return;
+ }
/* Have writers, use proper subpage helper to end it */
- btrfs_page_end_writer_lock(fs_info, page, start, len);
+ btrfs_folio_end_writer_lock(fs_info, folio, start, len);
}
#define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \
@@ -717,7 +729,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
subpage_info->name##_offset, subpage_info->bitmap_nr_bits)
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len)
+ struct folio *folio, u64 start, u32 len)
{
struct btrfs_subpage_info *subpage_info = fs_info->subpage_info;
struct btrfs_subpage *subpage;
@@ -729,9 +741,9 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
unsigned long checked_bitmap;
unsigned long flags;
- ASSERT(PagePrivate(page) && page->private);
+ ASSERT(folio_test_private(folio) && folio_get_private(folio));
ASSERT(subpage_info);
- subpage = (struct btrfs_subpage *)page->private;
+ subpage = folio_get_private(folio);
spin_lock_irqsave(&subpage->lock, flags);
GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap);
@@ -741,10 +753,10 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap);
spin_unlock_irqrestore(&subpage->lock, flags);
- dump_page(page, "btrfs subpage dump");
+ dump_page(folio_page(folio, 0), "btrfs subpage dump");
btrfs_warn(fs_info,
"start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl",
- start, len, page_offset(page),
+ start, len, folio_pos(folio),
subpage_info->bitmap_nr_bits, &uptodate_bitmap,
subpage_info->bitmap_nr_bits, &error_bitmap,
subpage_info->bitmap_nr_bits, &dirty_bitmap,
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index 5cbf67ccbdeb..793c2b314a58 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -73,71 +73,68 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
-bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
+bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping);
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page, enum btrfs_subpage_type type);
-void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
- struct page *page);
+ struct folio *folio, enum btrfs_subpage_type type);
+void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio);
/* Allocate additional data where page represents more than one sector */
struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info,
enum btrfs_subpage_type type);
void btrfs_free_subpage(struct btrfs_subpage *subpage);
-void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page);
-void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
- struct page *page);
+void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio);
void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
-void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
+int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
+void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
/*
* Template for subpage related operations.
*
- * btrfs_subpage_*() are for call sites where the page has subpage attached and
- * the range is ensured to be inside the page.
+ * btrfs_subpage_*() are for call sites where the folio has subpage attached and
+ * the range is ensured to be inside the folio's single page.
*
- * btrfs_page_*() are for call sites where the page can either be subpage
- * specific or regular page. The function will handle both cases.
- * But the range still needs to be inside the page.
+ * btrfs_folio_*() are for call sites where the page can either be subpage
+ * specific or regular folios. The function will handle both cases.
+ * But the range still needs to be inside one single page.
*
- * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't
+ * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't
* need to be inside the page. Those functions will truncate the range
* automatically.
*/
#define DECLARE_BTRFS_SUBPAGE_OPS(name) \
void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
+ struct folio *folio, u64 start, u32 len); \
void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
+ struct folio *folio, u64 start, u32 len); \
bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len); \
-bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len); \
+bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
+ struct folio *folio, u64 start, u32 len);
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(dirty);
@@ -146,13 +143,12 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered);
DECLARE_BTRFS_SUBPAGE_OPS(checked);
bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
-void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
- struct page *page);
-void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
- u64 start, u32 len);
+void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio);
+void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info,
+ struct folio *folio, u64 start, u32 len);
void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info,
- struct page *page, u64 start, u32 len);
+ struct folio *folio, u64 start, u32 len);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index ef256b944c72..101f786963d4 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -27,6 +27,7 @@
#include <linux/crc32c.h>
#include <linux/btrfs.h>
#include <linux/security.h>
+#include <linux/fs_parser.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
@@ -64,19 +65,7 @@
#include <trace/events/btrfs.h>
static const struct super_operations btrfs_super_ops;
-
-/*
- * Types for mounting the default subvolume and a subvolume explicitly
- * requested by subvol=/path. That way the callchain is straightforward and we
- * don't have to play tricks with the mount options and recursive calls to
- * btrfs_mount.
- *
- * The new btrfs_root_fs_type also servers as a tag for the bdev_holder.
- */
static struct file_system_type btrfs_fs_type;
-static struct file_system_type btrfs_root_fs_type;
-
-static int btrfs_remount(struct super_block *sb, int *flags, char *data);
static void btrfs_put_super(struct super_block *sb)
{
@@ -86,8 +75,22 @@ static void btrfs_put_super(struct super_block *sb)
close_ctree(fs_info);
}
+/* Store the mount options related information. */
+struct btrfs_fs_context {
+ char *subvol_name;
+ u64 subvol_objectid;
+ u64 max_inline;
+ u32 commit_interval;
+ u32 metadata_ratio;
+ u32 thread_pool_size;
+ unsigned long mount_opt;
+ unsigned long compress_type:4;
+ unsigned int compress_level;
+ refcount_t refs;
+};
+
enum {
- Opt_acl, Opt_noacl,
+ Opt_acl,
Opt_clear_cache,
Opt_commit_interval,
Opt_compress,
@@ -97,27 +100,26 @@ enum {
Opt_degraded,
Opt_device,
Opt_fatal_errors,
- Opt_flushoncommit, Opt_noflushoncommit,
+ Opt_flushoncommit,
Opt_max_inline,
- Opt_barrier, Opt_nobarrier,
- Opt_datacow, Opt_nodatacow,
- Opt_datasum, Opt_nodatasum,
- Opt_defrag, Opt_nodefrag,
- Opt_discard, Opt_nodiscard,
+ Opt_barrier,
+ Opt_datacow,
+ Opt_datasum,
+ Opt_defrag,
+ Opt_discard,
Opt_discard_mode,
- Opt_norecovery,
Opt_ratio,
Opt_rescan_uuid_tree,
Opt_skip_balance,
- Opt_space_cache, Opt_no_space_cache,
+ Opt_space_cache,
Opt_space_cache_version,
- Opt_ssd, Opt_nossd,
- Opt_ssd_spread, Opt_nossd_spread,
+ Opt_ssd,
+ Opt_ssd_spread,
Opt_subvol,
Opt_subvol_empty,
Opt_subvolid,
Opt_thread_pool,
- Opt_treelog, Opt_notreelog,
+ Opt_treelog,
Opt_user_subvol_rm_allowed,
/* Rescue options */
@@ -128,14 +130,10 @@ enum {
Opt_ignoredatacsums,
Opt_rescue_all,
- /* Deprecated options */
- Opt_recovery,
- Opt_inode_cache, Opt_noinode_cache,
-
/* Debugging options */
- Opt_enospc_debug, Opt_noenospc_debug,
+ Opt_enospc_debug,
#ifdef CONFIG_BTRFS_DEBUG
- Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
+ Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all,
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
Opt_ref_verify,
@@ -143,785 +141,611 @@ enum {
Opt_err,
};
-static const match_table_t tokens = {
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_clear_cache, "clear_cache"},
- {Opt_commit_interval, "commit=%u"},
- {Opt_compress, "compress"},
- {Opt_compress_type, "compress=%s"},
- {Opt_compress_force, "compress-force"},
- {Opt_compress_force_type, "compress-force=%s"},
- {Opt_degraded, "degraded"},
- {Opt_device, "device=%s"},
- {Opt_fatal_errors, "fatal_errors=%s"},
- {Opt_flushoncommit, "flushoncommit"},
- {Opt_noflushoncommit, "noflushoncommit"},
- {Opt_inode_cache, "inode_cache"},
- {Opt_noinode_cache, "noinode_cache"},
- {Opt_max_inline, "max_inline=%s"},
- {Opt_barrier, "barrier"},
- {Opt_nobarrier, "nobarrier"},
- {Opt_datacow, "datacow"},
- {Opt_nodatacow, "nodatacow"},
- {Opt_datasum, "datasum"},
- {Opt_nodatasum, "nodatasum"},
- {Opt_defrag, "autodefrag"},
- {Opt_nodefrag, "noautodefrag"},
- {Opt_discard, "discard"},
- {Opt_discard_mode, "discard=%s"},
- {Opt_nodiscard, "nodiscard"},
- {Opt_norecovery, "norecovery"},
- {Opt_ratio, "metadata_ratio=%u"},
- {Opt_rescan_uuid_tree, "rescan_uuid_tree"},
- {Opt_skip_balance, "skip_balance"},
- {Opt_space_cache, "space_cache"},
- {Opt_no_space_cache, "nospace_cache"},
- {Opt_space_cache_version, "space_cache=%s"},
- {Opt_ssd, "ssd"},
- {Opt_nossd, "nossd"},
- {Opt_ssd_spread, "ssd_spread"},
- {Opt_nossd_spread, "nossd_spread"},
- {Opt_subvol, "subvol=%s"},
- {Opt_subvol_empty, "subvol="},
- {Opt_subvolid, "subvolid=%s"},
- {Opt_thread_pool, "thread_pool=%u"},
- {Opt_treelog, "treelog"},
- {Opt_notreelog, "notreelog"},
- {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
+enum {
+ Opt_fatal_errors_panic,
+ Opt_fatal_errors_bug,
+};
- /* Rescue options */
- {Opt_rescue, "rescue=%s"},
+static const struct constant_table btrfs_parameter_fatal_errors[] = {
+ { "panic", Opt_fatal_errors_panic },
+ { "bug", Opt_fatal_errors_bug },
+ {}
+};
+
+enum {
+ Opt_discard_sync,
+ Opt_discard_async,
+};
+
+static const struct constant_table btrfs_parameter_discard[] = {
+ { "sync", Opt_discard_sync },
+ { "async", Opt_discard_async },
+ {}
+};
+
+enum {
+ Opt_space_cache_v1,
+ Opt_space_cache_v2,
+};
+
+static const struct constant_table btrfs_parameter_space_cache[] = {
+ { "v1", Opt_space_cache_v1 },
+ { "v2", Opt_space_cache_v2 },
+ {}
+};
+
+enum {
+ Opt_rescue_usebackuproot,
+ Opt_rescue_nologreplay,
+ Opt_rescue_ignorebadroots,
+ Opt_rescue_ignoredatacsums,
+ Opt_rescue_parameter_all,
+};
+
+static const struct constant_table btrfs_parameter_rescue[] = {
+ { "usebackuproot", Opt_rescue_usebackuproot },
+ { "nologreplay", Opt_rescue_nologreplay },
+ { "ignorebadroots", Opt_rescue_ignorebadroots },
+ { "ibadroots", Opt_rescue_ignorebadroots },
+ { "ignoredatacsums", Opt_rescue_ignoredatacsums },
+ { "idatacsums", Opt_rescue_ignoredatacsums },
+ { "all", Opt_rescue_parameter_all },
+ {}
+};
+
+#ifdef CONFIG_BTRFS_DEBUG
+enum {
+ Opt_fragment_parameter_data,
+ Opt_fragment_parameter_metadata,
+ Opt_fragment_parameter_all,
+};
+
+static const struct constant_table btrfs_parameter_fragment[] = {
+ { "data", Opt_fragment_parameter_data },
+ { "metadata", Opt_fragment_parameter_metadata },
+ { "all", Opt_fragment_parameter_all },
+ {}
+};
+#endif
+
+static const struct fs_parameter_spec btrfs_fs_parameters[] = {
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_flag_no("autodefrag", Opt_defrag),
+ fsparam_flag_no("barrier", Opt_barrier),
+ fsparam_flag("clear_cache", Opt_clear_cache),
+ fsparam_u32("commit", Opt_commit_interval),
+ fsparam_flag("compress", Opt_compress),
+ fsparam_string("compress", Opt_compress_type),
+ fsparam_flag("compress-force", Opt_compress_force),
+ fsparam_string("compress-force", Opt_compress_force_type),
+ fsparam_flag_no("datacow", Opt_datacow),
+ fsparam_flag_no("datasum", Opt_datasum),
+ fsparam_flag("degraded", Opt_degraded),
+ fsparam_string("device", Opt_device),
+ fsparam_flag_no("discard", Opt_discard),
+ fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard),
+ fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors),
+ fsparam_flag_no("flushoncommit", Opt_flushoncommit),
+ fsparam_string("max_inline", Opt_max_inline),
+ fsparam_u32("metadata_ratio", Opt_ratio),
+ fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree),
+ fsparam_flag("skip_balance", Opt_skip_balance),
+ fsparam_flag_no("space_cache", Opt_space_cache),
+ fsparam_enum("space_cache", Opt_space_cache_version, btrfs_parameter_space_cache),
+ fsparam_flag_no("ssd", Opt_ssd),
+ fsparam_flag_no("ssd_spread", Opt_ssd_spread),
+ fsparam_string("subvol", Opt_subvol),
+ fsparam_flag("subvol=", Opt_subvol_empty),
+ fsparam_u64("subvolid", Opt_subvolid),
+ fsparam_u32("thread_pool", Opt_thread_pool),
+ fsparam_flag_no("treelog", Opt_treelog),
+ fsparam_flag("user_subvol_rm_allowed", Opt_user_subvol_rm_allowed),
+
+ /* Rescue options. */
+ fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue),
/* Deprecated, with alias rescue=nologreplay */
- {Opt_nologreplay, "nologreplay"},
+ __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL),
/* Deprecated, with alias rescue=usebackuproot */
- {Opt_usebackuproot, "usebackuproot"},
+ __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL),
- /* Deprecated options */
- {Opt_recovery, "recovery"},
-
- /* Debugging options */
- {Opt_enospc_debug, "enospc_debug"},
- {Opt_noenospc_debug, "noenospc_debug"},
+ /* Debugging options. */
+ fsparam_flag_no("enospc_debug", Opt_enospc_debug),
#ifdef CONFIG_BTRFS_DEBUG
- {Opt_fragment_data, "fragment=data"},
- {Opt_fragment_metadata, "fragment=metadata"},
- {Opt_fragment_all, "fragment=all"},
+ fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment),
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- {Opt_ref_verify, "ref_verify"},
+ fsparam_flag("ref_verify", Opt_ref_verify),
#endif
- {Opt_err, NULL},
+ {}
};
-static const match_table_t rescue_tokens = {
- {Opt_usebackuproot, "usebackuproot"},
- {Opt_nologreplay, "nologreplay"},
- {Opt_ignorebadroots, "ignorebadroots"},
- {Opt_ignorebadroots, "ibadroots"},
- {Opt_ignoredatacsums, "ignoredatacsums"},
- {Opt_ignoredatacsums, "idatacsums"},
- {Opt_rescue_all, "all"},
- {Opt_err, NULL},
-};
-
-static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt,
- const char *opt_name)
+/* No support for restricting writes to btrfs devices yet... */
+static inline blk_mode_t btrfs_open_mode(struct fs_context *fc)
{
- if (fs_info->mount_opt & opt) {
- btrfs_err(fs_info, "%s must be used with ro mount option",
- opt_name);
- return true;
- }
- return false;
+ return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES;
}
-static int parse_rescue_options(struct btrfs_fs_info *info, const char *options)
+static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
- char *opts;
- char *orig;
- char *p;
- substring_t args[MAX_OPT_ARGS];
- int ret = 0;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ int opt;
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ opt = fs_parse(fc, btrfs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
- while ((p = strsep(&opts, ":")) != NULL) {
- int token;
+ switch (opt) {
+ case Opt_degraded:
+ btrfs_set_opt(ctx->mount_opt, DEGRADED);
+ break;
+ case Opt_subvol_empty:
+ /*
+ * This exists because we used to allow it on accident, so we're
+ * keeping it to maintain ABI. See 37becec95ac3 ("Btrfs: allow
+ * empty subvol= again").
+ */
+ break;
+ case Opt_subvol:
+ kfree(ctx->subvol_name);
+ ctx->subvol_name = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->subvol_name)
+ return -ENOMEM;
+ break;
+ case Opt_subvolid:
+ ctx->subvol_objectid = result.uint_64;
- if (!*p)
- continue;
- token = match_token(p, rescue_tokens, args);
- switch (token){
- case Opt_usebackuproot:
- btrfs_info(info,
- "trying to use backup root at mount time");
- btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
- break;
- case Opt_nologreplay:
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_ignorebadroots:
- btrfs_set_and_info(info, IGNOREBADROOTS,
- "ignoring bad roots");
- break;
- case Opt_ignoredatacsums:
- btrfs_set_and_info(info, IGNOREDATACSUMS,
- "ignoring data csums");
- break;
- case Opt_rescue_all:
- btrfs_info(info, "enabling all of the rescue options");
- btrfs_set_and_info(info, IGNOREDATACSUMS,
- "ignoring data csums");
- btrfs_set_and_info(info, IGNOREBADROOTS,
- "ignoring bad roots");
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_err:
- btrfs_info(info, "unrecognized rescue option '%s'", p);
- ret = -EINVAL;
- goto out;
- default:
- break;
- }
+ /* subvolid=0 means give me the original fs_tree. */
+ if (!ctx->subvol_objectid)
+ ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID;
+ break;
+ case Opt_device: {
+ struct btrfs_device *device;
+ blk_mode_t mode = btrfs_open_mode(fc);
+ mutex_lock(&uuid_mutex);
+ device = btrfs_scan_one_device(param->string, mode, false);
+ mutex_unlock(&uuid_mutex);
+ if (IS_ERR(device))
+ return PTR_ERR(device);
+ break;
}
-out:
- kfree(orig);
- return ret;
-}
-
-/*
- * Regular mount options parser. Everything that is needed only when
- * reading in a new superblock is parsed here.
- * XXX JDM: This needs to be cleaned up for remount.
- */
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
- unsigned long new_flags)
-{
- substring_t args[MAX_OPT_ARGS];
- char *p, *num;
- int intarg;
- int ret = 0;
- char *compress_type;
- bool compress_force = false;
- enum btrfs_compression_type saved_compress_type;
- int saved_compress_level;
- bool saved_compress_force;
- int no_compress = 0;
- const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state);
-
- if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE))
- btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
- else if (btrfs_free_space_cache_v1_active(info)) {
- if (btrfs_is_zoned(info)) {
- btrfs_info(info,
- "zoned: clearing existing space cache");
- btrfs_set_super_cache_generation(info->super_copy, 0);
+ case Opt_datasum:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NODATASUM);
} else {
- btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
}
- }
-
- /*
- * Even the options are empty, we still need to do extra check
- * against new flags
- */
- if (!options)
- goto check;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_degraded:
- btrfs_info(info, "allowing degraded mounts");
- btrfs_set_opt(info->mount_opt, DEGRADED);
- break;
- case Opt_subvol:
- case Opt_subvol_empty:
- case Opt_subvolid:
- case Opt_device:
- /*
- * These are parsed by btrfs_parse_subvol_options or
- * btrfs_parse_device_options and can be ignored here.
- */
- break;
- case Opt_nodatasum:
- btrfs_set_and_info(info, NODATASUM,
- "setting nodatasum");
- break;
- case Opt_datasum:
- if (btrfs_test_opt(info, NODATASUM)) {
- if (btrfs_test_opt(info, NODATACOW))
- btrfs_info(info,
- "setting datasum, datacow enabled");
- else
- btrfs_info(info, "setting datasum");
- }
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- break;
- case Opt_nodatacow:
- if (!btrfs_test_opt(info, NODATACOW)) {
- if (!btrfs_test_opt(info, COMPRESS) ||
- !btrfs_test_opt(info, FORCE_COMPRESS)) {
- btrfs_info(info,
- "setting nodatacow, compression disabled");
- } else {
- btrfs_info(info, "setting nodatacow");
- }
- }
- btrfs_clear_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- btrfs_set_opt(info->mount_opt, NODATACOW);
- btrfs_set_opt(info->mount_opt, NODATASUM);
- break;
- case Opt_datacow:
- btrfs_clear_and_info(info, NODATACOW,
- "setting datacow");
- break;
- case Opt_compress_force:
- case Opt_compress_force_type:
- compress_force = true;
- fallthrough;
- case Opt_compress:
- case Opt_compress_type:
- saved_compress_type = btrfs_test_opt(info,
- COMPRESS) ?
- info->compress_type : BTRFS_COMPRESS_NONE;
- saved_compress_force =
- btrfs_test_opt(info, FORCE_COMPRESS);
- saved_compress_level = info->compress_level;
- if (token == Opt_compress ||
- token == Opt_compress_force ||
- strncmp(args[0].from, "zlib", 4) == 0) {
- compress_type = "zlib";
-
- info->compress_type = BTRFS_COMPRESS_ZLIB;
- info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
- /*
- * args[0] contains uninitialized data since
- * for these tokens we don't expect any
- * parameter.
- */
- if (token != Opt_compress &&
- token != Opt_compress_force)
- info->compress_level =
- btrfs_compress_str2level(
- BTRFS_COMPRESS_ZLIB,
- args[0].from + 4);
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- no_compress = 0;
- } else if (strncmp(args[0].from, "lzo", 3) == 0) {
- compress_type = "lzo";
- info->compress_type = BTRFS_COMPRESS_LZO;
- info->compress_level = 0;
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- btrfs_set_fs_incompat(info, COMPRESS_LZO);
- no_compress = 0;
- } else if (strncmp(args[0].from, "zstd", 4) == 0) {
- compress_type = "zstd";
- info->compress_type = BTRFS_COMPRESS_ZSTD;
- info->compress_level =
- btrfs_compress_str2level(
- BTRFS_COMPRESS_ZSTD,
- args[0].from + 4);
- btrfs_set_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, NODATACOW);
- btrfs_clear_opt(info->mount_opt, NODATASUM);
- btrfs_set_fs_incompat(info, COMPRESS_ZSTD);
- no_compress = 0;
- } else if (strncmp(args[0].from, "no", 2) == 0) {
- compress_type = "no";
- info->compress_level = 0;
- info->compress_type = 0;
- btrfs_clear_opt(info->mount_opt, COMPRESS);
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- compress_force = false;
- no_compress++;
- } else {
- btrfs_err(info, "unrecognized compression value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
-
- if (compress_force) {
- btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
- } else {
- /*
- * If we remount from compress-force=xxx to
- * compress=xxx, we need clear FORCE_COMPRESS
- * flag, otherwise, there is no way for users
- * to disable forcible compression separately.
- */
- btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
- }
- if (no_compress == 1) {
- btrfs_info(info, "use no compression");
- } else if ((info->compress_type != saved_compress_type) ||
- (compress_force != saved_compress_force) ||
- (info->compress_level != saved_compress_level)) {
- btrfs_info(info, "%s %s compression, level %d",
- (compress_force) ? "force" : "use",
- compress_type, info->compress_level);
- }
- compress_force = false;
- break;
- case Opt_ssd:
- btrfs_set_and_info(info, SSD,
- "enabling ssd optimizations");
- btrfs_clear_opt(info->mount_opt, NOSSD);
- break;
- case Opt_ssd_spread:
- btrfs_set_and_info(info, SSD,
- "enabling ssd optimizations");
- btrfs_set_and_info(info, SSD_SPREAD,
- "using spread ssd allocation scheme");
- btrfs_clear_opt(info->mount_opt, NOSSD);
- break;
- case Opt_nossd:
- btrfs_set_opt(info->mount_opt, NOSSD);
- btrfs_clear_and_info(info, SSD,
- "not using ssd optimizations");
- fallthrough;
- case Opt_nossd_spread:
- btrfs_clear_and_info(info, SSD_SPREAD,
- "not using spread ssd allocation scheme");
- break;
- case Opt_barrier:
- btrfs_clear_and_info(info, NOBARRIER,
- "turning on barriers");
- break;
- case Opt_nobarrier:
- btrfs_set_and_info(info, NOBARRIER,
- "turning off barriers");
- break;
- case Opt_thread_pool:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized thread_pool value %s",
- args[0].from);
- goto out;
- } else if (intarg == 0) {
- btrfs_err(info, "invalid value 0 for thread_pool");
- ret = -EINVAL;
- goto out;
- }
- info->thread_pool_size = intarg;
- break;
- case Opt_max_inline:
- num = match_strdup(&args[0]);
- if (num) {
- info->max_inline = memparse(num, NULL);
- kfree(num);
-
- if (info->max_inline) {
- info->max_inline = min_t(u64,
- info->max_inline,
- info->sectorsize);
- }
- btrfs_info(info, "max_inline at %llu",
- info->max_inline);
- } else {
- ret = -ENOMEM;
- goto out;
- }
- break;
- case Opt_acl:
+ break;
+ case Opt_datacow:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ btrfs_set_opt(ctx->mount_opt, NODATACOW);
+ btrfs_set_opt(ctx->mount_opt, NODATASUM);
+ } else {
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ }
+ break;
+ case Opt_compress_force:
+ case Opt_compress_force_type:
+ btrfs_set_opt(ctx->mount_opt, FORCE_COMPRESS);
+ fallthrough;
+ case Opt_compress:
+ case Opt_compress_type:
+ if (opt == Opt_compress || opt == Opt_compress_force) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "zlib", 4) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_ZLIB;
+ ctx->compress_level =
+ btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB,
+ param->string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "lzo", 3) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_LZO;
+ ctx->compress_level = 0;
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "zstd", 4) == 0) {
+ ctx->compress_type = BTRFS_COMPRESS_ZSTD;
+ ctx->compress_level =
+ btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD,
+ param->string + 4);
+ btrfs_set_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, NODATACOW);
+ btrfs_clear_opt(ctx->mount_opt, NODATASUM);
+ } else if (strncmp(param->string, "no", 2) == 0) {
+ ctx->compress_level = 0;
+ ctx->compress_type = 0;
+ btrfs_clear_opt(ctx->mount_opt, COMPRESS);
+ btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS);
+ } else {
+ btrfs_err(NULL, "unrecognized compression value %s",
+ param->string);
+ return -EINVAL;
+ }
+ break;
+ case Opt_ssd:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NOSSD);
+ btrfs_clear_opt(ctx->mount_opt, SSD);
+ btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, SSD);
+ btrfs_clear_opt(ctx->mount_opt, NOSSD);
+ }
+ break;
+ case Opt_ssd_spread:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, SSD);
+ btrfs_set_opt(ctx->mount_opt, SSD_SPREAD);
+ btrfs_clear_opt(ctx->mount_opt, NOSSD);
+ }
+ break;
+ case Opt_barrier:
+ if (result.negated)
+ btrfs_set_opt(ctx->mount_opt, NOBARRIER);
+ else
+ btrfs_clear_opt(ctx->mount_opt, NOBARRIER);
+ break;
+ case Opt_thread_pool:
+ if (result.uint_32 == 0) {
+ btrfs_err(NULL, "invalid value 0 for thread_pool");
+ return -EINVAL;
+ }
+ ctx->thread_pool_size = result.uint_32;
+ break;
+ case Opt_max_inline:
+ ctx->max_inline = memparse(param->string, NULL);
+ break;
+ case Opt_acl:
+ if (result.negated) {
+ fc->sb_flags &= ~SB_POSIXACL;
+ } else {
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- info->sb->s_flags |= SB_POSIXACL;
- break;
+ fc->sb_flags |= SB_POSIXACL;
#else
- btrfs_err(info, "support for ACL not compiled in!");
- ret = -EINVAL;
- goto out;
+ btrfs_err(NULL, "support for ACL not compiled in");
+ return -EINVAL;
#endif
- case Opt_noacl:
- info->sb->s_flags &= ~SB_POSIXACL;
- break;
- case Opt_notreelog:
- btrfs_set_and_info(info, NOTREELOG,
- "disabling tree log");
- break;
- case Opt_treelog:
- btrfs_clear_and_info(info, NOTREELOG,
- "enabling tree log");
- break;
- case Opt_norecovery:
- case Opt_nologreplay:
- btrfs_warn(info,
+ }
+ /*
+ * VFS limits the ability to toggle ACL on and off via remount,
+ * despite every file system allowing this. This seems to be
+ * an oversight since we all do, but it'll fail if we're
+ * remounting. So don't set the mask here, we'll check it in
+ * btrfs_reconfigure and do the toggling ourselves.
+ */
+ if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE)
+ fc->sb_flags_mask |= SB_POSIXACL;
+ break;
+ case Opt_treelog:
+ if (result.negated)
+ btrfs_set_opt(ctx->mount_opt, NOTREELOG);
+ else
+ btrfs_clear_opt(ctx->mount_opt, NOTREELOG);
+ break;
+ case Opt_nologreplay:
+ btrfs_warn(NULL,
"'nologreplay' is deprecated, use 'rescue=nologreplay' instead");
- btrfs_set_and_info(info, NOLOGREPLAY,
- "disabling log replay at mount time");
- break;
- case Opt_flushoncommit:
- btrfs_set_and_info(info, FLUSHONCOMMIT,
- "turning on flush-on-commit");
- break;
- case Opt_noflushoncommit:
- btrfs_clear_and_info(info, FLUSHONCOMMIT,
- "turning off flush-on-commit");
- break;
- case Opt_ratio:
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized metadata_ratio value %s",
- args[0].from);
- goto out;
- }
- info->metadata_ratio = intarg;
- btrfs_info(info, "metadata ratio %u",
- info->metadata_ratio);
- break;
- case Opt_discard:
- case Opt_discard_mode:
- if (token == Opt_discard ||
- strcmp(args[0].from, "sync") == 0) {
- btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC);
- btrfs_set_and_info(info, DISCARD_SYNC,
- "turning on sync discard");
- } else if (strcmp(args[0].from, "async") == 0) {
- btrfs_clear_opt(info->mount_opt, DISCARD_SYNC);
- btrfs_set_and_info(info, DISCARD_ASYNC,
- "turning on async discard");
- } else {
- btrfs_err(info, "unrecognized discard mode value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- btrfs_clear_opt(info->mount_opt, NODISCARD);
- break;
- case Opt_nodiscard:
- btrfs_clear_and_info(info, DISCARD_SYNC,
- "turning off discard");
- btrfs_clear_and_info(info, DISCARD_ASYNC,
- "turning off async discard");
- btrfs_set_opt(info->mount_opt, NODISCARD);
- break;
- case Opt_space_cache:
- case Opt_space_cache_version:
- /*
- * We already set FREE_SPACE_TREE above because we have
- * compat_ro(FREE_SPACE_TREE) set, and we aren't going
- * to allow v1 to be set for extent tree v2, simply
- * ignore this setting if we're extent tree v2.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- if (token == Opt_space_cache ||
- strcmp(args[0].from, "v1") == 0) {
- btrfs_clear_opt(info->mount_opt,
- FREE_SPACE_TREE);
- btrfs_set_and_info(info, SPACE_CACHE,
- "enabling disk space caching");
- } else if (strcmp(args[0].from, "v2") == 0) {
- btrfs_clear_opt(info->mount_opt,
- SPACE_CACHE);
- btrfs_set_and_info(info, FREE_SPACE_TREE,
- "enabling free space tree");
- } else {
- btrfs_err(info, "unrecognized space_cache value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- break;
- case Opt_rescan_uuid_tree:
- btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
- break;
- case Opt_no_space_cache:
- /*
- * We cannot operate without the free space tree with
- * extent tree v2, ignore this option.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- if (btrfs_test_opt(info, SPACE_CACHE)) {
- btrfs_clear_and_info(info, SPACE_CACHE,
- "disabling disk space caching");
- }
- if (btrfs_test_opt(info, FREE_SPACE_TREE)) {
- btrfs_clear_and_info(info, FREE_SPACE_TREE,
- "disabling free space tree");
- }
- break;
- case Opt_inode_cache:
- case Opt_noinode_cache:
- btrfs_warn(info,
- "the 'inode_cache' option is deprecated and has no effect since 5.11");
- break;
- case Opt_clear_cache:
- /*
- * We cannot clear the free space tree with extent tree
- * v2, ignore this option.
- */
- if (btrfs_fs_incompat(info, EXTENT_TREE_V2))
- break;
- btrfs_set_and_info(info, CLEAR_CACHE,
- "force clearing of disk cache");
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
+ break;
+ case Opt_flushoncommit:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT);
+ else
+ btrfs_set_opt(ctx->mount_opt, FLUSHONCOMMIT);
+ break;
+ case Opt_ratio:
+ ctx->metadata_ratio = result.uint_32;
+ break;
+ case Opt_discard:
+ if (result.negated) {
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ btrfs_set_opt(ctx->mount_opt, NODISCARD);
+ } else {
+ btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ }
+ break;
+ case Opt_discard_mode:
+ switch (result.uint_32) {
+ case Opt_discard_sync:
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC);
+ btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC);
break;
- case Opt_user_subvol_rm_allowed:
- btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
+ case Opt_discard_async:
+ btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC);
+ btrfs_set_opt(ctx->mount_opt, DISCARD_ASYNC);
break;
- case Opt_enospc_debug:
- btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
+ default:
+ btrfs_err(NULL, "unrecognized discard mode value %s",
+ param->key);
+ return -EINVAL;
+ }
+ btrfs_clear_opt(ctx->mount_opt, NODISCARD);
+ break;
+ case Opt_space_cache:
+ if (result.negated) {
+ btrfs_set_opt(ctx->mount_opt, NOSPACECACHE);
+ btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
+ } else {
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
+ }
+ break;
+ case Opt_space_cache_version:
+ switch (result.uint_32) {
+ case Opt_space_cache_v1:
+ btrfs_set_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE);
break;
- case Opt_noenospc_debug:
- btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG);
+ case Opt_space_cache_v2:
+ btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE);
+ btrfs_set_opt(ctx->mount_opt, FREE_SPACE_TREE);
break;
- case Opt_defrag:
- btrfs_set_and_info(info, AUTO_DEFRAG,
- "enabling auto defrag");
+ default:
+ btrfs_err(NULL, "unrecognized space_cache value %s",
+ param->key);
+ return -EINVAL;
+ }
+ break;
+ case Opt_rescan_uuid_tree:
+ btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE);
+ break;
+ case Opt_clear_cache:
+ btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
+ break;
+ case Opt_user_subvol_rm_allowed:
+ btrfs_set_opt(ctx->mount_opt, USER_SUBVOL_RM_ALLOWED);
+ break;
+ case Opt_enospc_debug:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, ENOSPC_DEBUG);
+ else
+ btrfs_set_opt(ctx->mount_opt, ENOSPC_DEBUG);
+ break;
+ case Opt_defrag:
+ if (result.negated)
+ btrfs_clear_opt(ctx->mount_opt, AUTO_DEFRAG);
+ else
+ btrfs_set_opt(ctx->mount_opt, AUTO_DEFRAG);
+ break;
+ case Opt_usebackuproot:
+ btrfs_warn(NULL,
+ "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead");
+ btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);
+
+ /* If we're loading the backup roots we can't trust the space cache. */
+ btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE);
+ break;
+ case Opt_skip_balance:
+ btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE);
+ break;
+ case Opt_fatal_errors:
+ switch (result.uint_32) {
+ case Opt_fatal_errors_panic:
+ btrfs_set_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
break;
- case Opt_nodefrag:
- btrfs_clear_and_info(info, AUTO_DEFRAG,
- "disabling auto defrag");
+ case Opt_fatal_errors_bug:
+ btrfs_clear_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR);
break;
- case Opt_recovery:
- case Opt_usebackuproot:
- btrfs_warn(info,
- "'%s' is deprecated, use 'rescue=usebackuproot' instead",
- token == Opt_recovery ? "recovery" :
- "usebackuproot");
- btrfs_info(info,
- "trying to use backup root at mount time");
- btrfs_set_opt(info->mount_opt, USEBACKUPROOT);
+ default:
+ btrfs_err(NULL, "unrecognized fatal_errors value %s",
+ param->key);
+ return -EINVAL;
+ }
+ break;
+ case Opt_commit_interval:
+ ctx->commit_interval = result.uint_32;
+ if (ctx->commit_interval == 0)
+ ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ break;
+ case Opt_rescue:
+ switch (result.uint_32) {
+ case Opt_rescue_usebackuproot:
+ btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT);
break;
- case Opt_skip_balance:
- btrfs_set_opt(info->mount_opt, SKIP_BALANCE);
+ case Opt_rescue_nologreplay:
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
- case Opt_fatal_errors:
- if (strcmp(args[0].from, "panic") == 0) {
- btrfs_set_opt(info->mount_opt,
- PANIC_ON_FATAL_ERROR);
- } else if (strcmp(args[0].from, "bug") == 0) {
- btrfs_clear_opt(info->mount_opt,
- PANIC_ON_FATAL_ERROR);
- } else {
- btrfs_err(info, "unrecognized fatal_errors value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
+ case Opt_rescue_ignorebadroots:
+ btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
break;
- case Opt_commit_interval:
- intarg = 0;
- ret = match_int(&args[0], &intarg);
- if (ret) {
- btrfs_err(info, "unrecognized commit_interval value %s",
- args[0].from);
- ret = -EINVAL;
- goto out;
- }
- if (intarg == 0) {
- btrfs_info(info,
- "using default commit interval %us",
- BTRFS_DEFAULT_COMMIT_INTERVAL);
- intarg = BTRFS_DEFAULT_COMMIT_INTERVAL;
- } else if (intarg > 300) {
- btrfs_warn(info, "excessive commit interval %d",
- intarg);
- }
- info->commit_interval = intarg;
+ case Opt_rescue_ignoredatacsums:
+ btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
break;
- case Opt_rescue:
- ret = parse_rescue_options(info, args[0].from);
- if (ret < 0) {
- btrfs_err(info, "unrecognized rescue value %s",
- args[0].from);
- goto out;
- }
+ case Opt_rescue_parameter_all:
+ btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS);
+ btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS);
+ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY);
break;
+ default:
+ btrfs_info(NULL, "unrecognized rescue option '%s'",
+ param->key);
+ return -EINVAL;
+ }
+ break;
#ifdef CONFIG_BTRFS_DEBUG
- case Opt_fragment_all:
- btrfs_info(info, "fragmenting all space");
- btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
- btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA);
+ case Opt_fragment:
+ switch (result.uint_32) {
+ case Opt_fragment_parameter_all:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
break;
- case Opt_fragment_metadata:
- btrfs_info(info, "fragmenting metadata");
- btrfs_set_opt(info->mount_opt,
- FRAGMENT_METADATA);
+ case Opt_fragment_parameter_metadata:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA);
break;
- case Opt_fragment_data:
- btrfs_info(info, "fragmenting data");
- btrfs_set_opt(info->mount_opt, FRAGMENT_DATA);
+ case Opt_fragment_parameter_data:
+ btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA);
break;
+ default:
+ btrfs_info(NULL, "unrecognized fragment option '%s'",
+ param->key);
+ return -EINVAL;
+ }
+ break;
#endif
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
- case Opt_ref_verify:
- btrfs_info(info, "doing ref verification");
- btrfs_set_opt(info->mount_opt, REF_VERIFY);
- break;
+ case Opt_ref_verify:
+ btrfs_set_opt(ctx->mount_opt, REF_VERIFY);
+ break;
#endif
- case Opt_err:
- btrfs_err(info, "unrecognized mount option '%s'", p);
- ret = -EINVAL;
- goto out;
- default:
- break;
- }
+ default:
+ btrfs_err(NULL, "unrecognized mount option '%s'", param->key);
+ return -EINVAL;
}
-check:
- /* We're read-only, don't have to check. */
- if (new_flags & SB_RDONLY)
- goto out;
- if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
- check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
- check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))
- ret = -EINVAL;
-out:
- if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
- !btrfs_test_opt(info, FREE_SPACE_TREE) &&
- !btrfs_test_opt(info, CLEAR_CACHE)) {
- btrfs_err(info, "cannot disable free space tree");
- ret = -EINVAL;
- }
- if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) &&
- !btrfs_test_opt(info, FREE_SPACE_TREE)) {
- btrfs_err(info, "cannot disable free space tree with block-group-tree feature");
- ret = -EINVAL;
- }
- if (!ret)
- ret = btrfs_check_mountopts_zoned(info);
- if (!ret && !remounting) {
- if (btrfs_test_opt(info, SPACE_CACHE))
- btrfs_info(info, "disk space caching is enabled");
- if (btrfs_test_opt(info, FREE_SPACE_TREE))
- btrfs_info(info, "using free space tree");
- }
- return ret;
+ return 0;
}
/*
- * Parse mount options that are required early in the mount process.
- *
- * All other options will be parsed on much later in the mount process and
- * only when we need to allocate a new super block.
+ * Some options only have meaning at mount time and shouldn't persist across
+ * remounts, or be displayed. Clear these at the end of mount and remount code
+ * paths.
*/
-static int btrfs_parse_device_options(const char *options, blk_mode_t flags)
+static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
{
- substring_t args[MAX_OPT_ARGS];
- char *device_name, *opts, *orig, *p;
- struct btrfs_device *device = NULL;
- int error = 0;
+ btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
+ btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
+ btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE);
+}
- lockdep_assert_held(&uuid_mutex);
+static bool check_ro_option(struct btrfs_fs_info *fs_info,
+ unsigned long mount_opt, unsigned long opt,
+ const char *opt_name)
+{
+ if (mount_opt & opt) {
+ btrfs_err(fs_info, "%s must be used with ro mount option",
+ opt_name);
+ return true;
+ }
+ return false;
+}
- if (!options)
- return 0;
+bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+ unsigned long flags)
+{
+ bool ret = true;
- /*
- * strsep changes the string, duplicate it because btrfs_parse_options
- * gets called later
- */
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ if (!(flags & SB_RDONLY) &&
+ (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") ||
+ check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")))
+ ret = false;
- while ((p = strsep(&opts, ",")) != NULL) {
- int token;
+ if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, CLEAR_CACHE)) {
+ btrfs_err(info, "cannot disable free-space-tree");
+ ret = false;
+ }
+ if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) &&
+ !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) {
+ btrfs_err(info, "cannot disable free-space-tree with block-group-tree feature");
+ ret = false;
+ }
- if (!*p)
- continue;
+ if (btrfs_check_mountopts_zoned(info, mount_opt))
+ ret = false;
- token = match_token(p, tokens, args);
- if (token == Opt_device) {
- device_name = match_strdup(&args[0]);
- if (!device_name) {
- error = -ENOMEM;
- goto out;
- }
- device = btrfs_scan_one_device(device_name, flags, false);
- kfree(device_name);
- if (IS_ERR(device)) {
- error = PTR_ERR(device);
- goto out;
- }
- }
+ if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
+ if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE))
+ btrfs_info(info, "disk space caching is enabled");
+ if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
+ btrfs_info(info, "using free-space-tree");
}
-out:
- kfree(orig);
- return error;
+ return ret;
}
/*
- * Parse mount options that are related to subvolume id
+ * This is subtle, we only call this during open_ctree(). We need to pre-load
+ * the mount options with the on-disk settings. Before the new mount API took
+ * effect we would do this on mount and remount. With the new mount API we'll
+ * only do this on the initial mount.
*
- * The value is later passed to mount_subvol()
+ * This isn't a change in behavior, because we're using the current state of the
+ * file system to set the current mount options. If you mounted with special
+ * options to disable these features and then remounted we wouldn't revert the
+ * settings, because mounting without these features cleared the on-disk
+ * settings, so this being called on re-mount is not needed.
*/
-static int btrfs_parse_subvol_options(const char *options, char **subvol_name,
- u64 *subvol_objectid)
+void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info)
{
- substring_t args[MAX_OPT_ARGS];
- char *opts, *orig, *p;
- int error = 0;
- u64 subvolid;
-
- if (!options)
- return 0;
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) {
+ btrfs_info(fs_info,
+ "forcing free space tree for sector size %u with page size %lu",
+ fs_info->sectorsize, PAGE_SIZE);
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ }
+ }
/*
- * strsep changes the string, duplicate it because
- * btrfs_parse_device_options gets called later
+ * At this point our mount options are populated, so we only mess with
+ * these settings if we don't have any settings already.
*/
- opts = kstrdup(options, GFP_KERNEL);
- if (!opts)
- return -ENOMEM;
- orig = opts;
+ if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
+ return;
- while ((p = strsep(&opts, ",")) != NULL) {
- int token;
- if (!*p)
- continue;
+ if (btrfs_is_zoned(fs_info) &&
+ btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_info(fs_info, "zoned: clearing existing space cache");
+ btrfs_set_super_cache_generation(fs_info->super_copy, 0);
+ return;
+ }
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_subvol:
- kfree(*subvol_name);
- *subvol_name = match_strdup(&args[0]);
- if (!*subvol_name) {
- error = -ENOMEM;
- goto out;
- }
- break;
- case Opt_subvolid:
- error = match_u64(&args[0], &subvolid);
- if (error)
- goto out;
+ if (btrfs_test_opt(fs_info, SPACE_CACHE))
+ return;
- /* we want the original fs_tree */
- if (subvolid == 0)
- subvolid = BTRFS_FS_TREE_OBJECTID;
+ if (btrfs_test_opt(fs_info, NOSPACECACHE))
+ return;
- *subvol_objectid = subvolid;
- break;
- default:
- break;
- }
- }
+ /*
+ * At this point we don't have explicit options set by the user, set
+ * them ourselves based on the state of the file system.
+ */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ else if (btrfs_free_space_cache_v1_active(fs_info))
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+}
-out:
- kfree(orig);
- return error;
+static void set_device_specific_options(struct btrfs_fs_info *fs_info)
+{
+ if (!btrfs_test_opt(fs_info, NOSSD) &&
+ !fs_info->fs_devices->rotating)
+ btrfs_set_opt(fs_info->mount_opt, SSD);
+
+ /*
+ * For devices supporting discard turn on discard=async automatically,
+ * unless it's already set or disabled. This could be turned off by
+ * nodiscard for the same mount.
+ *
+ * The zoned mode piggy backs on the discard functionality for
+ * resetting a zone. There is no reason to delay the zone reset as it is
+ * fast enough. So, do not enable async discard for zoned mode.
+ */
+ if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
+ btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
+ btrfs_test_opt(fs_info, NODISCARD)) &&
+ fs_info->fs_devices->discardable &&
+ !btrfs_is_zoned(fs_info))
+ btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC);
}
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
@@ -1105,10 +929,6 @@ static int btrfs_fill_super(struct super_block *sb,
#endif
sb->s_xattr = btrfs_xattr_handlers;
sb->s_time_gran = 1;
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
- sb->s_flags |= SB_POSIXACL;
-#endif
- sb->s_flags |= SB_I_VERSION;
sb->s_iflags |= SB_I_CGROUPWB;
err = super_setup_bdi(sb);
@@ -1291,22 +1111,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
return 0;
}
-static int btrfs_test_super(struct super_block *s, void *data)
-{
- struct btrfs_fs_info *p = data;
- struct btrfs_fs_info *fs_info = btrfs_sb(s);
-
- return fs_info->fs_devices == p->fs_devices;
-}
-
-static int btrfs_set_super(struct super_block *s, void *data)
-{
- int err = set_anon_super(s, data);
- if (!err)
- s->s_fs_info = data;
- return err;
-}
-
/*
* subvolumes are identified by ino 256
*/
@@ -1382,200 +1186,6 @@ out:
return root;
}
-/*
- * Find a superblock for the given device / mount point.
- *
- * Note: This is based on mount_bdev from fs/super.c with a few additions
- * for multiple device setup. Make sure to keep it in sync.
- */
-static struct dentry *btrfs_mount_root(struct file_system_type *fs_type,
- int flags, const char *device_name, void *data)
-{
- struct block_device *bdev = NULL;
- struct super_block *s;
- struct btrfs_device *device = NULL;
- struct btrfs_fs_devices *fs_devices = NULL;
- struct btrfs_fs_info *fs_info = NULL;
- void *new_sec_opts = NULL;
- blk_mode_t mode = sb_open_mode(flags);
- int error = 0;
-
- if (data) {
- error = security_sb_eat_lsm_opts(data, &new_sec_opts);
- if (error)
- return ERR_PTR(error);
- }
-
- /*
- * Setup a dummy root and fs_info for test/set super. This is because
- * we don't actually fill this stuff out until open_ctree, but we need
- * then open_ctree will properly initialize the file system specific
- * settings later. btrfs_init_fs_info initializes the static elements
- * of the fs_info (locks and such) to make cleanup easier if we find a
- * superblock with our given fs_devices later on at sget() time.
- */
- fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
- if (!fs_info) {
- error = -ENOMEM;
- goto error_sec_opts;
- }
- btrfs_init_fs_info(fs_info);
-
- fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
- fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
- if (!fs_info->super_copy || !fs_info->super_for_commit) {
- error = -ENOMEM;
- goto error_fs_info;
- }
-
- mutex_lock(&uuid_mutex);
- error = btrfs_parse_device_options(data, mode);
- if (error) {
- mutex_unlock(&uuid_mutex);
- goto error_fs_info;
- }
-
- /*
- * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
- * either a valid device or an error.
- */
- device = btrfs_scan_one_device(device_name, mode, true);
- ASSERT(device != NULL);
- if (IS_ERR(device)) {
- mutex_unlock(&uuid_mutex);
- error = PTR_ERR(device);
- goto error_fs_info;
- }
-
- fs_devices = device->fs_devices;
- fs_info->fs_devices = fs_devices;
-
- error = btrfs_open_devices(fs_devices, mode, fs_type);
- mutex_unlock(&uuid_mutex);
- if (error)
- goto error_fs_info;
-
- if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
- error = -EACCES;
- goto error_close_devices;
- }
-
- bdev = fs_devices->latest_dev->bdev;
- s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC,
- fs_info);
- if (IS_ERR(s)) {
- error = PTR_ERR(s);
- goto error_close_devices;
- }
-
- if (s->s_root) {
- btrfs_close_devices(fs_devices);
- btrfs_free_fs_info(fs_info);
- if ((flags ^ s->s_flags) & SB_RDONLY)
- error = -EBUSY;
- } else {
- snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
- shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name,
- s->s_id);
- btrfs_sb(s)->bdev_holder = fs_type;
- error = btrfs_fill_super(s, fs_devices, data);
- }
- if (!error)
- error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL);
- security_free_mnt_opts(&new_sec_opts);
- if (error) {
- deactivate_locked_super(s);
- return ERR_PTR(error);
- }
-
- return dget(s->s_root);
-
-error_close_devices:
- btrfs_close_devices(fs_devices);
-error_fs_info:
- btrfs_free_fs_info(fs_info);
-error_sec_opts:
- security_free_mnt_opts(&new_sec_opts);
- return ERR_PTR(error);
-}
-
-/*
- * Mount function which is called by VFS layer.
- *
- * In order to allow mounting a subvolume directly, btrfs uses mount_subtree()
- * which needs vfsmount* of device's root (/). This means device's root has to
- * be mounted internally in any case.
- *
- * Operation flow:
- * 1. Parse subvol id related options for later use in mount_subvol().
- *
- * 2. Mount device's root (/) by calling vfs_kern_mount().
- *
- * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the
- * first place. In order to avoid calling btrfs_mount() again, we use
- * different file_system_type which is not registered to VFS by
- * register_filesystem() (btrfs_root_fs_type). As a result,
- * btrfs_mount_root() is called. The return value will be used by
- * mount_subtree() in mount_subvol().
- *
- * 3. Call mount_subvol() to get the dentry of subvolume. Since there is
- * "btrfs subvolume set-default", mount_subvol() is called always.
- */
-static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
- const char *device_name, void *data)
-{
- struct vfsmount *mnt_root;
- struct dentry *root;
- char *subvol_name = NULL;
- u64 subvol_objectid = 0;
- int error = 0;
-
- error = btrfs_parse_subvol_options(data, &subvol_name,
- &subvol_objectid);
- if (error) {
- kfree(subvol_name);
- return ERR_PTR(error);
- }
-
- /* mount device's root (/) */
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data);
- if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) {
- if (flags & SB_RDONLY) {
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
- flags & ~SB_RDONLY, device_name, data);
- } else {
- mnt_root = vfs_kern_mount(&btrfs_root_fs_type,
- flags | SB_RDONLY, device_name, data);
- if (IS_ERR(mnt_root)) {
- root = ERR_CAST(mnt_root);
- kfree(subvol_name);
- goto out;
- }
-
- down_write(&mnt_root->mnt_sb->s_umount);
- error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL);
- up_write(&mnt_root->mnt_sb->s_umount);
- if (error < 0) {
- root = ERR_PTR(error);
- mntput(mnt_root);
- kfree(subvol_name);
- goto out;
- }
- }
- }
- if (IS_ERR(mnt_root)) {
- root = ERR_CAST(mnt_root);
- kfree(subvol_name);
- goto out;
- }
-
- /* mount_subvol() will free subvol_name and mnt_root */
- root = mount_subvol(subvol_name, subvol_objectid, mnt_root);
-
-out:
- return root;
-}
-
static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
u32 new_pool_size, u32 old_pool_size)
{
@@ -1638,202 +1248,282 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info,
btrfs_set_free_space_cache_v1_active(fs_info, cache_opt);
}
-static int btrfs_remount(struct super_block *sb, int *flags, char *data)
+static int btrfs_remount_rw(struct btrfs_fs_info *fs_info)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(sb);
- unsigned old_flags = sb->s_flags;
- unsigned long old_opts = fs_info->mount_opt;
- unsigned long old_compress_type = fs_info->compress_type;
- u64 old_max_inline = fs_info->max_inline;
- u32 old_thread_pool_size = fs_info->thread_pool_size;
- u32 old_metadata_ratio = fs_info->metadata_ratio;
int ret;
- sync_filesystem(sb);
- set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+ if (BTRFS_FS_ERROR(fs_info)) {
+ btrfs_err(fs_info,
+ "remounting read-write after error is not allowed");
+ return -EINVAL;
+ }
- if (data) {
- void *new_sec_opts = NULL;
+ if (fs_info->fs_devices->rw_devices == 0)
+ return -EACCES;
- ret = security_sb_eat_lsm_opts(data, &new_sec_opts);
- if (!ret)
- ret = security_sb_remount(sb, new_sec_opts);
- security_free_mnt_opts(&new_sec_opts);
- if (ret)
- goto restore;
+ if (!btrfs_check_rw_degradable(fs_info, NULL)) {
+ btrfs_warn(fs_info,
+ "too many missing devices, writable remount is not allowed");
+ return -EACCES;
+ }
+
+ if (btrfs_super_log_root(fs_info->super_copy) != 0) {
+ btrfs_warn(fs_info,
+ "mount required to replay tree-log, cannot remount read-write");
+ return -EINVAL;
}
- ret = btrfs_parse_options(fs_info, data, *flags);
+ /*
+ * NOTE: when remounting with a change that does writes, don't put it
+ * anywhere above this point, as we are not sure to be safe to write
+ * until we pass the above checks.
+ */
+ ret = btrfs_start_pre_rw_mount(fs_info);
if (ret)
- goto restore;
+ return ret;
- ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY));
- if (ret < 0)
- goto restore;
+ btrfs_clear_sb_rdonly(fs_info->sb);
- btrfs_remount_begin(fs_info, old_opts, *flags);
- btrfs_resize_thread_pool(fs_info,
- fs_info->thread_pool_size, old_thread_pool_size);
+ set_bit(BTRFS_FS_OPEN, &fs_info->flags);
- if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
- (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
- (!sb_rdonly(sb) || (*flags & SB_RDONLY))) {
- btrfs_warn(fs_info,
- "remount supports changing free space tree only from ro to rw");
- /* Make sure free space cache options match the state on disk */
- if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
- btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
- btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
- }
- if (btrfs_free_space_cache_v1_active(fs_info)) {
- btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
- btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
- }
- }
+ /*
+ * If we've gone from readonly -> read-write, we need to get our
+ * sync/async discard lists in the right state.
+ */
+ btrfs_discard_resume(fs_info);
- if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
- goto out;
+ return 0;
+}
- if (*flags & SB_RDONLY) {
- /*
- * this also happens on 'umount -rf' or on shutdown, when
- * the filesystem is busy.
- */
- cancel_work_sync(&fs_info->async_reclaim_work);
- cancel_work_sync(&fs_info->async_data_reclaim_work);
+static int btrfs_remount_ro(struct btrfs_fs_info *fs_info)
+{
+ /*
+ * This also happens on 'umount -rf' or on shutdown, when the
+ * filesystem is busy.
+ */
+ cancel_work_sync(&fs_info->async_reclaim_work);
+ cancel_work_sync(&fs_info->async_data_reclaim_work);
- btrfs_discard_cleanup(fs_info);
+ btrfs_discard_cleanup(fs_info);
- /* wait for the uuid_scan task to finish */
- down(&fs_info->uuid_tree_rescan_sem);
- /* avoid complains from lockdep et al. */
- up(&fs_info->uuid_tree_rescan_sem);
+ /* Wait for the uuid_scan task to finish */
+ down(&fs_info->uuid_tree_rescan_sem);
+ /* Avoid complains from lockdep et al. */
+ up(&fs_info->uuid_tree_rescan_sem);
- btrfs_set_sb_rdonly(sb);
+ btrfs_set_sb_rdonly(fs_info->sb);
- /*
- * Setting SB_RDONLY will put the cleaner thread to
- * sleep at the next loop if it's already active.
- * If it's already asleep, we'll leave unused block
- * groups on disk until we're mounted read-write again
- * unless we clean them up here.
- */
- btrfs_delete_unused_bgs(fs_info);
+ /*
+ * Setting SB_RDONLY will put the cleaner thread to sleep at the next
+ * loop if it's already active. If it's already asleep, we'll leave
+ * unused block groups on disk until we're mounted read-write again
+ * unless we clean them up here.
+ */
+ btrfs_delete_unused_bgs(fs_info);
- /*
- * The cleaner task could be already running before we set the
- * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock).
- * We must make sure that after we finish the remount, i.e. after
- * we call btrfs_commit_super(), the cleaner can no longer start
- * a transaction - either because it was dropping a dead root,
- * running delayed iputs or deleting an unused block group (the
- * cleaner picked a block group from the list of unused block
- * groups before we were able to in the previous call to
- * btrfs_delete_unused_bgs()).
- */
- wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING,
- TASK_UNINTERRUPTIBLE);
+ /*
+ * The cleaner task could be already running before we set the flag
+ * BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). We must make
+ * sure that after we finish the remount, i.e. after we call
+ * btrfs_commit_super(), the cleaner can no longer start a transaction
+ * - either because it was dropping a dead root, running delayed iputs
+ * or deleting an unused block group (the cleaner picked a block
+ * group from the list of unused block groups before we were able to
+ * in the previous call to btrfs_delete_unused_bgs()).
+ */
+ wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE);
- /*
- * We've set the superblock to RO mode, so we might have made
- * the cleaner task sleep without running all pending delayed
- * iputs. Go through all the delayed iputs here, so that if an
- * unmount happens without remounting RW we don't end up at
- * finishing close_ctree() with a non-empty list of delayed
- * iputs.
- */
- btrfs_run_delayed_iputs(fs_info);
+ /*
+ * We've set the superblock to RO mode, so we might have made the
+ * cleaner task sleep without running all pending delayed iputs. Go
+ * through all the delayed iputs here, so that if an unmount happens
+ * without remounting RW we don't end up at finishing close_ctree()
+ * with a non-empty list of delayed iputs.
+ */
+ btrfs_run_delayed_iputs(fs_info);
- btrfs_dev_replace_suspend_for_unmount(fs_info);
- btrfs_scrub_cancel(fs_info);
- btrfs_pause_balance(fs_info);
+ btrfs_dev_replace_suspend_for_unmount(fs_info);
+ btrfs_scrub_cancel(fs_info);
+ btrfs_pause_balance(fs_info);
- /*
- * Pause the qgroup rescan worker if it is running. We don't want
- * it to be still running after we are in RO mode, as after that,
- * by the time we unmount, it might have left a transaction open,
- * so we would leak the transaction and/or crash.
- */
- btrfs_qgroup_wait_for_completion(fs_info, false);
+ /*
+ * Pause the qgroup rescan worker if it is running. We don't want it to
+ * be still running after we are in RO mode, as after that, by the time
+ * we unmount, it might have left a transaction open, so we would leak
+ * the transaction and/or crash.
+ */
+ btrfs_qgroup_wait_for_completion(fs_info, false);
- ret = btrfs_commit_super(fs_info);
- if (ret)
- goto restore;
- } else {
- if (BTRFS_FS_ERROR(fs_info)) {
- btrfs_err(fs_info,
- "Remounting read-write after error is not allowed");
- ret = -EINVAL;
- goto restore;
- }
- if (fs_info->fs_devices->rw_devices == 0) {
- ret = -EACCES;
- goto restore;
- }
+ return btrfs_commit_super(fs_info);
+}
- if (!btrfs_check_rw_degradable(fs_info, NULL)) {
- btrfs_warn(fs_info,
- "too many missing devices, writable remount is not allowed");
- ret = -EACCES;
- goto restore;
- }
+static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
+{
+ fs_info->max_inline = ctx->max_inline;
+ fs_info->commit_interval = ctx->commit_interval;
+ fs_info->metadata_ratio = ctx->metadata_ratio;
+ fs_info->thread_pool_size = ctx->thread_pool_size;
+ fs_info->mount_opt = ctx->mount_opt;
+ fs_info->compress_type = ctx->compress_type;
+ fs_info->compress_level = ctx->compress_level;
+}
- if (btrfs_super_log_root(fs_info->super_copy) != 0) {
- btrfs_warn(fs_info,
- "mount required to replay tree-log, cannot remount read-write");
- ret = -EINVAL;
- goto restore;
- }
+static void btrfs_info_to_ctx(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx)
+{
+ ctx->max_inline = fs_info->max_inline;
+ ctx->commit_interval = fs_info->commit_interval;
+ ctx->metadata_ratio = fs_info->metadata_ratio;
+ ctx->thread_pool_size = fs_info->thread_pool_size;
+ ctx->mount_opt = fs_info->mount_opt;
+ ctx->compress_type = fs_info->compress_type;
+ ctx->compress_level = fs_info->compress_level;
+}
- /*
- * NOTE: when remounting with a change that does writes, don't
- * put it anywhere above this point, as we are not sure to be
- * safe to write until we pass the above checks.
- */
- ret = btrfs_start_pre_rw_mount(fs_info);
- if (ret)
- goto restore;
+#define btrfs_info_if_set(fs_info, old_ctx, opt, fmt, args...) \
+do { \
+ if ((!old_ctx || !btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \
+ btrfs_raw_test_opt(fs_info->mount_opt, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+} while (0)
+
+#define btrfs_info_if_unset(fs_info, old_ctx, opt, fmt, args...) \
+do { \
+ if ((old_ctx && btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \
+ !btrfs_raw_test_opt(fs_info->mount_opt, opt)) \
+ btrfs_info(fs_info, fmt, ##args); \
+} while (0)
+
+static void btrfs_emit_options(struct btrfs_fs_info *info,
+ struct btrfs_fs_context *old)
+{
+ btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
+ btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
+ btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
+ btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
+ btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_set(info, old, NOTREELOG, "disabling tree log");
+ btrfs_info_if_set(info, old, NOLOGREPLAY, "disabling log replay at mount time");
+ btrfs_info_if_set(info, old, FLUSHONCOMMIT, "turning on flush-on-commit");
+ btrfs_info_if_set(info, old, DISCARD_SYNC, "turning on sync discard");
+ btrfs_info_if_set(info, old, DISCARD_ASYNC, "turning on async discard");
+ btrfs_info_if_set(info, old, FREE_SPACE_TREE, "enabling free space tree");
+ btrfs_info_if_set(info, old, SPACE_CACHE, "enabling disk space caching");
+ btrfs_info_if_set(info, old, CLEAR_CACHE, "force clearing of disk cache");
+ btrfs_info_if_set(info, old, AUTO_DEFRAG, "enabling auto defrag");
+ btrfs_info_if_set(info, old, FRAGMENT_DATA, "fragmenting data");
+ btrfs_info_if_set(info, old, FRAGMENT_METADATA, "fragmenting metadata");
+ btrfs_info_if_set(info, old, REF_VERIFY, "doing ref verification");
+ btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time");
+ btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots");
+ btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums");
+
+ btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
+ btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
+ btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
+ btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers");
+ btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
+ btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
+ btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");
+ btrfs_info_if_unset(info, old, AUTO_DEFRAG, "disabling auto defrag");
+ btrfs_info_if_unset(info, old, COMPRESS, "use no compression");
+
+ /* Did the compression settings change? */
+ if (btrfs_test_opt(info, COMPRESS) &&
+ (!old ||
+ old->compress_type != info->compress_type ||
+ old->compress_level != info->compress_level ||
+ (!btrfs_raw_test_opt(old->mount_opt, FORCE_COMPRESS) &&
+ btrfs_raw_test_opt(info->mount_opt, FORCE_COMPRESS)))) {
+ const char *compress_type = btrfs_compress_type2str(info->compress_type);
+
+ btrfs_info(info, "%s %s compression, level %d",
+ btrfs_test_opt(info, FORCE_COMPRESS) ? "force" : "use",
+ compress_type, info->compress_level);
+ }
- btrfs_clear_sb_rdonly(sb);
+ if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE)
+ btrfs_info(info, "max_inline set to %llu", info->max_inline);
+}
- set_bit(BTRFS_FS_OPEN, &fs_info->flags);
+static int btrfs_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_context old_ctx;
+ int ret = 0;
+ bool mount_reconfigure = (fc->s_fs_info != NULL);
- /*
- * If we've gone from readonly -> read/write, we need to get
- * our sync/async discard lists in the right state.
- */
- btrfs_discard_resume(fs_info);
+ btrfs_info_to_ctx(fs_info, &old_ctx);
+
+ /*
+ * This is our "bind mount" trick, we don't want to allow the user to do
+ * anything other than mount a different ro/rw and a different subvol,
+ * all of the mount options should be maintained.
+ */
+ if (mount_reconfigure)
+ ctx->mount_opt = old_ctx.mount_opt;
+
+ sync_filesystem(sb);
+ set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
+
+ if (!mount_reconfigure &&
+ !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags))
+ return -EINVAL;
+
+ ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY));
+ if (ret < 0)
+ return ret;
+
+ btrfs_ctx_to_info(fs_info, ctx);
+ btrfs_remount_begin(fs_info, old_ctx.mount_opt, fc->sb_flags);
+ btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size,
+ old_ctx.thread_pool_size);
+
+ if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) !=
+ (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
+ (!sb_rdonly(sb) || (fc->sb_flags & SB_RDONLY))) {
+ btrfs_warn(fs_info,
+ "remount supports changing free space tree only from RO to RW");
+ /* Make sure free space cache options match the state on disk. */
+ if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+ btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
+ if (btrfs_free_space_cache_v1_active(fs_info)) {
+ btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE);
+ btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE);
+ }
}
-out:
+
+ ret = 0;
+ if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY))
+ ret = btrfs_remount_ro(fs_info);
+ else if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY))
+ ret = btrfs_remount_rw(fs_info);
+ if (ret)
+ goto restore;
+
/*
- * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
- * since the absence of the flag means it can be toggled off by remount.
+ * If we set the mask during the parameter parsing VFS would reject the
+ * remount. Here we can set the mask and the value will be updated
+ * appropriately.
*/
- *flags |= SB_I_VERSION;
+ if ((fc->sb_flags & SB_POSIXACL) != (sb->s_flags & SB_POSIXACL))
+ fc->sb_flags_mask |= SB_POSIXACL;
+ btrfs_emit_options(fs_info, &old_ctx);
wake_up_process(fs_info->transaction_kthread);
- btrfs_remount_cleanup(fs_info, old_opts);
+ btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
btrfs_clear_oneshot_options(fs_info);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
return 0;
-
restore:
- /* We've hit an error - don't reset SB_RDONLY */
- if (sb_rdonly(sb))
- old_flags |= SB_RDONLY;
- if (!(old_flags & SB_RDONLY))
- clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state);
- sb->s_flags = old_flags;
- fs_info->mount_opt = old_opts;
- fs_info->compress_type = old_compress_type;
- fs_info->max_inline = old_max_inline;
- btrfs_resize_thread_pool(fs_info,
- old_thread_pool_size, fs_info->thread_pool_size);
- fs_info->metadata_ratio = old_metadata_ratio;
- btrfs_remount_cleanup(fs_info, old_opts);
+ btrfs_ctx_to_info(fs_info, &old_ctx);
+ btrfs_remount_cleanup(fs_info, old_ctx.mount_opt);
clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state);
-
return ret;
}
@@ -2094,6 +1784,309 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
return 0;
}
+static int btrfs_fc_test_super(struct super_block *sb, struct fs_context *fc)
+{
+ struct btrfs_fs_info *p = fc->s_fs_info;
+ struct btrfs_fs_info *fs_info = btrfs_sb(sb);
+
+ return fs_info->fs_devices == p->fs_devices;
+}
+
+static int btrfs_get_tree_super(struct fs_context *fc)
+{
+ struct btrfs_fs_info *fs_info = fc->s_fs_info;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_devices *fs_devices = NULL;
+ struct block_device *bdev;
+ struct btrfs_device *device;
+ struct super_block *sb;
+ blk_mode_t mode = btrfs_open_mode(fc);
+ int ret;
+
+ btrfs_ctx_to_info(fs_info, ctx);
+ mutex_lock(&uuid_mutex);
+
+ /*
+ * With 'true' passed to btrfs_scan_one_device() (mount time) we expect
+ * either a valid device or an error.
+ */
+ device = btrfs_scan_one_device(fc->source, mode, true);
+ ASSERT(device != NULL);
+ if (IS_ERR(device)) {
+ mutex_unlock(&uuid_mutex);
+ return PTR_ERR(device);
+ }
+
+ fs_devices = device->fs_devices;
+ fs_info->fs_devices = fs_devices;
+
+ ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type);
+ mutex_unlock(&uuid_mutex);
+ if (ret)
+ return ret;
+
+ if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) {
+ ret = -EACCES;
+ goto error;
+ }
+
+ bdev = fs_devices->latest_dev->bdev;
+
+ /*
+ * From now on the error handling is not straightforward.
+ *
+ * If successful, this will transfer the fs_info into the super block,
+ * and fc->s_fs_info will be NULL. However if there's an existing
+ * super, we'll still have fc->s_fs_info populated. If we error
+ * completely out it'll be cleaned up when we drop the fs_context,
+ * otherwise it's tied to the lifetime of the super_block.
+ */
+ sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc);
+ if (IS_ERR(sb)) {
+ ret = PTR_ERR(sb);
+ goto error;
+ }
+
+ set_device_specific_options(fs_info);
+
+ if (sb->s_root) {
+ btrfs_close_devices(fs_devices);
+ if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY)
+ ret = -EBUSY;
+ } else {
+ snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
+ shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id);
+ btrfs_sb(sb)->bdev_holder = &btrfs_fs_type;
+ ret = btrfs_fill_super(sb, fs_devices, NULL);
+ }
+
+ if (ret) {
+ deactivate_locked_super(sb);
+ return ret;
+ }
+
+ btrfs_clear_oneshot_options(fs_info);
+
+ fc->root = dget(sb->s_root);
+ return 0;
+
+error:
+ btrfs_close_devices(fs_devices);
+ return ret;
+}
+
+/*
+ * Ever since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes
+ * with different ro/rw options") the following works:
+ *
+ * (i) mount /dev/sda3 -o subvol=foo,ro /mnt/foo
+ * (ii) mount /dev/sda3 -o subvol=bar,rw /mnt/bar
+ *
+ * which looks nice and innocent but is actually pretty intricate and deserves
+ * a long comment.
+ *
+ * On another filesystem a subvolume mount is close to something like:
+ *
+ * (iii) # create rw superblock + initial mount
+ * mount -t xfs /dev/sdb /opt/
+ *
+ * # create ro bind mount
+ * mount --bind -o ro /opt/foo /mnt/foo
+ *
+ * # unmount initial mount
+ * umount /opt
+ *
+ * Of course, there's some special subvolume sauce and there's the fact that the
+ * sb->s_root dentry is really swapped after mount_subtree(). But conceptually
+ * it's very close and will help us understand the issue.
+ *
+ * The old mount API didn't cleanly distinguish between a mount being made ro
+ * and a superblock being made ro. The only way to change the ro state of
+ * either object was by passing ms_rdonly. If a new mount was created via
+ * mount(2) such as:
+ *
+ * mount("/dev/sdb", "/mnt", "xfs", ms_rdonly, null);
+ *
+ * the MS_RDONLY flag being specified had two effects:
+ *
+ * (1) MNT_READONLY was raised -> the resulting mount got
+ * @mnt->mnt_flags |= MNT_READONLY raised.
+ *
+ * (2) MS_RDONLY was passed to the filesystem's mount method and the filesystems
+ * made the superblock ro. Note, how SB_RDONLY has the same value as
+ * ms_rdonly and is raised whenever MS_RDONLY is passed through mount(2).
+ *
+ * Creating a subtree mount via (iii) ends up leaving a rw superblock with a
+ * subtree mounted ro.
+ *
+ * But consider the effect on the old mount API on btrfs subvolume mounting
+ * which combines the distinct step in (iii) into a single step.
+ *
+ * By issuing (i) both the mount and the superblock are turned ro. Now when (ii)
+ * is issued the superblock is ro and thus even if the mount created for (ii) is
+ * rw it wouldn't help. Hence, btrfs needed to transition the superblock from ro
+ * to rw for (ii) which it did using an internal remount call.
+ *
+ * IOW, subvolume mounting was inherently complicated due to the ambiguity of
+ * MS_RDONLY in mount(2). Note, this ambiguity has mount(8) always translate
+ * "ro" to MS_RDONLY. IOW, in both (i) and (ii) "ro" becomes MS_RDONLY when
+ * passed by mount(8) to mount(2).
+ *
+ * Enter the new mount API. The new mount API disambiguates making a mount ro
+ * and making a superblock ro.
+ *
+ * (3) To turn a mount ro the MOUNT_ATTR_ONLY flag can be used with either
+ * fsmount() or mount_setattr() this is a pure VFS level change for a
+ * specific mount or mount tree that is never seen by the filesystem itself.
+ *
+ * (4) To turn a superblock ro the "ro" flag must be used with
+ * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem
+ * in fc->sb_flags.
+ *
+ * This disambiguation has rather positive consequences. Mounting a subvolume
+ * ro will not also turn the superblock ro. Only the mount for the subvolume
+ * will become ro.
+ *
+ * So, if the superblock creation request comes from the new mount API the
+ * caller must have explicitly done:
+ *
+ * fsconfig(FSCONFIG_SET_FLAG, "ro")
+ * fsmount/mount_setattr(MOUNT_ATTR_RDONLY)
+ *
+ * IOW, at some point the caller must have explicitly turned the whole
+ * superblock ro and we shouldn't just undo it like we did for the old mount
+ * API. In any case, it lets us avoid the hack in the new mount API.
+ *
+ * Consequently, the remounting hack must only be used for requests originating
+ * from the old mount API and should be marked for full deprecation so it can be
+ * turned off in a couple of years.
+ *
+ * The new mount API has no reason to support this hack.
+ */
+static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc)
+{
+ struct vfsmount *mnt;
+ int ret;
+ const bool ro2rw = !(fc->sb_flags & SB_RDONLY);
+
+ /*
+ * We got an EBUSY because our SB_RDONLY flag didn't match the existing
+ * super block, so invert our setting here and retry the mount so we
+ * can get our vfsmount.
+ */
+ if (ro2rw)
+ fc->sb_flags |= SB_RDONLY;
+ else
+ fc->sb_flags &= ~SB_RDONLY;
+
+ mnt = fc_mount(fc);
+ if (IS_ERR(mnt))
+ return mnt;
+
+ if (!fc->oldapi || !ro2rw)
+ return mnt;
+
+ /* We need to convert to rw, call reconfigure. */
+ fc->sb_flags &= ~SB_RDONLY;
+ down_write(&mnt->mnt_sb->s_umount);
+ ret = btrfs_reconfigure(fc);
+ up_write(&mnt->mnt_sb->s_umount);
+ if (ret) {
+ mntput(mnt);
+ return ERR_PTR(ret);
+ }
+ return mnt;
+}
+
+static int btrfs_get_tree_subvol(struct fs_context *fc)
+{
+ struct btrfs_fs_info *fs_info = NULL;
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct fs_context *dup_fc;
+ struct dentry *dentry;
+ struct vfsmount *mnt;
+
+ /*
+ * Setup a dummy root and fs_info for test/set super. This is because
+ * we don't actually fill this stuff out until open_ctree, but we need
+ * then open_ctree will properly initialize the file system specific
+ * settings later. btrfs_init_fs_info initializes the static elements
+ * of the fs_info (locks and such) to make cleanup easier if we find a
+ * superblock with our given fs_devices later on at sget() time.
+ */
+ fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL);
+ if (!fs_info)
+ return -ENOMEM;
+
+ fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+ fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL);
+ if (!fs_info->super_copy || !fs_info->super_for_commit) {
+ btrfs_free_fs_info(fs_info);
+ return -ENOMEM;
+ }
+ btrfs_init_fs_info(fs_info);
+
+ dup_fc = vfs_dup_fs_context(fc);
+ if (IS_ERR(dup_fc)) {
+ btrfs_free_fs_info(fs_info);
+ return PTR_ERR(dup_fc);
+ }
+
+ /*
+ * When we do the sget_fc this gets transferred to the sb, so we only
+ * need to set it on the dup_fc as this is what creates the super block.
+ */
+ dup_fc->s_fs_info = fs_info;
+
+ /*
+ * We'll do the security settings in our btrfs_get_tree_super() mount
+ * loop, they were duplicated into dup_fc, we can drop the originals
+ * here.
+ */
+ security_free_mnt_opts(&fc->security);
+ fc->security = NULL;
+
+ mnt = fc_mount(dup_fc);
+ if (PTR_ERR_OR_ZERO(mnt) == -EBUSY)
+ mnt = btrfs_reconfigure_for_mount(dup_fc);
+ put_fs_context(dup_fc);
+ if (IS_ERR(mnt))
+ return PTR_ERR(mnt);
+
+ /*
+ * This free's ->subvol_name, because if it isn't set we have to
+ * allocate a buffer to hold the subvol_name, so we just drop our
+ * reference to it here.
+ */
+ dentry = mount_subvol(ctx->subvol_name, ctx->subvol_objectid, mnt);
+ ctx->subvol_name = NULL;
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);
+
+ fc->root = dentry;
+ return 0;
+}
+
+static int btrfs_get_tree(struct fs_context *fc)
+{
+ /*
+ * Since we use mount_subtree to mount the default/specified subvol, we
+ * have to do mounts in two steps.
+ *
+ * First pass through we call btrfs_get_tree_subvol(), this is just a
+ * wrapper around fc_mount() to call back into here again, and this time
+ * we'll call btrfs_get_tree_super(). This will do the open_ctree() and
+ * everything to open the devices and file system. Then we return back
+ * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and
+ * from there we can do our mount_subvol() call, which will lookup
+ * whichever subvol we're mounting and setup this fc with the
+ * appropriate dentry for the subvol.
+ */
+ if (fc->s_fs_info)
+ return btrfs_get_tree_super(fc);
+ return btrfs_get_tree_subvol(fc);
+}
+
static void btrfs_kill_super(struct super_block *sb)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
@@ -2101,22 +2094,85 @@ static void btrfs_kill_super(struct super_block *sb)
btrfs_free_fs_info(fs_info);
}
-static struct file_system_type btrfs_fs_type = {
- .owner = THIS_MODULE,
- .name = "btrfs",
- .mount = btrfs_mount,
- .kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA,
-};
+static void btrfs_free_fs_context(struct fs_context *fc)
+{
+ struct btrfs_fs_context *ctx = fc->fs_private;
+ struct btrfs_fs_info *fs_info = fc->s_fs_info;
+
+ if (fs_info)
+ btrfs_free_fs_info(fs_info);
+
+ if (ctx && refcount_dec_and_test(&ctx->refs)) {
+ kfree(ctx->subvol_name);
+ kfree(ctx);
+ }
+}
+
+static int btrfs_dup_fs_context(struct fs_context *fc, struct fs_context *src_fc)
+{
+ struct btrfs_fs_context *ctx = src_fc->fs_private;
-static struct file_system_type btrfs_root_fs_type = {
- .owner = THIS_MODULE,
- .name = "btrfs",
- .mount = btrfs_mount_root,
- .kill_sb = btrfs_kill_super,
- .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ /*
+ * Give a ref to our ctx to this dup, as we want to keep it around for
+ * our original fc so we can have the subvolume name or objectid.
+ *
+ * We unset ->source in the original fc because the dup needs it for
+ * mounting, and then once we free the dup it'll free ->source, so we
+ * need to make sure we're only pointing to it in one fc.
+ */
+ refcount_inc(&ctx->refs);
+ fc->fs_private = ctx;
+ fc->source = src_fc->source;
+ src_fc->source = NULL;
+ return 0;
+}
+
+static const struct fs_context_operations btrfs_fs_context_ops = {
+ .parse_param = btrfs_parse_param,
+ .reconfigure = btrfs_reconfigure,
+ .get_tree = btrfs_get_tree,
+ .dup = btrfs_dup_fs_context,
+ .free = btrfs_free_fs_context,
};
+static int btrfs_init_fs_context(struct fs_context *fc)
+{
+ struct btrfs_fs_context *ctx;
+
+ ctx = kzalloc(sizeof(struct btrfs_fs_context), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ refcount_set(&ctx->refs, 1);
+ fc->fs_private = ctx;
+ fc->ops = &btrfs_fs_context_ops;
+
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ btrfs_info_to_ctx(btrfs_sb(fc->root->d_sb), ctx);
+ } else {
+ ctx->thread_pool_size =
+ min_t(unsigned long, num_online_cpus() + 2, 8);
+ ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE;
+ ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+ }
+
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+ fc->sb_flags |= SB_POSIXACL;
+#endif
+ fc->sb_flags |= SB_I_VERSION;
+
+ return 0;
+}
+
+static struct file_system_type btrfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "btrfs",
+ .init_fs_context = btrfs_init_fs_context,
+ .parameters = btrfs_fs_parameters,
+ .kill_sb = btrfs_kill_super,
+ .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP,
+ };
+
MODULE_ALIAS_FS("btrfs");
static int btrfs_control_open(struct inode *inode, struct file *file)
@@ -2328,7 +2384,6 @@ static const struct super_operations btrfs_super_ops = {
.destroy_inode = btrfs_destroy_inode,
.free_inode = btrfs_free_inode,
.statfs = btrfs_statfs,
- .remount_fs = btrfs_remount,
.freeze_fs = btrfs_freeze,
.unfreeze_fs = btrfs_unfreeze,
};
diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h
index 8dbb909b364f..f18253ca280d 100644
--- a/fs/btrfs/super.h
+++ b/fs/btrfs/super.h
@@ -3,11 +3,12 @@
#ifndef BTRFS_SUPER_H
#define BTRFS_SUPER_H
-int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
- unsigned long new_flags);
+bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt,
+ unsigned long flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
u64 subvol_objectid);
+void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info);
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e6b51fb3ddc1..84c05246ffd8 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1783,6 +1783,10 @@ static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj,
unsigned long long limit;
limit = memparse(buf, &endptr);
+ /* There could be trailing '\n', also catch any typos after the value. */
+ endptr = skip_spaces(endptr);
+ if (*endptr != 0)
+ return -EINVAL;
WRITE_ONCE(device->scrub_speed_max, limit);
return len;
}
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index ca09cf9afce8..709c6cc9706a 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -28,6 +28,7 @@ const char *test_error[] = {
[TEST_ALLOC_INODE] = "cannot allocate inode",
[TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group",
[TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map",
+ [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map",
};
static const struct super_operations btrfs_test_super_ops = {
@@ -102,7 +103,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
if (!dev)
return ERR_PTR(-ENOMEM);
- extent_io_tree_init(NULL, &dev->alloc_state, 0);
+ extent_io_tree_init(fs_info, &dev->alloc_state, 0);
INIT_LIST_HEAD(&dev->dev_list);
list_add(&dev->dev_list, &fs_info->fs_devices->devices);
@@ -185,7 +186,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
}
spin_unlock(&fs_info->buffer_lock);
- btrfs_mapping_tree_free(&fs_info->mapping_tree);
+ btrfs_mapping_tree_free(fs_info);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
dev_list) {
btrfs_free_dummy_device(dev);
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index 7a2d7ffbe30e..dc2f2ab15fa5 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -23,6 +23,7 @@ enum {
TEST_ALLOC_INODE,
TEST_ALLOC_BLOCK_GROUP,
TEST_ALLOC_EXTENT_MAP,
+ TEST_ALLOC_CHUNK_MAP,
};
extern const char *test_error[];
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 1cc86af97dc6..25b3349595e0 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -652,7 +652,7 @@ static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory,
const char *test_name)
{
for (int i = 0; i < eb->len; i++) {
- struct page *page = eb->pages[i >> PAGE_SHIFT];
+ struct page *page = folio_page(eb->folios[i >> PAGE_SHIFT], 0);
void *addr = page_address(page) + offset_in_page(i);
if (memcmp(addr, memory + i, 1) != 0) {
@@ -668,7 +668,7 @@ static int verify_eb_and_memory(struct extent_buffer *eb, void *memory,
const char *test_name)
{
for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) {
- void *eb_addr = page_address(eb->pages[i]);
+ void *eb_addr = folio_address(eb->folios[i]);
if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) {
dump_eb_and_memory_contents(eb, memory, test_name);
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 29bdd08b241f..253cce7ffecf 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -25,7 +25,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
#ifdef CONFIG_BTRFS_DEBUG
if (refcount_read(&em->refs) != 1) {
test_err(
-"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d",
+"em leak: em (start %llu len %llu block_start %llu block_len %llu) refs %d",
em->start, em->len, em->block_start,
em->block_len, refcount_read(&em->refs));
@@ -73,7 +73,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 16K)");
@@ -94,7 +94,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
em->block_start = SZ_32K; /* avoid merging */
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [16K, 20K)");
@@ -121,9 +121,14 @@ static int test_case_1(struct btrfs_fs_info *fs_info,
test_err("case1 [%llu %llu]: ret %d", start, start + len, ret);
goto out;
}
- if (em &&
- (em->start != 0 || extent_map_end(em) != SZ_16K ||
- em->block_start != 0 || em->block_len != SZ_16K)) {
+ if (!em) {
+ test_err("case1 [%llu %llu]: no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
+ if (em->start != 0 || extent_map_end(em) != SZ_16K ||
+ em->block_start != 0 || em->block_len != SZ_16K) {
test_err(
"case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
start, start + len, ret, em->start, em->len,
@@ -161,7 +166,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->block_start = EXTENT_MAP_INLINE;
em->block_len = (u64)-1;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 1K)");
@@ -182,7 +187,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
em->block_start = SZ_4K;
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
@@ -209,9 +214,13 @@ static int test_case_2(struct btrfs_fs_info *fs_info,
test_err("case2 [0 1K]: ret %d", ret);
goto out;
}
- if (em &&
- (em->start != 0 || extent_map_end(em) != SZ_1K ||
- em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) {
+ if (!em) {
+ test_err("case2 [0 1K]: no extent map returned");
+ ret = -ENOENT;
+ goto out;
+ }
+ if (em->start != 0 || extent_map_end(em) != SZ_1K ||
+ em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1) {
test_err(
"case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu",
ret, em->start, em->len, em->block_start,
@@ -244,7 +253,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
em->block_start = SZ_4K;
em->block_len = SZ_4K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [4K, 8K)");
@@ -268,19 +277,24 @@ static int __test_case_3(struct btrfs_fs_info *fs_info,
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
- test_err("case3 [0x%llx 0x%llx): ret %d",
+ test_err("case3 [%llu %llu): ret %d",
start, start + len, ret);
goto out;
}
+ if (!em) {
+ test_err("case3 [%llu %llu): no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
/*
* Since bytes within em are contiguous, em->block_start is identical to
* em->start.
*/
- if (em &&
- (start < em->start || start + len > extent_map_end(em) ||
- em->start != em->block_start || em->len != em->block_len)) {
+ if (start < em->start || start + len > extent_map_end(em) ||
+ em->start != em->block_start || em->len != em->block_len) {
test_err(
-"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
+"case3 [%llu %llu): ret %d em (start %llu len %llu block_start %llu block_len %llu)",
start, start + len, ret, em->start, em->len,
em->block_start, em->block_len);
ret = -EINVAL;
@@ -343,7 +357,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->block_start = 0;
em->block_len = SZ_8K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [0, 8K)");
@@ -364,7 +378,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
em->block_start = SZ_16K; /* avoid merging */
em->block_len = 24 * SZ_1K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("cannot add extent range [8K, 32K)");
@@ -387,14 +401,20 @@ static int __test_case_4(struct btrfs_fs_info *fs_info,
ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
if (ret) {
- test_err("case4 [0x%llx 0x%llx): ret %d",
- start, len, ret);
+ test_err("case4 [%llu %llu): ret %d",
+ start, start + len, ret);
goto out;
}
- if (em && (start < em->start || start + len > extent_map_end(em))) {
+ if (!em) {
+ test_err("case4 [%llu %llu): no extent map returned",
+ start, start + len);
+ ret = -ENOENT;
+ goto out;
+ }
+ if (start < em->start || start + len > extent_map_end(em)) {
test_err(
-"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)",
- start, len, ret, em->start, em->len, em->block_start,
+"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu block_start %llu block_len %llu)",
+ start, start + len, ret, em->start, em->len, em->block_start,
em->block_len);
ret = -EINVAL;
}
@@ -443,7 +463,8 @@ static int test_case_4(struct btrfs_fs_info *fs_info,
return ret;
}
-static int add_compressed_extent(struct extent_map_tree *em_tree,
+static int add_compressed_extent(struct btrfs_fs_info *fs_info,
+ struct extent_map_tree *em_tree,
u64 start, u64 len, u64 block_start)
{
struct extent_map *em;
@@ -459,9 +480,9 @@ static int add_compressed_extent(struct extent_map_tree *em_tree,
em->len = len;
em->block_start = block_start;
em->block_len = SZ_4K;
- set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
+ em->flags |= EXTENT_FLAG_COMPRESS_ZLIB;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
free_extent_map(em);
if (ret < 0) {
@@ -567,7 +588,7 @@ static int validate_range(struct extent_map_tree *em_tree, int index)
* They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from
* merging the em's.
*/
-static int test_case_5(void)
+static int test_case_5(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree;
struct inode *inode;
@@ -585,35 +606,35 @@ static int test_case_5(void)
em_tree = &BTRFS_I(inode)->extent_tree;
/* [0, 12k) */
- ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0);
+ ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K * 3, 0);
if (ret) {
test_err("cannot add extent range [0, 12K)");
goto out;
}
/* [12k, 24k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [24k, 36k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [36k, 40k) */
- ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
}
/* [40k, 64k) */
- ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K);
if (ret) {
test_err("cannot add extent range [12k, 24k)");
goto out;
@@ -665,11 +686,11 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em
struct extent_map *em = NULL;
int ret;
- ret = add_compressed_extent(em_tree, 0, SZ_4K, 0);
+ ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K, 0);
if (ret)
goto out;
- ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0);
+ ret = add_compressed_extent(fs_info, em_tree, SZ_4K, SZ_4K, 0);
if (ret)
goto out;
@@ -713,7 +734,7 @@ out:
* true would mess up the start/end calculations and subsequent splits would be
* incorrect.
*/
-static int test_case_7(void)
+static int test_case_7(struct btrfs_fs_info *fs_info)
{
struct extent_map_tree *em_tree;
struct extent_map *em;
@@ -742,9 +763,9 @@ static int test_case_7(void)
em->len = SZ_16K;
em->block_start = 0;
em->block_len = SZ_4K;
- set_bit(EXTENT_FLAG_PINNED, &em->flags);
+ em->flags |= EXTENT_FLAG_PINNED;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("couldn't add extent map");
@@ -765,7 +786,7 @@ static int test_case_7(void)
em->block_start = SZ_32K;
em->block_len = SZ_16K;
write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
write_unlock(&em_tree->lock);
if (ret < 0) {
test_err("couldn't add extent map");
@@ -859,33 +880,21 @@ struct rmap_test_vector {
static int test_rmap_block(struct btrfs_fs_info *fs_info,
struct rmap_test_vector *test)
{
- struct extent_map *em;
- struct map_lookup *map = NULL;
+ struct btrfs_chunk_map *map;
u64 *logical = NULL;
int i, out_ndaddrs, out_stripe_len;
int ret;
- em = alloc_extent_map();
- if (!em) {
- test_std_err(TEST_ALLOC_EXTENT_MAP);
- return -ENOMEM;
- }
-
- map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL);
+ map = btrfs_alloc_chunk_map(test->num_stripes, GFP_KERNEL);
if (!map) {
- kfree(em);
- test_std_err(TEST_ALLOC_EXTENT_MAP);
+ test_std_err(TEST_ALLOC_CHUNK_MAP);
return -ENOMEM;
}
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
/* Start at 4GiB logical address */
- em->start = SZ_4G;
- em->len = test->data_stripe_size * test->num_data_stripes;
- em->block_len = em->len;
- em->orig_block_len = test->data_stripe_size;
- em->map_lookup = map;
-
+ map->start = SZ_4G;
+ map->chunk_len = test->data_stripe_size * test->num_data_stripes;
+ map->stripe_size = test->data_stripe_size;
map->num_stripes = test->num_stripes;
map->type = test->raid_type;
@@ -901,15 +910,13 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
map->stripes[i].physical = test->data_stripe_phys_start[i];
}
- write_lock(&fs_info->mapping_tree.lock);
- ret = add_extent_mapping(&fs_info->mapping_tree, em, 0);
- write_unlock(&fs_info->mapping_tree.lock);
+ ret = btrfs_add_chunk_map(fs_info, map);
if (ret) {
- test_err("error adding block group mapping to mapping tree");
+ test_err("error adding chunk map to mapping tree");
goto out_free;
}
- ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1),
+ ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1),
&logical, &out_ndaddrs, &out_stripe_len);
if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) {
test_err("didn't rmap anything but expected %d",
@@ -938,14 +945,8 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info,
ret = 0;
out:
- write_lock(&fs_info->mapping_tree.lock);
- remove_extent_mapping(&fs_info->mapping_tree, em);
- write_unlock(&fs_info->mapping_tree.lock);
- /* For us */
- free_extent_map(em);
+ btrfs_remove_chunk_map(fs_info, map);
out_free:
- /* For the tree */
- free_extent_map(em);
kfree(logical);
return ret;
}
@@ -1022,13 +1023,13 @@ int btrfs_test_extent_map(void)
ret = test_case_4(fs_info, em_tree);
if (ret)
goto out;
- ret = test_case_5();
+ ret = test_case_5(fs_info);
if (ret)
goto out;
ret = test_case_6(fs_info, em_tree);
if (ret)
goto out;
- ret = test_case_7();
+ ret = test_case_7(fs_info);
if (ret)
goto out;
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 492d69d2fa73..9957de9f7806 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -211,9 +211,9 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot);
}
-static unsigned long prealloc_only = 0;
-static unsigned long compressed_only = 0;
-static unsigned long vacancy_only = 0;
+static u32 prealloc_only = 0;
+static u32 compressed_only = 0;
+static u32 vacancy_only = 0;
static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
{
@@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
/*
@@ -332,7 +332,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
offset = em->start + em->len;
@@ -355,7 +355,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -383,7 +383,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -412,7 +412,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
offset = em->start + em->len;
@@ -434,7 +434,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != orig_start) {
@@ -468,7 +468,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
@@ -497,7 +497,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
@@ -527,7 +527,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != orig_start) {
@@ -560,7 +560,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != prealloc_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
prealloc_only, em->flags);
goto out;
}
@@ -595,7 +595,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
@@ -604,9 +604,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->orig_start);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
@@ -629,7 +629,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
@@ -638,9 +638,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, em->orig_start);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
disk_bytenr = em->block_start;
@@ -664,7 +664,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != compressed_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
compressed_only, em->flags);
goto out;
}
@@ -701,9 +701,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
em->start, orig_start);
goto out;
}
- if (em->compress_type != BTRFS_COMPRESS_ZLIB) {
+ if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) {
test_err("unexpected compress type, wanted %d, got %d",
- BTRFS_COMPRESS_ZLIB, em->compress_type);
+ BTRFS_COMPRESS_ZLIB, extent_map_compression(em));
goto out;
}
offset = em->start + em->len;
@@ -726,7 +726,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -758,7 +758,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != vacancy_only) {
- test_err("unexpected flags set, want %lu have %lu",
+ test_err("unexpected flags set, want %u have %u",
vacancy_only, em->flags);
goto out;
}
@@ -786,7 +786,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, want 0 have %lu", em->flags);
+ test_err("unexpected flags set, want 0 have %u", em->flags);
goto out;
}
if (em->orig_start != em->start) {
@@ -866,7 +866,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != vacancy_only) {
- test_err("wrong flags, wanted %lu, have %lu", vacancy_only,
+ test_err("wrong flags, wanted %u, have %u", vacancy_only,
em->flags);
goto out;
}
@@ -888,7 +888,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
goto out;
}
if (em->flags != 0) {
- test_err("unexpected flags set, wanted 0 got %lu",
+ test_err("unexpected flags set, wanted 0 got %u",
em->flags);
goto out;
}
@@ -1095,8 +1095,8 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize)
test_msg("running inode tests");
- set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only);
- set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only);
+ compressed_only |= EXTENT_FLAG_COMPRESS_ZLIB;
+ prealloc_only |= EXTENT_FLAG_PREALLOC;
ret = test_btrfs_get_extent(sectorsize, nodesize);
if (ret)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 5b3333ceef04..bf8e64c766b6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -564,56 +564,22 @@ static int btrfs_reserve_trans_metadata(struct btrfs_fs_info *fs_info,
u64 num_bytes,
u64 *delayed_refs_bytes)
{
- struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
struct btrfs_space_info *si = fs_info->trans_block_rsv.space_info;
- u64 extra_delayed_refs_bytes = 0;
- u64 bytes;
+ u64 bytes = num_bytes + *delayed_refs_bytes;
int ret;
/*
- * If there's a gap between the size of the delayed refs reserve and
- * its reserved space, than some tasks have added delayed refs or bumped
- * its size otherwise (due to block group creation or removal, or block
- * group item update). Also try to allocate that gap in order to prevent
- * using (and possibly abusing) the global reserve when committing the
- * transaction.
- */
- if (flush == BTRFS_RESERVE_FLUSH_ALL &&
- !btrfs_block_rsv_full(delayed_refs_rsv)) {
- spin_lock(&delayed_refs_rsv->lock);
- if (delayed_refs_rsv->size > delayed_refs_rsv->reserved)
- extra_delayed_refs_bytes = delayed_refs_rsv->size -
- delayed_refs_rsv->reserved;
- spin_unlock(&delayed_refs_rsv->lock);
- }
-
- bytes = num_bytes + *delayed_refs_bytes + extra_delayed_refs_bytes;
-
- /*
* We want to reserve all the bytes we may need all at once, so we only
* do 1 enospc flushing cycle per transaction start.
*/
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
- if (ret == 0) {
- if (extra_delayed_refs_bytes > 0)
- btrfs_migrate_to_delayed_refs_rsv(fs_info,
- extra_delayed_refs_bytes);
- return 0;
- }
-
- if (extra_delayed_refs_bytes > 0) {
- bytes -= extra_delayed_refs_bytes;
- ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
- if (ret == 0)
- return 0;
- }
/*
* If we are an emergency flush, which can steal from the global block
* reserve, then attempt to not reserve space for the delayed refs, as
* we will consume space for them from the global block reserve.
*/
- if (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
+ if (ret && flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
bytes -= *delayed_refs_bytes;
*delayed_refs_bytes = 0;
ret = btrfs_reserve_metadata_bytes(fs_info, si, bytes, flush);
@@ -1868,7 +1834,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
key.offset = (u64)-1;
- pending->snap = btrfs_get_new_fs_root(fs_info, objectid, pending->anon_dev);
+ pending->snap = btrfs_get_new_fs_root(fs_info, objectid, &pending->anon_dev);
if (IS_ERR(pending->snap)) {
ret = PTR_ERR(pending->snap);
pending->snap = NULL;
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 50fdc69fdddf..6eccf8496486 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1436,7 +1436,7 @@ static int check_extent_item(struct extent_buffer *leaf,
if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) {
extent_err(leaf, slot,
"inline ref item overflows extent item, ptr %lu iref size %u end %lu",
- ptr, inline_type, end);
+ ptr, btrfs_extent_inline_ref_size(inline_type), end);
return -EUCLEAN;
}
diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h
index 3c2a02a72f64..14b9fbe82da4 100644
--- a/fs/btrfs/tree-checker.h
+++ b/fs/btrfs/tree-checker.h
@@ -22,7 +22,7 @@ struct btrfs_tree_parent_check {
/*
* Expected transid, can be 0 to skip the check, but such skip
- * should only be utlized for backref walk related code.
+ * should only be utilized for backref walk related code.
*/
u64 transid;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7d6729d9fd2f..331fc7429952 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2575,7 +2575,6 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans,
ret = btrfs_pin_reserved_extent(trans, eb);
if (ret)
return ret;
- btrfs_redirty_list_add(trans->transaction, eb);
} else {
unaccount_log_buffer(eb->fs_info, eb->start);
}
@@ -4520,7 +4519,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
int ret = 0;
if (inode->flags & BTRFS_INODE_NODATASUM ||
- test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+ (em->flags & EXTENT_FLAG_PREALLOC) ||
em->block_start == EXTENT_MAP_HOLE)
return 0;
@@ -4583,7 +4582,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans,
return 0;
/* If we're compressed we have to save the entire range of csums. */
- if (em->compress_type) {
+ if (extent_map_is_compressed(em)) {
csum_offset = 0;
csum_len = max(em->block_len, em->orig_block_len);
} else {
@@ -4623,18 +4622,20 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
struct btrfs_file_extent_item fi = { 0 };
struct extent_buffer *leaf;
struct btrfs_key key;
+ enum btrfs_compression_type compress_type;
u64 extent_offset = em->start - em->orig_start;
u64 block_len;
int ret;
btrfs_set_stack_file_extent_generation(&fi, trans->transid);
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+ if (em->flags & EXTENT_FLAG_PREALLOC)
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC);
else
btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG);
block_len = max(em->block_len, em->orig_block_len);
- if (em->compress_type != BTRFS_COMPRESS_NONE) {
+ compress_type = extent_map_compression(em);
+ if (compress_type != BTRFS_COMPRESS_NONE) {
btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start);
btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len);
} else if (em->block_start < EXTENT_MAP_LAST_BYTE) {
@@ -4646,7 +4647,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
btrfs_set_stack_file_extent_offset(&fi, extent_offset);
btrfs_set_stack_file_extent_num_bytes(&fi, em->len);
btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes);
- btrfs_set_stack_file_extent_compression(&fi, em->compress_type);
+ btrfs_set_stack_file_extent_compression(&fi, compress_type);
ret = log_extent_csums(trans, inode, log, em, ctx);
if (ret)
@@ -4859,13 +4860,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
continue;
/* We log prealloc extents beyond eof later. */
- if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) &&
+ if ((em->flags & EXTENT_FLAG_PREALLOC) &&
em->start >= i_size_read(&inode->vfs_inode))
continue;
/* Need a ref to keep it from getting evicted from cache */
refcount_inc(&em->refs);
- set_bit(EXTENT_FLAG_LOGGING, &em->flags);
+ em->flags |= EXTENT_FLAG_LOGGING;
list_add_tail(&em->list, &extents);
num++;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f627674b37db..d67785be2c77 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -41,6 +41,17 @@
BTRFS_BLOCK_GROUP_RAID10 | \
BTRFS_BLOCK_GROUP_RAID56_MASK)
+struct btrfs_io_geometry {
+ u32 stripe_index;
+ u32 stripe_nr;
+ int mirror_num;
+ int num_stripes;
+ u64 stripe_offset;
+ u64 raid56_full_stripe_start;
+ int max_errors;
+ enum btrfs_map_op op;
+};
+
const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
[BTRFS_RAID_RAID10] = {
.sub_stripes = 2,
@@ -1742,19 +1753,18 @@ out:
static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
struct rb_node *n;
u64 ret = 0;
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- n = rb_last(&em_tree->map.rb_root);
+ read_lock(&fs_info->mapping_tree_lock);
+ n = rb_last(&fs_info->mapping_tree.rb_root);
if (n) {
- em = rb_entry(n, struct extent_map, rb_node);
- ret = em->start + em->len;
+ struct btrfs_chunk_map *map;
+
+ map = rb_entry(n, struct btrfs_chunk_map, rb_node);
+ ret = map->start + map->chunk_len;
}
- read_unlock(&em_tree->lock);
+ read_unlock(&fs_info->mapping_tree_lock);
return ret;
}
@@ -2986,6 +2996,81 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
return ret;
}
+struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node;
+ struct rb_node *prev = NULL;
+ struct rb_node *orig_prev;
+ struct btrfs_chunk_map *map;
+ struct btrfs_chunk_map *prev_map = NULL;
+
+ while (node) {
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ prev = node;
+ prev_map = map;
+
+ if (logical < map->start) {
+ node = node->rb_left;
+ } else if (logical >= map->start + map->chunk_len) {
+ node = node->rb_right;
+ } else {
+ refcount_inc(&map->refs);
+ return map;
+ }
+ }
+
+ if (!prev)
+ return NULL;
+
+ orig_prev = prev;
+ while (prev && logical >= prev_map->start + prev_map->chunk_len) {
+ prev = rb_next(prev);
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ }
+
+ if (!prev) {
+ prev = orig_prev;
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ while (prev && logical < prev_map->start) {
+ prev = rb_prev(prev);
+ prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node);
+ }
+ }
+
+ if (prev) {
+ u64 end = logical + length;
+
+ /*
+ * Caller can pass a U64_MAX length when it wants to get any
+ * chunk starting at an offset of 'logical' or higher, so deal
+ * with underflow by resetting the end offset to U64_MAX.
+ */
+ if (end < logical)
+ end = U64_MAX;
+
+ if (end > prev_map->start &&
+ logical < prev_map->start + prev_map->chunk_len) {
+ refcount_inc(&prev_map->refs);
+ return prev_map;
+ }
+ }
+
+ return NULL;
+}
+
+struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
+{
+ struct btrfs_chunk_map *map;
+
+ read_lock(&fs_info->mapping_tree_lock);
+ map = btrfs_find_chunk_map_nolock(fs_info, logical, length);
+ read_unlock(&fs_info->mapping_tree_lock);
+
+ return map;
+}
+
/*
* Find the mapping containing the given logical extent.
*
@@ -2994,38 +3079,35 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
*
* Return: Chunk mapping or ERR_PTR.
*/
-struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length)
+struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length)
{
- struct extent_map_tree *em_tree;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
- em_tree = &fs_info->mapping_tree;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, length);
- read_unlock(&em_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, logical, length);
- if (!em) {
+ if (unlikely(!map)) {
btrfs_crit(fs_info,
"unable to find chunk map for logical %llu length %llu",
logical, length);
return ERR_PTR(-EINVAL);
}
- if (em->start > logical || em->start + em->len <= logical) {
+ if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) {
btrfs_crit(fs_info,
"found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
- logical, logical + length, em->start, em->start + em->len);
- free_extent_map(em);
+ logical, logical + length, map->start,
+ map->start + map->chunk_len);
+ btrfs_free_chunk_map(map);
return ERR_PTR(-EINVAL);
}
- /* callers are responsible for dropping em's ref. */
- return em;
+ /* Callers are responsible for dropping the reference. */
+ return map;
}
static int remove_chunk_item(struct btrfs_trans_handle *trans,
- struct map_lookup *map, u64 chunk_offset)
+ struct btrfs_chunk_map *map, u64 chunk_offset)
{
int i;
@@ -3050,23 +3132,21 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans,
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 dev_extent_len = 0;
int i, ret = 0;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(em)) {
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map)) {
/*
* This is a logic error, but we don't want to just rely on the
* user having built with ASSERT enabled, so if ASSERT doesn't
* do anything we still error out.
*/
ASSERT(0);
- return PTR_ERR(em);
+ return PTR_ERR(map);
}
- map = em->map_lookup;
/*
* First delete the device extent items from the devices btree.
@@ -3169,7 +3249,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
goto out;
}
- trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
+ trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len);
if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
@@ -3188,7 +3268,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
*/
btrfs_trans_release_chunk_metadata(trans);
- ret = btrfs_remove_block_group(trans, chunk_offset, em);
+ ret = btrfs_remove_block_group(trans, map);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
@@ -3200,7 +3280,7 @@ out:
trans->removing_chunk = false;
}
/* once for us */
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -5347,24 +5427,131 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
}
}
+static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ set_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT, NULL);
+ }
+}
+
+static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits)
+{
+ for (int i = 0; i < map->num_stripes; i++) {
+ struct btrfs_io_stripe *stripe = &map->stripes[i];
+ struct btrfs_device *device = stripe->dev;
+
+ __clear_extent_bit(&device->alloc_state, stripe->physical,
+ stripe->physical + map->stripe_size - 1,
+ bits | EXTENT_NOWAIT,
+ NULL, NULL);
+ }
+}
+
+void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+ write_lock(&fs_info->mapping_tree_lock);
+ rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+ RB_CLEAR_NODE(&map->rb_node);
+ chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ write_unlock(&fs_info->mapping_tree_lock);
+
+ /* Once for the tree reference. */
+ btrfs_free_chunk_map(map);
+}
+
+EXPORT_FOR_TESTS
+int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map)
+{
+ struct rb_node **p;
+ struct rb_node *parent = NULL;
+ bool leftmost = true;
+
+ write_lock(&fs_info->mapping_tree_lock);
+ p = &fs_info->mapping_tree.rb_root.rb_node;
+ while (*p) {
+ struct btrfs_chunk_map *entry;
+
+ parent = *p;
+ entry = rb_entry(parent, struct btrfs_chunk_map, rb_node);
+
+ if (map->start < entry->start) {
+ p = &(*p)->rb_left;
+ } else if (map->start > entry->start) {
+ p = &(*p)->rb_right;
+ leftmost = false;
+ } else {
+ write_unlock(&fs_info->mapping_tree_lock);
+ return -EEXIST;
+ }
+ }
+ rb_link_node(&map->rb_node, parent, p);
+ rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost);
+ chunk_map_device_set_bits(map, CHUNK_ALLOCATED);
+ chunk_map_device_clear_bits(map, CHUNK_TRIMMED);
+ write_unlock(&fs_info->mapping_tree_lock);
+
+ return 0;
+}
+
+EXPORT_FOR_TESTS
+struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp)
+{
+ struct btrfs_chunk_map *map;
+
+ map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp);
+ if (!map)
+ return NULL;
+
+ refcount_set(&map->refs, 1);
+ RB_CLEAR_NODE(&map->rb_node);
+
+ return map;
+}
+
+struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp)
+{
+ const int size = btrfs_chunk_map_size(map->num_stripes);
+ struct btrfs_chunk_map *clone;
+
+ clone = kmemdup(map, size, gfp);
+ if (!clone)
+ return NULL;
+
+ refcount_set(&clone->refs, 1);
+ RB_CLEAR_NODE(&clone->rb_node);
+
+ return clone;
+}
+
static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
struct alloc_chunk_ctl *ctl,
struct btrfs_device_info *devices_info)
{
struct btrfs_fs_info *info = trans->fs_info;
- struct map_lookup *map = NULL;
- struct extent_map_tree *em_tree;
+ struct btrfs_chunk_map *map;
struct btrfs_block_group *block_group;
- struct extent_map *em;
u64 start = ctl->start;
u64 type = ctl->type;
int ret;
int i;
int j;
- map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
+ map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS);
if (!map)
return ERR_PTR(-ENOMEM);
+
+ map->start = start;
+ map->chunk_len = ctl->chunk_size;
+ map->stripe_size = ctl->stripe_size;
+ map->type = type;
+ map->io_align = BTRFS_STRIPE_LEN;
+ map->io_width = BTRFS_STRIPE_LEN;
+ map->sub_stripes = ctl->sub_stripes;
map->num_stripes = ctl->num_stripes;
for (i = 0; i < ctl->ndevs; ++i) {
@@ -5375,41 +5562,22 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
j * ctl->stripe_size;
}
}
- map->io_align = BTRFS_STRIPE_LEN;
- map->io_width = BTRFS_STRIPE_LEN;
- map->type = type;
- map->sub_stripes = ctl->sub_stripes;
trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
- em = alloc_extent_map();
- if (!em) {
- kfree(map);
- return ERR_PTR(-ENOMEM);
- }
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->map_lookup = map;
- em->start = start;
- em->len = ctl->chunk_size;
- em->block_start = 0;
- em->block_len = em->len;
- em->orig_block_len = ctl->stripe_size;
-
- em_tree = &info->mapping_tree;
- write_lock(&em_tree->lock);
- ret = add_extent_mapping(em_tree, em, 0);
+ ret = btrfs_add_chunk_map(info, map);
if (ret) {
- write_unlock(&em_tree->lock);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ERR_PTR(ret);
}
- write_unlock(&em_tree->lock);
block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size);
- if (IS_ERR(block_group))
- goto error_del_extent;
+ if (IS_ERR(block_group)) {
+ btrfs_remove_chunk_map(info, map);
+ return block_group;
+ }
- for (i = 0; i < map->num_stripes; i++) {
+ for (int i = 0; i < map->num_stripes; i++) {
struct btrfs_device *dev = map->stripes[i].dev;
btrfs_device_set_bytes_used(dev,
@@ -5422,23 +5590,10 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans,
atomic64_sub(ctl->stripe_size * map->num_stripes,
&info->free_chunk_space);
- free_extent_map(em);
check_raid56_incompat_flag(info, type);
check_raid1c34_incompat_flag(info, type);
return block_group;
-
-error_del_extent:
- write_lock(&em_tree->lock);
- remove_extent_mapping(em_tree, em);
- write_unlock(&em_tree->lock);
-
- /* One for our allocation */
- free_extent_map(em);
- /* One for the tree reference */
- free_extent_map(em);
-
- return block_group;
}
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
@@ -5514,8 +5669,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_key key;
struct btrfs_chunk *chunk;
struct btrfs_stripe *stripe;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
size_t item_size;
int i;
int ret;
@@ -5544,14 +5698,13 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
*/
lockdep_assert_held(&fs_info->chunk_mutex);
- em = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
- if (IS_ERR(em)) {
- ret = PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, bg->start, bg->length);
+ if (IS_ERR(map)) {
+ ret = PTR_ERR(map);
btrfs_abort_transaction(trans, ret);
return ret;
}
- map = em->map_lookup;
item_size = btrfs_chunk_item_size(map->num_stripes);
chunk = kzalloc(item_size, GFP_NOFS);
@@ -5608,7 +5761,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
out:
kfree(chunk);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -5653,7 +5806,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
return 0;
}
-static inline int btrfs_chunk_max_errors(struct map_lookup *map)
+static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map)
{
const int index = btrfs_bg_flags_to_raid_index(map->type);
@@ -5662,17 +5815,15 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
int miss_ndevs = 0;
int i;
bool ret = true;
- em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+ if (IS_ERR(map))
return false;
- map = em->map_lookup;
for (i = 0; i < map->num_stripes; i++) {
if (test_bit(BTRFS_DEV_STATE_MISSING,
&map->stripes[i].dev->dev_state)) {
@@ -5693,38 +5844,37 @@ bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (miss_ndevs > btrfs_chunk_max_errors(map))
ret = false;
end:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
-void btrfs_mapping_tree_free(struct extent_map_tree *tree)
+void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info)
{
- struct extent_map *em;
+ write_lock(&fs_info->mapping_tree_lock);
+ while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) {
+ struct btrfs_chunk_map *map;
+ struct rb_node *node;
- while (1) {
- write_lock(&tree->lock);
- em = lookup_extent_mapping(tree, 0, (u64)-1);
- if (em)
- remove_extent_mapping(tree, em);
- write_unlock(&tree->lock);
- if (!em)
- break;
- /* once for us */
- free_extent_map(em);
- /* once for the tree */
- free_extent_map(em);
+ node = rb_first_cached(&fs_info->mapping_tree);
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ rb_erase_cached(&map->rb_node, &fs_info->mapping_tree);
+ RB_CLEAR_NODE(&map->rb_node);
+ chunk_map_device_clear_bits(map, CHUNK_ALLOCATED);
+ /* Once for the tree ref. */
+ btrfs_free_chunk_map(map);
+ cond_resched_rwlock_write(&fs_info->mapping_tree_lock);
}
+ write_unlock(&fs_info->mapping_tree_lock);
}
int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
enum btrfs_raid_types index;
int ret = 1;
- em = btrfs_get_chunk_map(fs_info, logical, len);
- if (IS_ERR(em))
+ map = btrfs_get_chunk_map(fs_info, logical, len);
+ if (IS_ERR(map))
/*
* We could return errors for these cases, but that could get
* ugly and we'd probably do the same thing which is just not do
@@ -5733,7 +5883,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
*/
return 1;
- map = em->map_lookup;
index = btrfs_bg_flags_to_raid_index(map->type);
/* Non-RAID56, use their ncopies from btrfs_raid_array. */
@@ -5750,53 +5899,49 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
* stripe under reconstruction.
*/
ret = map->num_stripes;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
unsigned long len = fs_info->sectorsize;
if (!btrfs_fs_incompat(fs_info, RAID56))
return len;
- em = btrfs_get_chunk_map(fs_info, logical, len);
+ map = btrfs_get_chunk_map(fs_info, logical, len);
- if (!WARN_ON(IS_ERR(em))) {
- map = em->map_lookup;
+ if (!WARN_ON(IS_ERR(map))) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
len = btrfs_stripe_nr_to_offset(nr_data_stripes(map));
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
}
return len;
}
int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
int ret = 0;
if (!btrfs_fs_incompat(fs_info, RAID56))
return 0;
- em = btrfs_get_chunk_map(fs_info, logical, len);
+ map = btrfs_get_chunk_map(fs_info, logical, len);
- if(!WARN_ON(IS_ERR(em))) {
- map = em->map_lookup;
+ if (!WARN_ON(IS_ERR(map))) {
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
ret = 1;
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
}
return ret;
}
static int find_live_mirror(struct btrfs_fs_info *fs_info,
- struct map_lookup *map, int first,
+ struct btrfs_chunk_map *map, int first,
int dev_replace_is_ongoing)
{
int i;
@@ -5903,8 +6048,7 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
u64 logical, u64 *length_ret,
u32 *num_stripes)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_discard_stripe *stripes;
u64 length = *length_ret;
u64 offset;
@@ -5922,11 +6066,9 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
int ret;
int i;
- em = btrfs_get_chunk_map(fs_info, logical, length);
- if (IS_ERR(em))
- return ERR_CAST(em);
-
- map = em->map_lookup;
+ map = btrfs_get_chunk_map(fs_info, logical, length);
+ if (IS_ERR(map))
+ return ERR_CAST(map);
/* we don't discard raid56 yet */
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
@@ -5934,8 +6076,8 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
goto out_free_map;
}
- offset = logical - em->start;
- length = min_t(u64, em->start + em->len - logical, length);
+ offset = logical - map->start;
+ length = min_t(u64, map->start + map->chunk_len - logical, length);
*length_ret = length;
/*
@@ -6032,10 +6174,10 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info,
}
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return stripes;
out_free_map:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ERR_PTR(ret);
}
@@ -6133,17 +6275,16 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op,
bioc->replace_nr_stripes = nr_extra_stripes;
}
-static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
- u64 offset, u32 *stripe_nr, u64 *stripe_offset,
- u64 *full_stripe_start)
+static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset,
+ struct btrfs_io_geometry *io_geom)
{
/*
* Stripe_nr is the stripe where this block falls. stripe_offset is
* the offset of this block in its stripe.
*/
- *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
- *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
- ASSERT(*stripe_offset < U32_MAX);
+ io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK;
+ io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT;
+ ASSERT(io_geom->stripe_offset < U32_MAX);
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
unsigned long full_stripe_len =
@@ -6158,18 +6299,17 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
* to go rounddown(), not round_down(), as nr_data_stripes is
* not ensured to be power of 2.
*/
- *full_stripe_start =
- btrfs_stripe_nr_to_offset(
- rounddown(*stripe_nr, nr_data_stripes(map)));
+ io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset(
+ rounddown(io_geom->stripe_nr, nr_data_stripes(map)));
- ASSERT(*full_stripe_start + full_stripe_len > offset);
- ASSERT(*full_stripe_start <= offset);
+ ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset);
+ ASSERT(io_geom->raid56_full_stripe_start <= offset);
/*
* For writes to RAID56, allow to write a full stripe set, but
* no straddling of stripe sets.
*/
- if (op == BTRFS_MAP_WRITE)
- return full_stripe_len - (offset - *full_stripe_start);
+ if (io_geom->op == BTRFS_MAP_WRITE)
+ return full_stripe_len - (offset - io_geom->raid56_full_stripe_start);
}
/*
@@ -6177,26 +6317,180 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op,
* a single disk).
*/
if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK)
- return BTRFS_STRIPE_LEN - *stripe_offset;
+ return BTRFS_STRIPE_LEN - io_geom->stripe_offset;
return U64_MAX;
}
-static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
- u64 logical, u64 *length, struct btrfs_io_stripe *dst,
- struct map_lookup *map, u32 stripe_index,
- u64 stripe_offset, u64 stripe_nr)
+static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical,
+ u64 *length, struct btrfs_io_stripe *dst,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
{
- dst->dev = map->stripes[stripe_index].dev;
+ dst->dev = map->stripes[io_geom->stripe_index].dev;
- if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type))
+ if (io_geom->op == BTRFS_MAP_READ &&
+ btrfs_need_stripe_tree_update(fs_info, map->type))
return btrfs_get_raid_extent_offset(fs_info, logical, length,
- map->type, stripe_index, dst);
+ map->type,
+ io_geom->stripe_index, dst);
- dst->physical = map->stripes[stripe_index].physical +
- stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr);
+ dst->physical = map->stripes[io_geom->stripe_index].physical +
+ io_geom->stripe_offset +
+ btrfs_stripe_nr_to_offset(io_geom->stripe_nr);
return 0;
}
+static bool is_single_device_io(struct btrfs_fs_info *fs_info,
+ const struct btrfs_io_stripe *smap,
+ const struct btrfs_chunk_map *map,
+ int num_alloc_stripes,
+ enum btrfs_map_op op, int mirror_num)
+{
+ if (!smap)
+ return false;
+
+ if (num_alloc_stripes != 1)
+ return false;
+
+ if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ)
+ return false;
+
+ if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)
+ return false;
+
+ return true;
+}
+
+static void map_blocks_raid0(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
+ io_geom->stripe_nr /= map->num_stripes;
+ if (io_geom->op == BTRFS_MAP_READ)
+ io_geom->mirror_num = 1;
+}
+
+static void map_blocks_raid1(struct btrfs_fs_info *fs_info,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ bool dev_replace_is_ongoing)
+{
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->num_stripes;
+ return;
+ }
+
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index = io_geom->mirror_num - 1;
+ return;
+ }
+
+ io_geom->stripe_index = find_live_mirror(fs_info, map, 0,
+ dev_replace_is_ongoing);
+ io_geom->mirror_num = io_geom->stripe_index + 1;
+}
+
+static void map_blocks_dup(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->num_stripes;
+ return;
+ }
+
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index = io_geom->mirror_num - 1;
+ return;
+ }
+
+ io_geom->mirror_num = 1;
+}
+
+static void map_blocks_raid10(struct btrfs_fs_info *fs_info,
+ struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ bool dev_replace_is_ongoing)
+{
+ u32 factor = map->num_stripes / map->sub_stripes;
+ int old_stripe_index;
+
+ io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes;
+ io_geom->stripe_nr /= factor;
+
+ if (io_geom->op != BTRFS_MAP_READ) {
+ io_geom->num_stripes = map->sub_stripes;
+ return;
+ }
+
+ if (io_geom->mirror_num) {
+ io_geom->stripe_index += io_geom->mirror_num - 1;
+ return;
+ }
+
+ old_stripe_index = io_geom->stripe_index;
+ io_geom->stripe_index = find_live_mirror(fs_info, map,
+ io_geom->stripe_index,
+ dev_replace_is_ongoing);
+ io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1;
+}
+
+static void map_blocks_raid56_write(struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom,
+ u64 logical, u64 *length)
+{
+ int data_stripes = nr_data_stripes(map);
+
+ /*
+ * Needs full stripe mapping.
+ *
+ * Push stripe_nr back to the start of the full stripe For those cases
+ * needing a full stripe, @stripe_nr is the full stripe number.
+ *
+ * Originally we go raid56_full_stripe_start / full_stripe_len, but
+ * that can be expensive. Here we just divide @stripe_nr with
+ * @data_stripes.
+ */
+ io_geom->stripe_nr /= data_stripes;
+
+ /* RAID[56] write or recovery. Return all stripes */
+ io_geom->num_stripes = map->num_stripes;
+ io_geom->max_errors = btrfs_chunk_max_errors(map);
+
+ /* Return the length to the full stripe end. */
+ *length = min(logical + *length,
+ io_geom->raid56_full_stripe_start + map->start +
+ btrfs_stripe_nr_to_offset(data_stripes)) -
+ logical;
+ io_geom->stripe_index = 0;
+ io_geom->stripe_offset = 0;
+}
+
+static void map_blocks_raid56_read(struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ int data_stripes = nr_data_stripes(map);
+
+ ASSERT(io_geom->mirror_num <= 1);
+ /* Just grab the data stripe directly. */
+ io_geom->stripe_index = io_geom->stripe_nr % data_stripes;
+ io_geom->stripe_nr /= data_stripes;
+
+ /* We distribute the parity blocks across stripes. */
+ io_geom->stripe_index =
+ (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes;
+
+ if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1)
+ io_geom->mirror_num = 1;
+}
+
+static void map_blocks_single(const struct btrfs_chunk_map *map,
+ struct btrfs_io_geometry *io_geom)
+{
+ io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes;
+ io_geom->stripe_nr /= map->num_stripes;
+ io_geom->mirror_num = io_geom->stripe_index + 1;
+}
+
/*
* Map one logical range to one or more physical ranges.
*
@@ -6237,43 +6531,37 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
struct btrfs_io_context **bioc_ret,
struct btrfs_io_stripe *smap, int *mirror_num_ret)
{
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
+ struct btrfs_io_geometry io_geom = { 0 };
u64 map_offset;
- u64 stripe_offset;
- u32 stripe_nr;
- u32 stripe_index;
- int data_stripes;
int i;
int ret = 0;
- int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
- int num_stripes;
int num_copies;
- int max_errors = 0;
struct btrfs_io_context *bioc = NULL;
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
int dev_replace_is_ongoing = 0;
u16 num_alloc_stripes;
- u64 raid56_full_stripe_start = (u64)-1;
u64 max_len;
ASSERT(bioc_ret);
+ io_geom.mirror_num = (mirror_num_ret ? *mirror_num_ret : 0);
+ io_geom.num_stripes = 1;
+ io_geom.stripe_index = 0;
+ io_geom.op = op;
+
num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize);
- if (mirror_num > num_copies)
+ if (io_geom.mirror_num > num_copies)
return -EINVAL;
- em = btrfs_get_chunk_map(fs_info, logical, *length);
- if (IS_ERR(em))
- return PTR_ERR(em);
+ map = btrfs_get_chunk_map(fs_info, logical, *length);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
- map = em->map_lookup;
- data_stripes = nr_data_stripes(map);
-
- map_offset = logical - em->start;
- max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr,
- &stripe_offset, &raid56_full_stripe_start);
- *length = min_t(u64, em->len - map_offset, max_len);
+ map_offset = logical - map->start;
+ io_geom.raid56_full_stripe_start = (u64)-1;
+ max_len = btrfs_max_io_len(map, map_offset, &io_geom);
+ *length = min_t(u64, map->chunk_len - map_offset, max_len);
down_read(&dev_replace->rwsem);
dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
@@ -6284,107 +6572,46 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
if (!dev_replace_is_ongoing)
up_read(&dev_replace->rwsem);
- num_stripes = 1;
- stripe_index = 0;
- if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_index = stripe_nr % map->num_stripes;
- stripe_nr /= map->num_stripes;
- if (op == BTRFS_MAP_READ)
- mirror_num = 1;
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
- if (op != BTRFS_MAP_READ) {
- num_stripes = map->num_stripes;
- } else if (mirror_num) {
- stripe_index = mirror_num - 1;
- } else {
- stripe_index = find_live_mirror(fs_info, map, 0,
- dev_replace_is_ongoing);
- mirror_num = stripe_index + 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
- if (op != BTRFS_MAP_READ) {
- num_stripes = map->num_stripes;
- } else if (mirror_num) {
- stripe_index = mirror_num - 1;
- } else {
- mirror_num = 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
- u32 factor = map->num_stripes / map->sub_stripes;
-
- stripe_index = (stripe_nr % factor) * map->sub_stripes;
- stripe_nr /= factor;
-
- if (op != BTRFS_MAP_READ)
- num_stripes = map->sub_stripes;
- else if (mirror_num)
- stripe_index += mirror_num - 1;
- else {
- int old_stripe_index = stripe_index;
- stripe_index = find_live_mirror(fs_info, map,
- stripe_index,
- dev_replace_is_ongoing);
- mirror_num = stripe_index - old_stripe_index + 1;
- }
-
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
- if (op != BTRFS_MAP_READ || mirror_num > 1) {
- /*
- * Needs full stripe mapping.
- *
- * Push stripe_nr back to the start of the full stripe
- * For those cases needing a full stripe, @stripe_nr
- * is the full stripe number.
- *
- * Originally we go raid56_full_stripe_start / full_stripe_len,
- * but that can be expensive. Here we just divide
- * @stripe_nr with @data_stripes.
- */
- stripe_nr /= data_stripes;
-
- /* RAID[56] write or recovery. Return all stripes */
- num_stripes = map->num_stripes;
- max_errors = btrfs_chunk_max_errors(map);
-
- /* Return the length to the full stripe end */
- *length = min(logical + *length,
- raid56_full_stripe_start + em->start +
- btrfs_stripe_nr_to_offset(data_stripes)) -
- logical;
- stripe_index = 0;
- stripe_offset = 0;
- } else {
- ASSERT(mirror_num <= 1);
- /* Just grab the data stripe directly. */
- stripe_index = stripe_nr % data_stripes;
- stripe_nr /= data_stripes;
-
- /* We distribute the parity blocks across stripes */
- stripe_index = (stripe_nr + stripe_index) % map->num_stripes;
- if (op == BTRFS_MAP_READ && mirror_num < 1)
- mirror_num = 1;
- }
- } else {
+ switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
+ case BTRFS_BLOCK_GROUP_RAID0:
+ map_blocks_raid0(map, &io_geom);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID1:
+ case BTRFS_BLOCK_GROUP_RAID1C3:
+ case BTRFS_BLOCK_GROUP_RAID1C4:
+ map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing);
+ break;
+ case BTRFS_BLOCK_GROUP_DUP:
+ map_blocks_dup(map, &io_geom);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID10:
+ map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing);
+ break;
+ case BTRFS_BLOCK_GROUP_RAID5:
+ case BTRFS_BLOCK_GROUP_RAID6:
+ if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)
+ map_blocks_raid56_write(map, &io_geom, logical, length);
+ else
+ map_blocks_raid56_read(map, &io_geom);
+ break;
+ default:
/*
* After this, stripe_nr is the number of stripes on this
* device we have to walk to find the data, and stripe_index is
* the number of our device in the stripe array
*/
- stripe_index = stripe_nr % map->num_stripes;
- stripe_nr /= map->num_stripes;
- mirror_num = stripe_index + 1;
+ map_blocks_single(map, &io_geom);
+ break;
}
- if (stripe_index >= map->num_stripes) {
+ if (io_geom.stripe_index >= map->num_stripes) {
btrfs_crit(fs_info,
"stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
- stripe_index, map->num_stripes);
+ io_geom.stripe_index, map->num_stripes);
ret = -EINVAL;
goto out;
}
- num_alloc_stripes = num_stripes;
+ num_alloc_stripes = io_geom.num_stripes;
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
op != BTRFS_MAP_READ)
/*
@@ -6401,14 +6628,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* physical block information on the stack instead of allocating an
* I/O context structure.
*/
- if (smap && num_alloc_stripes == 1 &&
- !(btrfs_need_stripe_tree_update(fs_info, map->type) &&
- op != BTRFS_MAP_READ) &&
- !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) {
- ret = set_io_stripe(fs_info, op, logical, length, smap, map,
- stripe_index, stripe_offset, stripe_nr);
+ if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op,
+ io_geom.mirror_num)) {
+ ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom);
if (mirror_num_ret)
- *mirror_num_ret = mirror_num;
+ *mirror_num_ret = io_geom.mirror_num;
*bioc_ret = NULL;
goto out;
}
@@ -6428,7 +6652,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* It's still mostly the same as other profiles, just with extra rotation.
*/
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
- (op != BTRFS_MAP_READ || mirror_num > 1)) {
+ (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) {
/*
* For RAID56 @stripe_nr is already the number of full stripes
* before us, which is also the rotation value (needs to modulo
@@ -6437,28 +6661,31 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
* In this case, we just add @stripe_nr with @i, then do the
* modulo, to reduce one modulo call.
*/
- bioc->full_stripe_logical = em->start +
- btrfs_stripe_nr_to_offset(stripe_nr * data_stripes);
- for (int i = 0; i < num_stripes; i++) {
- ret = set_io_stripe(fs_info, op, logical, length,
- &bioc->stripes[i], map,
- (i + stripe_nr) % num_stripes,
- stripe_offset, stripe_nr);
- if (ret < 0)
- break;
+ bioc->full_stripe_logical = map->start +
+ btrfs_stripe_nr_to_offset(io_geom.stripe_nr *
+ nr_data_stripes(map));
+ for (int i = 0; i < io_geom.num_stripes; i++) {
+ struct btrfs_io_stripe *dst = &bioc->stripes[i];
+ u32 stripe_index;
+
+ stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes;
+ dst->dev = map->stripes[stripe_index].dev;
+ dst->physical =
+ map->stripes[stripe_index].physical +
+ io_geom.stripe_offset +
+ btrfs_stripe_nr_to_offset(io_geom.stripe_nr);
}
} else {
/*
* For all other non-RAID56 profiles, just copy the target
* stripe into the bioc.
*/
- for (i = 0; i < num_stripes; i++) {
- ret = set_io_stripe(fs_info, op, logical, length,
- &bioc->stripes[i], map, stripe_index,
- stripe_offset, stripe_nr);
+ for (i = 0; i < io_geom.num_stripes; i++) {
+ ret = set_io_stripe(fs_info, logical, length,
+ &bioc->stripes[i], map, &io_geom);
if (ret < 0)
break;
- stripe_index++;
+ io_geom.stripe_index++;
}
}
@@ -6469,18 +6696,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
}
if (op != BTRFS_MAP_READ)
- max_errors = btrfs_chunk_max_errors(map);
+ io_geom.max_errors = btrfs_chunk_max_errors(map);
if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
op != BTRFS_MAP_READ) {
handle_ops_on_dev_replace(op, bioc, dev_replace, logical,
- &num_stripes, &max_errors);
+ &io_geom.num_stripes, &io_geom.max_errors);
}
*bioc_ret = bioc;
- bioc->num_stripes = num_stripes;
- bioc->max_errors = max_errors;
- bioc->mirror_num = mirror_num;
+ bioc->num_stripes = io_geom.num_stripes;
+ bioc->max_errors = io_geom.max_errors;
+ bioc->mirror_num = io_geom.mirror_num;
out:
if (dev_replace_is_ongoing) {
@@ -6488,7 +6715,7 @@ out:
/* Unlock and let waiting writers proceed */
up_read(&dev_replace->rwsem);
}
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -6660,12 +6887,11 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
devid, uuid);
}
-u64 btrfs_calc_stripe_length(const struct extent_map *em)
+u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map)
{
- const struct map_lookup *map = em->map_lookup;
const int data_stripes = calc_data_stripes(map->type, map->num_stripes);
- return div_u64(em->len, data_stripes);
+ return div_u64(map->chunk_len, data_stripes);
}
#if BITS_PER_LONG == 32
@@ -6734,9 +6960,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
{
BTRFS_DEV_LOOKUP_ARGS(args);
struct btrfs_fs_info *fs_info = leaf->fs_info;
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct map_lookup *map;
- struct extent_map *em;
+ struct btrfs_chunk_map *map;
u64 logical;
u64 length;
u64 devid;
@@ -6770,35 +6994,22 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
return ret;
}
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, logical, 1);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, logical, 1);
/* already mapped? */
- if (em && em->start <= logical && em->start + em->len > logical) {
- free_extent_map(em);
+ if (map && map->start <= logical && map->start + map->chunk_len > logical) {
+ btrfs_free_chunk_map(map);
return 0;
- } else if (em) {
- free_extent_map(em);
+ } else if (map) {
+ btrfs_free_chunk_map(map);
}
- em = alloc_extent_map();
- if (!em)
- return -ENOMEM;
- map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
- if (!map) {
- free_extent_map(em);
+ map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS);
+ if (!map)
return -ENOMEM;
- }
-
- set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
- em->map_lookup = map;
- em->start = logical;
- em->len = length;
- em->orig_start = 0;
- em->block_start = 0;
- em->block_len = em->len;
+ map->start = logical;
+ map->chunk_len = length;
map->num_stripes = num_stripes;
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
@@ -6813,7 +7024,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
*/
map->sub_stripes = btrfs_raid_array[index].sub_stripes;
map->verified_stripes = 0;
- em->orig_block_len = btrfs_calc_stripe_length(em);
+ map->stripe_size = btrfs_calc_stripe_length(map);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
btrfs_stripe_offset_nr(leaf, chunk, i);
@@ -6829,7 +7040,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
devid, uuid);
if (IS_ERR(map->stripes[i].dev)) {
ret = PTR_ERR(map->stripes[i].dev);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
}
@@ -6838,15 +7049,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
&(map->stripes[i].dev->dev_state));
}
- write_lock(&map_tree->lock);
- ret = add_extent_mapping(map_tree, em, 0);
- write_unlock(&map_tree->lock);
+ ret = btrfs_add_chunk_map(fs_info, map);
if (ret < 0) {
btrfs_err(fs_info,
"failed to add chunk map, start=%llu len=%llu: %d",
- em->start, em->len, ret);
+ map->start, map->chunk_len, ret);
}
- free_extent_map(em);
return ret;
}
@@ -7156,26 +7364,21 @@ out_short_read:
bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
struct btrfs_device *failing_dev)
{
- struct extent_map_tree *map_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- u64 next_start = 0;
+ struct btrfs_chunk_map *map;
+ u64 next_start;
bool ret = true;
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, 0, (u64)-1);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, 0, U64_MAX);
/* No chunk at all? Return false anyway */
- if (!em) {
+ if (!map) {
ret = false;
goto out;
}
- while (em) {
- struct map_lookup *map;
+ while (map) {
int missing = 0;
int max_tolerated;
int i;
- map = em->map_lookup;
max_tolerated =
btrfs_get_num_tolerated_disk_barrier_failures(
map->type);
@@ -7193,18 +7396,15 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
if (!failing_dev)
btrfs_warn(fs_info,
"chunk %llu missing %d devices, max tolerance is %d for writable mount",
- em->start, missing, max_tolerated);
- free_extent_map(em);
+ map->start, missing, max_tolerated);
+ btrfs_free_chunk_map(map);
ret = false;
goto out;
}
- next_start = extent_map_end(em);
- free_extent_map(em);
+ next_start = map->start + map->chunk_len;
+ btrfs_free_chunk_map(map);
- read_lock(&map_tree->lock);
- em = lookup_extent_mapping(map_tree, next_start,
- (u64)(-1) - next_start);
- read_unlock(&map_tree->lock);
+ map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start);
}
out:
return ret;
@@ -7697,20 +7897,15 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
u64 physical_offset, u64 physical_len)
{
struct btrfs_dev_lookup_args args = { .devid = devid };
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_device *dev;
u64 stripe_len;
bool found = false;
int ret = 0;
int i;
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, chunk_offset, 1);
- read_unlock(&em_tree->lock);
-
- if (!em) {
+ map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+ if (!map) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
physical_offset, devid);
@@ -7718,12 +7913,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
goto out;
}
- map = em->map_lookup;
- stripe_len = btrfs_calc_stripe_length(em);
+ stripe_len = btrfs_calc_stripe_length(map);
if (physical_len != stripe_len) {
btrfs_err(fs_info,
"dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
- physical_offset, devid, em->start, physical_len,
+ physical_offset, devid, map->start, physical_len,
stripe_len);
ret = -EUCLEAN;
goto out;
@@ -7746,7 +7940,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
if (map->verified_stripes >= map->num_stripes) {
btrfs_err(fs_info,
"too many dev extents for chunk %llu found",
- em->start);
+ map->start);
ret = -EUCLEAN;
goto out;
}
@@ -7792,32 +7986,30 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
}
out:
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
{
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
struct rb_node *node;
int ret = 0;
- read_lock(&em_tree->lock);
- for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
- em = rb_entry(node, struct extent_map, rb_node);
- if (em->map_lookup->num_stripes !=
- em->map_lookup->verified_stripes) {
+ read_lock(&fs_info->mapping_tree_lock);
+ for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+ struct btrfs_chunk_map *map;
+
+ map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+ if (map->num_stripes != map->verified_stripes) {
btrfs_err(fs_info,
"chunk %llu has missing dev extent, have %d expect %d",
- em->start, em->map_lookup->verified_stripes,
- em->map_lookup->num_stripes);
+ map->start, map->verified_stripes, map->num_stripes);
ret = -EUCLEAN;
goto out;
}
}
out:
- read_unlock(&em_tree->lock);
+ read_unlock(&fs_info->mapping_tree_lock);
return ret;
}
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 9cc374864a79..53f87f398da7 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -426,7 +426,8 @@ struct btrfs_discard_stripe {
struct btrfs_io_context {
refcount_t refs;
struct btrfs_fs_info *fs_info;
- u64 map_type; /* get from map_lookup->type */
+ /* Taken from struct btrfs_chunk_map::type. */
+ u64 map_type;
struct bio *orig_bio;
atomic_t error;
u16 max_errors;
@@ -529,18 +530,32 @@ struct btrfs_raid_attr {
extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES];
-struct map_lookup {
+struct btrfs_chunk_map {
+ struct rb_node rb_node;
+ /* For mount time dev extent verification. */
+ int verified_stripes;
+ refcount_t refs;
+ u64 start;
+ u64 chunk_len;
+ u64 stripe_size;
u64 type;
int io_align;
int io_width;
int num_stripes;
int sub_stripes;
- int verified_stripes; /* For mount time dev extent verification */
struct btrfs_io_stripe stripes[];
};
-#define map_lookup_size(n) (sizeof(struct map_lookup) + \
- (sizeof(struct btrfs_io_stripe) * (n)))
+#define btrfs_chunk_map_size(n) (sizeof(struct btrfs_chunk_map) + \
+ (sizeof(struct btrfs_io_stripe) * (n)))
+
+static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map)
+{
+ if (map && refcount_dec_and_test(&map->refs)) {
+ ASSERT(RB_EMPTY_NODE(&map->rb_node));
+ kfree(map);
+ }
+}
struct btrfs_balance_args;
struct btrfs_balance_progress;
@@ -598,7 +613,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
}
/*
- * Do the type safe converstion from stripe_nr to offset inside the chunk.
+ * Do the type safe conversion from stripe_nr to offset inside the chunk.
*
* @stripe_nr is u32, with left shift it can overflow u32 for chunks larger
* than 4G. This does the proper type cast to avoid overflow.
@@ -624,7 +639,7 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info);
int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info);
struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans,
u64 type);
-void btrfs_mapping_tree_free(struct extent_map_tree *tree);
+void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info);
int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
blk_mode_t flags, void *holder);
struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags,
@@ -680,13 +695,25 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info,
u64 logical, u64 len);
unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
u64 logical);
-u64 btrfs_calc_stripe_length(const struct extent_map *em);
+u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map);
int btrfs_nr_parity_stripes(u64 type);
int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans,
struct btrfs_block_group *bg);
int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset);
-struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length);
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp);
+int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
+#endif
+
+struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp);
+struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length);
+void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map);
void btrfs_release_disk_super(struct btrfs_super_block *super);
static inline void btrfs_dev_stat_inc(struct btrfs_device *dev,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 3cf236fb40a4..6287763fdccc 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -382,6 +382,53 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}
+static int btrfs_xattr_handler_get_security(const struct xattr_handler *handler,
+ struct dentry *unused,
+ struct inode *inode,
+ const char *name, void *buffer,
+ size_t size)
+{
+ int ret;
+ bool is_cap = false;
+
+ name = xattr_full_name(handler, name);
+
+ /*
+ * security.capability doesn't cache the results, so calls into us
+ * constantly to see if there's a capability xattr. Cache the result
+ * here in order to avoid wasting time doing lookups for xattrs we know
+ * don't exist.
+ */
+ if (strcmp(name, XATTR_NAME_CAPS) == 0) {
+ is_cap = true;
+ if (test_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags))
+ return -ENODATA;
+ }
+
+ ret = btrfs_getxattr(inode, name, buffer, size);
+ if (ret == -ENODATA && is_cap)
+ set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+ return ret;
+}
+
+static int btrfs_xattr_handler_set_security(const struct xattr_handler *handler,
+ struct mnt_idmap *idmap,
+ struct dentry *unused,
+ struct inode *inode,
+ const char *name,
+ const void *buffer,
+ size_t size, int flags)
+{
+ if (btrfs_root_readonly(BTRFS_I(inode)->root))
+ return -EROFS;
+
+ name = xattr_full_name(handler, name);
+ if (strcmp(name, XATTR_NAME_CAPS) == 0)
+ clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
+ return btrfs_setxattr_trans(inode, name, buffer, size, flags);
+}
+
static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
struct mnt_idmap *idmap,
struct dentry *unused, struct inode *inode,
@@ -420,8 +467,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
static const struct xattr_handler btrfs_security_xattr_handler = {
.prefix = XATTR_SECURITY_PREFIX,
- .get = btrfs_xattr_handler_get,
- .set = btrfs_xattr_handler_set,
+ .get = btrfs_xattr_handler_get_security,
+ .set = btrfs_xattr_handler_set_security,
};
static const struct xattr_handler btrfs_trusted_xattr_handler = {
@@ -473,6 +520,10 @@ static int btrfs_initxattrs(struct inode *inode,
}
strcpy(name, XATTR_SECURITY_PREFIX);
strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
+
+ if (strcmp(name, XATTR_NAME_CAPS) == 0)
+ clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
+
err = btrfs_setxattr(trans, inode, name, xattr->value,
xattr->value_len, 0);
kfree(name);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 6c231a116a29..8da66ea699e8 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -121,7 +121,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->strm.total_in = 0;
workspace->strm.total_out = 0;
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -200,7 +200,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -236,7 +236,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -354,18 +354,13 @@ done:
}
int zlib_decompress(struct list_head *ws, const u8 *data_in,
- struct page *dest_page, unsigned long start_byte, size_t srclen,
+ struct page *dest_page, unsigned long dest_pgoff, size_t srclen,
size_t destlen)
{
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
int wbits = MAX_WBITS;
- unsigned long bytes_left;
- unsigned long total_out = 0;
- unsigned long pg_offset = 0;
-
- destlen = min_t(unsigned long, destlen, PAGE_SIZE);
- bytes_left = destlen;
+ unsigned long to_copy;
workspace->strm.next_in = data_in;
workspace->strm.avail_in = srclen;
@@ -390,60 +385,30 @@ int zlib_decompress(struct list_head *ws, const u8 *data_in,
return -EIO;
}
- while (bytes_left > 0) {
- unsigned long buf_start;
- unsigned long buf_offset;
- unsigned long bytes;
-
- ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH);
- if (ret != Z_OK && ret != Z_STREAM_END)
- break;
-
- buf_start = total_out;
- total_out = workspace->strm.total_out;
-
- if (total_out == buf_start) {
- ret = -EIO;
- break;
- }
-
- if (total_out <= start_byte)
- goto next;
-
- if (total_out > start_byte && buf_start < start_byte)
- buf_offset = start_byte - buf_start;
- else
- buf_offset = 0;
-
- bytes = min(PAGE_SIZE - pg_offset,
- PAGE_SIZE - (buf_offset % PAGE_SIZE));
- bytes = min(bytes, bytes_left);
+ /*
+ * Everything (in/out buf) should be at most one sector, there should
+ * be no need to switch any input/output buffer.
+ */
+ ret = zlib_inflate(&workspace->strm, Z_FINISH);
+ to_copy = min(workspace->strm.total_out, destlen);
+ if (ret != Z_STREAM_END)
+ goto out;
- memcpy_to_page(dest_page, pg_offset,
- workspace->buf + buf_offset, bytes);
+ memcpy_to_page(dest_page, dest_pgoff, workspace->buf, to_copy);
- pg_offset += bytes;
- bytes_left -= bytes;
-next:
- workspace->strm.next_out = workspace->buf;
- workspace->strm.avail_out = workspace->buf_size;
- }
-
- if (ret != Z_STREAM_END && bytes_left != 0)
+out:
+ if (unlikely(to_copy != destlen)) {
+ pr_warn_ratelimited("BTRFS: infalte failed, decompressed=%lu expected=%zu\n",
+ to_copy, destlen);
ret = -EIO;
- else
+ } else {
ret = 0;
+ }
zlib_inflateEnd(&workspace->strm);
- /*
- * this should only happen if zlib returned fewer bytes than we
- * expected. btrfs_get_block is responsible for zeroing from the
- * end of the inline extent (destlen) to the end of the page
- */
- if (pg_offset < destlen) {
- memzero_page(dest_page, pg_offset, destlen - pg_offset);
- }
+ if (unlikely(to_copy < destlen))
+ memzero_page(dest_page, dest_pgoff + to_copy, destlen - to_copy);
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 188378ca19c7..5f750fa53a2b 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -578,26 +578,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
kvfree(zones);
- switch (bdev_zoned_model(bdev)) {
- case BLK_ZONED_HM:
+ if (bdev_is_zoned(bdev)) {
model = "host-managed zoned";
emulated = "";
- break;
- case BLK_ZONED_HA:
- model = "host-aware zoned";
- emulated = "";
- break;
- case BLK_ZONED_NONE:
+ } else {
model = "regular";
emulated = "emulated ";
- break;
- default:
- /* Just in case */
- btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s",
- bdev_zoned_model(bdev),
- rcu_str_deref(device->name));
- ret = -EOPNOTSUPP;
- goto out_free_zone_info;
}
btrfs_info_in_rcu(fs_info,
@@ -609,9 +595,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
out:
kvfree(zones);
-out_free_zone_info:
btrfs_destroy_dev_zone_info(device);
-
return ret;
}
@@ -688,8 +672,7 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info)
struct btrfs_device *device;
list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) {
- if (device->bdev &&
- bdev_zoned_model(device->bdev) == BLK_ZONED_HM) {
+ if (device->bdev && bdev_is_zoned(device->bdev)) {
btrfs_err(fs_info,
"zoned: mode not enabled but zoned device found: %pg",
device->bdev);
@@ -781,7 +764,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
* Check mount options here, because we might change fs_info->zoned
* from fs_info->zone_size.
*/
- ret = btrfs_check_mountopts_zoned(fs_info);
+ ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt);
if (ret)
return ret;
@@ -789,7 +772,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info)
return 0;
}
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt)
{
if (!btrfs_is_zoned(info))
return 0;
@@ -798,18 +781,21 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
* Space cache writing is not COWed. Disable that to avoid write errors
* in sequential zones.
*/
- if (btrfs_test_opt(info, SPACE_CACHE)) {
+ if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
btrfs_err(info, "zoned: space cache v1 is not supported");
return -EINVAL;
}
- if (btrfs_test_opt(info, NODATACOW)) {
+ if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) {
btrfs_err(info, "zoned: NODATACOW not supported");
return -EINVAL;
}
- btrfs_clear_and_info(info, DISCARD_ASYNC,
- "zoned: async discard ignored and disabled for zoned mode");
+ if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) {
+ btrfs_info(info,
+ "zoned: async discard ignored and disabled for zoned mode");
+ btrfs_clear_opt(*mount_opt, DISCARD_ASYNC);
+ }
return 0;
}
@@ -1290,7 +1276,7 @@ struct zone_info {
static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx,
struct zone_info *info, unsigned long *active,
- struct map_lookup *map)
+ struct btrfs_chunk_map *map)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_device *device = map->stripes[zone_idx].dev;
@@ -1393,7 +1379,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1435,7 +1421,7 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1483,7 +1469,7 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1515,7 +1501,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg,
}
static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
- struct map_lookup *map,
+ struct btrfs_chunk_map *map,
struct zone_info *zone_info,
unsigned long *active)
{
@@ -1552,9 +1538,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg,
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
struct btrfs_fs_info *fs_info = cache->fs_info;
- struct extent_map_tree *em_tree = &fs_info->mapping_tree;
- struct extent_map *em;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
u64 logical = cache->start;
u64 length = cache->length;
struct zone_info *zone_info = NULL;
@@ -1575,17 +1559,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
return -EIO;
}
- /* Get the chunk mapping */
- read_lock(&em_tree->lock);
- em = lookup_extent_mapping(em_tree, logical, length);
- read_unlock(&em_tree->lock);
-
- if (!em)
+ map = btrfs_find_chunk_map(fs_info, logical, length);
+ if (!map)
return -EINVAL;
- map = em->map_lookup;
-
- cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS);
+ cache->physical_map = btrfs_clone_chunk_map(map, GFP_NOFS);
if (!cache->physical_map) {
ret = -ENOMEM;
goto out;
@@ -1661,6 +1639,15 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
}
out:
+ /* Reject non SINGLE data profiles without RST */
+ if ((map->type & BTRFS_BLOCK_GROUP_DATA) &&
+ (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) &&
+ !fs_info->stripe_root) {
+ btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree",
+ btrfs_bg_type_to_raid_name(map->type));
+ return -EINVAL;
+ }
+
if (cache->alloc_offset > cache->zone_capacity) {
btrfs_err(fs_info,
"zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu",
@@ -1687,12 +1674,12 @@ out:
spin_unlock(&fs_info->zone_active_bgs_lock);
}
} else {
- kfree(cache->physical_map);
+ btrfs_free_chunk_map(cache->physical_map);
cache->physical_map = NULL;
}
bitmap_free(active);
kfree(zone_info);
- free_extent_map(em);
+ btrfs_free_chunk_map(map);
return ret;
}
@@ -1715,22 +1702,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
cache->zone_unusable = unusable;
}
-void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb)
-{
- if (!btrfs_is_zoned(eb->fs_info) ||
- btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN))
- return;
-
- ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-
- memzero_extent_buffer(eb, 0, eb->len);
- set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags);
- set_extent_buffer_dirty(eb);
- set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1,
- EXTENT_DIRTY, NULL);
-}
-
bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT);
@@ -2082,7 +2053,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
bool btrfs_zone_activate(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
struct btrfs_device *device;
u64 physical;
const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA);
@@ -2094,6 +2065,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
map = block_group->physical_map;
+ spin_lock(&fs_info->zone_active_bgs_lock);
spin_lock(&block_group->lock);
if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags)) {
ret = true;
@@ -2106,7 +2078,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
- spin_lock(&fs_info->zone_active_bgs_lock);
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_zoned_device_info *zinfo;
int reserved = 0;
@@ -2126,20 +2097,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
*/
if (atomic_read(&zinfo->active_zones_left) <= reserved) {
ret = false;
- spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
if (!btrfs_dev_set_active_zone(device, physical)) {
/* Cannot activate the zone */
ret = false;
- spin_unlock(&fs_info->zone_active_bgs_lock);
goto out_unlock;
}
if (!is_data)
zinfo->reserved_active_zones--;
}
- spin_unlock(&fs_info->zone_active_bgs_lock);
/* Successfully activated all the zones */
set_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags);
@@ -2147,8 +2115,6 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
/* For the active block group list */
btrfs_get_block_group(block_group);
-
- spin_lock(&fs_info->zone_active_bgs_lock);
list_add_tail(&block_group->active_bg_list, &fs_info->zone_active_bgs);
spin_unlock(&fs_info->zone_active_bgs_lock);
@@ -2156,6 +2122,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
out_unlock:
spin_unlock(&block_group->lock);
+ spin_unlock(&fs_info->zone_active_bgs_lock);
return ret;
}
@@ -2194,7 +2161,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
- struct map_lookup *map;
+ struct btrfs_chunk_map *map;
const bool is_metadata = (block_group->flags &
(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM));
int ret = 0;
@@ -2643,7 +2610,7 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info)
/* Release reservation for currently active block groups. */
spin_lock(&fs_info->zone_active_bgs_lock);
list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) {
- struct map_lookup *map = block_group->physical_map;
+ struct btrfs_chunk_map *map = block_group->physical_map;
if (!(block_group->flags &
(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)))
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index b9cec523b778..f573bda496fb 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -45,7 +45,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
-int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
+int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt);
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
u64 *bytenr_ret);
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
@@ -59,8 +59,6 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
-void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb);
bool btrfs_use_zone_append(struct btrfs_bio *bbio);
void btrfs_record_physical_zoned(struct btrfs_bio *bbio);
int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
@@ -123,7 +121,8 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
return -EOPNOTSUPP;
}
-static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
+static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info,
+ unsigned long *mount_opt)
{
return 0;
}
@@ -180,9 +179,6 @@ static inline int btrfs_load_block_group_zone_info(
static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { }
-static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
- struct extent_buffer *eb) { }
-
static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio)
{
return false;
@@ -323,8 +319,8 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i
(bdev_zone_sectors(bdev) << SECTOR_SHIFT);
}
- /* Do not allow Host Manged zoned device */
- return bdev_zoned_model(bdev) != BLK_ZONED_HM;
+ /* Do not allow Host Managed zoned device. */
+ return !bdev_is_zoned(bdev);
}
static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 5511766485cd..0d66db8bc1d4 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -410,9 +410,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
workspace->in_buf.pos = 0;
workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE);
-
/* Allocate and map in the output buffer */
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -457,7 +456,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
@@ -514,7 +513,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
ret = -E2BIG;
goto out;
}
- out_page = alloc_page(GFP_NOFS);
+ out_page = btrfs_alloc_compr_page();
if (out_page == NULL) {
ret = -ENOMEM;
goto out;
diff --git a/fs/buffer.c b/fs/buffer.c
index 967f34b70aa8..d3bcf601d3e5 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -180,11 +180,11 @@ EXPORT_SYMBOL(end_buffer_write_sync);
* Various filesystems appear to want __find_get_block to be non-blocking.
* But it's the page lock which protects the buffers. To get around this,
* we get exclusion from try_to_free_buffers with the blockdev mapping's
- * private_lock.
+ * i_private_lock.
*
- * Hack idea: for the blockdev mapping, private_lock contention
+ * Hack idea: for the blockdev mapping, i_private_lock contention
* may be quite high. This code could TryLock the page, and if that
- * succeeds, there is no need to take private_lock.
+ * succeeds, there is no need to take i_private_lock.
*/
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
@@ -199,12 +199,12 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
int all_mapped = 1;
static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);
- index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
+ index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE;
folio = __filemap_get_folio(bd_mapping, index, FGP_ACCESSED, 0);
if (IS_ERR(folio))
goto out;
- spin_lock(&bd_mapping->private_lock);
+ spin_lock(&bd_mapping->i_private_lock);
head = folio_buffers(folio);
if (!head)
goto out_unlock;
@@ -236,7 +236,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
1 << bd_inode->i_blkbits);
}
out_unlock:
- spin_unlock(&bd_mapping->private_lock);
+ spin_unlock(&bd_mapping->i_private_lock);
folio_put(folio);
out:
return ret;
@@ -372,10 +372,10 @@ static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
}
/*
- * Completion handler for block_write_full_page() - pages which are unlocked
- * during I/O, and which have PageWriteback cleared upon I/O completion.
+ * Completion handler for block_write_full_folio() - folios which are unlocked
+ * during I/O, and which have the writeback flag cleared upon I/O completion.
*/
-void end_buffer_async_write(struct buffer_head *bh, int uptodate)
+static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
unsigned long flags;
struct buffer_head *first;
@@ -415,7 +415,6 @@ still_busy:
spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
return;
}
-EXPORT_SYMBOL(end_buffer_async_write);
/*
* If a page's buffers are under async readin (end_buffer_async_read
@@ -467,25 +466,25 @@ EXPORT_SYMBOL(mark_buffer_async_write);
*
* The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
* inode_has_buffers() and invalidate_inode_buffers() are provided for the
- * management of a list of dependent buffers at ->i_mapping->private_list.
+ * management of a list of dependent buffers at ->i_mapping->i_private_list.
*
* Locking is a little subtle: try_to_free_buffers() will remove buffers
* from their controlling inode's queue when they are being freed. But
* try_to_free_buffers() will be operating against the *blockdev* mapping
* at the time, not against the S_ISREG file which depends on those buffers.
- * So the locking for private_list is via the private_lock in the address_space
+ * So the locking for i_private_list is via the i_private_lock in the address_space
* which backs the buffers. Which is different from the address_space
* against which the buffers are listed. So for a particular address_space,
- * mapping->private_lock does *not* protect mapping->private_list! In fact,
- * mapping->private_list will always be protected by the backing blockdev's
- * ->private_lock.
+ * mapping->i_private_lock does *not* protect mapping->i_private_list! In fact,
+ * mapping->i_private_list will always be protected by the backing blockdev's
+ * ->i_private_lock.
*
* Which introduces a requirement: all buffers on an address_space's
- * ->private_list must be from the same address_space: the blockdev's.
+ * ->i_private_list must be from the same address_space: the blockdev's.
*
- * address_spaces which do not place buffers at ->private_list via these
- * utility functions are free to use private_lock and private_list for
- * whatever they want. The only requirement is that list_empty(private_list)
+ * address_spaces which do not place buffers at ->i_private_list via these
+ * utility functions are free to use i_private_lock and i_private_list for
+ * whatever they want. The only requirement is that list_empty(i_private_list)
* be true at clear_inode() time.
*
* FIXME: clear_inode should not call invalidate_inode_buffers(). The
@@ -508,7 +507,7 @@ EXPORT_SYMBOL(mark_buffer_async_write);
*/
/*
- * The buffer's backing address_space's private_lock must be held
+ * The buffer's backing address_space's i_private_lock must be held
*/
static void __remove_assoc_queue(struct buffer_head *bh)
{
@@ -519,7 +518,7 @@ static void __remove_assoc_queue(struct buffer_head *bh)
int inode_has_buffers(struct inode *inode)
{
- return !list_empty(&inode->i_data.private_list);
+ return !list_empty(&inode->i_data.i_private_list);
}
/*
@@ -561,7 +560,7 @@ repeat:
* sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
* @mapping: the mapping which wants those buffers written
*
- * Starts I/O against the buffers at mapping->private_list, and waits upon
+ * Starts I/O against the buffers at mapping->i_private_list, and waits upon
* that I/O.
*
* Basically, this is a convenience function for fsync().
@@ -570,13 +569,13 @@ repeat:
*/
int sync_mapping_buffers(struct address_space *mapping)
{
- struct address_space *buffer_mapping = mapping->private_data;
+ struct address_space *buffer_mapping = mapping->i_private_data;
- if (buffer_mapping == NULL || list_empty(&mapping->private_list))
+ if (buffer_mapping == NULL || list_empty(&mapping->i_private_list))
return 0;
- return fsync_buffers_list(&buffer_mapping->private_lock,
- &mapping->private_list);
+ return fsync_buffers_list(&buffer_mapping->i_private_lock,
+ &mapping->i_private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);
@@ -673,17 +672,17 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
struct address_space *buffer_mapping = bh->b_folio->mapping;
mark_buffer_dirty(bh);
- if (!mapping->private_data) {
- mapping->private_data = buffer_mapping;
+ if (!mapping->i_private_data) {
+ mapping->i_private_data = buffer_mapping;
} else {
- BUG_ON(mapping->private_data != buffer_mapping);
+ BUG_ON(mapping->i_private_data != buffer_mapping);
}
if (!bh->b_assoc_map) {
- spin_lock(&buffer_mapping->private_lock);
+ spin_lock(&buffer_mapping->i_private_lock);
list_move_tail(&bh->b_assoc_buffers,
- &mapping->private_list);
+ &mapping->i_private_list);
bh->b_assoc_map = mapping;
- spin_unlock(&buffer_mapping->private_lock);
+ spin_unlock(&buffer_mapping->i_private_lock);
}
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);
@@ -706,7 +705,7 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
* bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
* page on the dirty page list.
*
- * We use private_lock to lock against try_to_free_buffers while using the
+ * We use i_private_lock to lock against try_to_free_buffers while using the
* page's buffer list. Also use this to protect against clean buffers being
* added to the page after it was set dirty.
*
@@ -718,7 +717,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
struct buffer_head *head;
bool newly_dirty;
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
head = folio_buffers(folio);
if (head) {
struct buffer_head *bh = head;
@@ -734,7 +733,7 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
*/
folio_memcg_lock(folio);
newly_dirty = !folio_test_set_dirty(folio);
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
if (newly_dirty)
__folio_mark_dirty(folio, mapping, 1);
@@ -827,7 +826,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
smp_mb();
if (buffer_dirty(bh)) {
list_add(&bh->b_assoc_buffers,
- &mapping->private_list);
+ &mapping->i_private_list);
bh->b_assoc_map = mapping;
}
spin_unlock(lock);
@@ -851,7 +850,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
* probably unmounting the fs, but that doesn't mean we have already
* done a sync(). Just drop the buffers from the inode list.
*
- * NOTE: we take the inode's blockdev's mapping's private_lock. Which
+ * NOTE: we take the inode's blockdev's mapping's i_private_lock. Which
* assumes that all the buffers are against the blockdev. Not true
* for reiserfs.
*/
@@ -859,13 +858,13 @@ void invalidate_inode_buffers(struct inode *inode)
{
if (inode_has_buffers(inode)) {
struct address_space *mapping = &inode->i_data;
- struct list_head *list = &mapping->private_list;
- struct address_space *buffer_mapping = mapping->private_data;
+ struct list_head *list = &mapping->i_private_list;
+ struct address_space *buffer_mapping = mapping->i_private_data;
- spin_lock(&buffer_mapping->private_lock);
+ spin_lock(&buffer_mapping->i_private_lock);
while (!list_empty(list))
__remove_assoc_queue(BH_ENTRY(list->next));
- spin_unlock(&buffer_mapping->private_lock);
+ spin_unlock(&buffer_mapping->i_private_lock);
}
}
EXPORT_SYMBOL(invalidate_inode_buffers);
@@ -882,10 +881,10 @@ int remove_inode_buffers(struct inode *inode)
if (inode_has_buffers(inode)) {
struct address_space *mapping = &inode->i_data;
- struct list_head *list = &mapping->private_list;
- struct address_space *buffer_mapping = mapping->private_data;
+ struct list_head *list = &mapping->i_private_list;
+ struct address_space *buffer_mapping = mapping->i_private_data;
- spin_lock(&buffer_mapping->private_lock);
+ spin_lock(&buffer_mapping->i_private_lock);
while (!list_empty(list)) {
struct buffer_head *bh = BH_ENTRY(list->next);
if (buffer_dirty(bh)) {
@@ -894,7 +893,7 @@ int remove_inode_buffers(struct inode *inode)
}
__remove_assoc_queue(bh);
}
- spin_unlock(&buffer_mapping->private_lock);
+ spin_unlock(&buffer_mapping->i_private_lock);
}
return ret;
}
@@ -995,11 +994,12 @@ static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
* Initialise the state of a blockdev folio's buffers.
*/
static sector_t folio_init_buffers(struct folio *folio,
- struct block_device *bdev, sector_t block, int size)
+ struct block_device *bdev, unsigned size)
{
struct buffer_head *head = folio_buffers(folio);
struct buffer_head *bh = head;
bool uptodate = folio_test_uptodate(folio);
+ sector_t block = div_u64(folio_pos(folio), size);
sector_t end_block = blkdev_max_block(bdev, size);
do {
@@ -1024,86 +1024,88 @@ static sector_t folio_init_buffers(struct folio *folio,
}
/*
- * Create the page-cache page that contains the requested block.
+ * Create the page-cache folio that contains the requested block.
*
* This is used purely for blockdev mappings.
+ *
+ * Returns false if we have a failure which cannot be cured by retrying
+ * without sleeping. Returns true if we succeeded, or the caller should retry.
*/
-static int
-grow_dev_page(struct block_device *bdev, sector_t block,
- pgoff_t index, int size, int sizebits, gfp_t gfp)
+static bool grow_dev_folio(struct block_device *bdev, sector_t block,
+ pgoff_t index, unsigned size, gfp_t gfp)
{
struct inode *inode = bdev->bd_inode;
struct folio *folio;
struct buffer_head *bh;
- sector_t end_block;
- int ret = 0;
+ sector_t end_block = 0;
folio = __filemap_get_folio(inode->i_mapping, index,
FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp);
if (IS_ERR(folio))
- return PTR_ERR(folio);
+ return false;
bh = folio_buffers(folio);
if (bh) {
if (bh->b_size == size) {
- end_block = folio_init_buffers(folio, bdev,
- (sector_t)index << sizebits, size);
- goto done;
+ end_block = folio_init_buffers(folio, bdev, size);
+ goto unlock;
+ }
+
+ /*
+ * Retrying may succeed; for example the folio may finish
+ * writeback, or buffers may be cleaned. This should not
+ * happen very often; maybe we have old buffers attached to
+ * this blockdev's page cache and we're trying to change
+ * the block size?
+ */
+ if (!try_to_free_buffers(folio)) {
+ end_block = ~0ULL;
+ goto unlock;
}
- if (!try_to_free_buffers(folio))
- goto failed;
}
- ret = -ENOMEM;
bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT);
if (!bh)
- goto failed;
+ goto unlock;
/*
* Link the folio to the buffers and initialise them. Take the
* lock to be atomic wrt __find_get_block(), which does not
* run under the folio lock.
*/
- spin_lock(&inode->i_mapping->private_lock);
+ spin_lock(&inode->i_mapping->i_private_lock);
link_dev_buffers(folio, bh);
- end_block = folio_init_buffers(folio, bdev,
- (sector_t)index << sizebits, size);
- spin_unlock(&inode->i_mapping->private_lock);
-done:
- ret = (block < end_block) ? 1 : -ENXIO;
-failed:
+ end_block = folio_init_buffers(folio, bdev, size);
+ spin_unlock(&inode->i_mapping->i_private_lock);
+unlock:
folio_unlock(folio);
folio_put(folio);
- return ret;
+ return block < end_block;
}
/*
- * Create buffers for the specified block device block's page. If
- * that page was dirty, the buffers are set dirty also.
+ * Create buffers for the specified block device block's folio. If
+ * that folio was dirty, the buffers are set dirty also. Returns false
+ * if we've hit a permanent error.
*/
-static int
-grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
+static bool grow_buffers(struct block_device *bdev, sector_t block,
+ unsigned size, gfp_t gfp)
{
- pgoff_t index;
- int sizebits;
-
- sizebits = PAGE_SHIFT - __ffs(size);
- index = block >> sizebits;
+ loff_t pos;
/*
- * Check for a block which wants to lie outside our maximum possible
- * pagecache index. (this comparison is done using sector_t types).
+ * Check for a block which lies outside our maximum possible
+ * pagecache index.
*/
- if (unlikely(index != block >> sizebits)) {
- printk(KERN_ERR "%s: requested out-of-range block %llu for "
- "device %pg\n",
+ if (check_mul_overflow(block, (sector_t)size, &pos) || pos > MAX_LFS_FILESIZE) {
+ printk(KERN_ERR "%s: requested out-of-range block %llu for device %pg\n",
__func__, (unsigned long long)block,
bdev);
- return -EIO;
+ return false;
}
- /* Create a page with the proper size buffers.. */
- return grow_dev_page(bdev, block, index, size, sizebits, gfp);
+ /* Create a folio with the proper size buffers */
+ return grow_dev_folio(bdev, block, pos / PAGE_SIZE, size, gfp);
}
static struct buffer_head *
@@ -1124,14 +1126,12 @@ __getblk_slow(struct block_device *bdev, sector_t block,
for (;;) {
struct buffer_head *bh;
- int ret;
bh = __find_get_block(bdev, block, size);
if (bh)
return bh;
- ret = grow_buffers(bdev, block, size, gfp);
- if (ret < 0)
+ if (!grow_buffers(bdev, block, size, gfp))
return NULL;
}
}
@@ -1168,7 +1168,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
* and then attach the address_space's inode to its superblock's dirty
* inode list.
*
- * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->private_lock,
+ * mark_buffer_dirty() is atomic. It takes bh->b_folio->mapping->i_private_lock,
* i_pages lock and mapping->host->i_lock.
*/
void mark_buffer_dirty(struct buffer_head *bh)
@@ -1246,10 +1246,10 @@ void __bforget(struct buffer_head *bh)
if (bh->b_assoc_map) {
struct address_space *buffer_mapping = bh->b_folio->mapping;
- spin_lock(&buffer_mapping->private_lock);
+ spin_lock(&buffer_mapping->i_private_lock);
list_del_init(&bh->b_assoc_buffers);
bh->b_assoc_map = NULL;
- spin_unlock(&buffer_mapping->private_lock);
+ spin_unlock(&buffer_mapping->i_private_lock);
}
__brelse(bh);
}
@@ -1638,7 +1638,7 @@ EXPORT_SYMBOL(block_invalidate_folio);
/*
* We attach and possibly dirty the buffers atomically wrt
- * block_dirty_folio() via private_lock. try_to_free_buffers
+ * block_dirty_folio() via i_private_lock. try_to_free_buffers
* is already excluded via the folio lock.
*/
struct buffer_head *create_empty_buffers(struct folio *folio,
@@ -1656,7 +1656,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio,
} while (bh);
tail->b_this_page = head;
- spin_lock(&folio->mapping->private_lock);
+ spin_lock(&folio->mapping->i_private_lock);
if (folio_test_uptodate(folio) || folio_test_dirty(folio)) {
bh = head;
do {
@@ -1668,7 +1668,7 @@ struct buffer_head *create_empty_buffers(struct folio *folio,
} while (bh != head);
}
folio_attach_private(folio, head);
- spin_unlock(&folio->mapping->private_lock);
+ spin_unlock(&folio->mapping->i_private_lock);
return head;
}
@@ -1699,13 +1699,13 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
struct inode *bd_inode = bdev->bd_inode;
struct address_space *bd_mapping = bd_inode->i_mapping;
struct folio_batch fbatch;
- pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
+ pgoff_t index = ((loff_t)block << bd_inode->i_blkbits) / PAGE_SIZE;
pgoff_t end;
int i, count;
struct buffer_head *bh;
struct buffer_head *head;
- end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
+ end = ((loff_t)(block + len - 1) << bd_inode->i_blkbits) / PAGE_SIZE;
folio_batch_init(&fbatch);
while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
count = folio_batch_count(&fbatch);
@@ -1715,7 +1715,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
if (!folio_buffers(folio))
continue;
/*
- * We use folio lock instead of bd_mapping->private_lock
+ * We use folio lock instead of bd_mapping->i_private_lock
* to pin buffers here since we can afford to sleep and
* it scales better than a global spinlock lock.
*/
@@ -1748,19 +1748,6 @@ unlock_page:
}
EXPORT_SYMBOL(clean_bdev_aliases);
-/*
- * Size is a power-of-two in the range 512..PAGE_SIZE,
- * and the case we care about most is PAGE_SIZE.
- *
- * So this *could* possibly be written with those
- * constraints in mind (relevant mostly if some
- * architecture has a slow bit-scan instruction)
- */
-static inline int block_size_bits(unsigned int blocksize)
-{
- return ilog2(blocksize);
-}
-
static struct buffer_head *folio_create_buffers(struct folio *folio,
struct inode *inode,
unsigned int b_state)
@@ -1790,30 +1777,29 @@ static struct buffer_head *folio_create_buffers(struct folio *folio,
*/
/*
- * While block_write_full_page is writing back the dirty buffers under
+ * While block_write_full_folio is writing back the dirty buffers under
* the page lock, whoever dirtied the buffers may decide to clean them
* again at any time. We handle that by only looking at the buffer
* state inside lock_buffer().
*
- * If block_write_full_page() is called for regular writeback
+ * If block_write_full_folio() is called for regular writeback
* (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
* locked buffer. This only can happen if someone has written the buffer
* directly, with submit_bh(). At the address_space level PageWriteback
* prevents this contention from occurring.
*
- * If block_write_full_page() is called with wbc->sync_mode ==
+ * If block_write_full_folio() is called with wbc->sync_mode ==
* WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
* causes the writes to be flagged as synchronous writes.
*/
int __block_write_full_folio(struct inode *inode, struct folio *folio,
- get_block_t *get_block, struct writeback_control *wbc,
- bh_end_io_t *handler)
+ get_block_t *get_block, struct writeback_control *wbc)
{
int err;
sector_t block;
sector_t last_block;
struct buffer_head *bh, *head;
- unsigned int blocksize, bbits;
+ size_t blocksize;
int nr_underway = 0;
blk_opf_t write_flags = wbc_to_write_flags(wbc);
@@ -1832,10 +1818,9 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
bh = head;
blocksize = bh->b_size;
- bbits = block_size_bits(blocksize);
- block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
- last_block = (i_size_read(inode) - 1) >> bbits;
+ block = div_u64(folio_pos(folio), blocksize);
+ last_block = div_u64(i_size_read(inode) - 1, blocksize);
/*
* Get all the dirty buffers mapped to disk addresses and
@@ -1849,7 +1834,7 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
* truncate in progress.
*/
/*
- * The buffer was zeroed by block_write_full_page()
+ * The buffer was zeroed by block_write_full_folio()
*/
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
@@ -1887,7 +1872,8 @@ int __block_write_full_folio(struct inode *inode, struct folio *folio,
continue;
}
if (test_clear_buffer_dirty(bh)) {
- mark_buffer_async_write_endio(bh, handler);
+ mark_buffer_async_write_endio(bh,
+ end_buffer_async_write);
} else {
unlock_buffer(bh);
}
@@ -1940,7 +1926,8 @@ recover:
if (buffer_mapped(bh) && buffer_dirty(bh) &&
!buffer_delay(bh)) {
lock_buffer(bh);
- mark_buffer_async_write_endio(bh, handler);
+ mark_buffer_async_write_endio(bh,
+ end_buffer_async_write);
} else {
/*
* The buffer may have been set dirty during
@@ -2014,7 +2001,7 @@ static int
iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
const struct iomap *iomap)
{
- loff_t offset = block << inode->i_blkbits;
+ loff_t offset = (loff_t)block << inode->i_blkbits;
bh->b_bdev = iomap->bdev;
@@ -2081,27 +2068,24 @@ iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
get_block_t *get_block, const struct iomap *iomap)
{
- unsigned from = pos & (PAGE_SIZE - 1);
- unsigned to = from + len;
+ size_t from = offset_in_folio(folio, pos);
+ size_t to = from + len;
struct inode *inode = folio->mapping->host;
- unsigned block_start, block_end;
+ size_t block_start, block_end;
sector_t block;
int err = 0;
- unsigned blocksize, bbits;
+ size_t blocksize;
struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
BUG_ON(!folio_test_locked(folio));
- BUG_ON(from > PAGE_SIZE);
- BUG_ON(to > PAGE_SIZE);
+ BUG_ON(to > folio_size(folio));
BUG_ON(from > to);
head = folio_create_buffers(folio, inode, 0);
blocksize = head->b_size;
- bbits = block_size_bits(blocksize);
+ block = div_u64(folio_pos(folio), blocksize);
- block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
-
- for(bh = head, block_start = 0; bh != head || !block_start;
+ for (bh = head, block_start = 0; bh != head || !block_start;
block++, block_start=block_end, bh = bh->b_this_page) {
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
@@ -2364,7 +2348,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
struct inode *inode = folio->mapping->host;
sector_t iblock, lblock;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
- unsigned int blocksize, bbits;
+ size_t blocksize;
int nr, i;
int fully_mapped = 1;
bool page_error = false;
@@ -2378,10 +2362,9 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
head = folio_create_buffers(folio, inode, 0);
blocksize = head->b_size;
- bbits = block_size_bits(blocksize);
- iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits);
- lblock = (limit+blocksize-1) >> bbits;
+ iblock = div_u64(folio_pos(folio), blocksize);
+ lblock = div_u64(limit + blocksize - 1, blocksize);
bh = head;
nr = 0;
i = 0;
@@ -2666,8 +2649,8 @@ int block_truncate_page(struct address_space *mapping,
return 0;
length = blocksize - length;
- iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
-
+ iblock = ((loff_t)index * PAGE_SIZE) >> inode->i_blkbits;
+
folio = filemap_grab_folio(mapping, index);
if (IS_ERR(folio))
return PTR_ERR(folio);
@@ -2720,17 +2703,15 @@ EXPORT_SYMBOL(block_truncate_page);
/*
* The generic ->writepage function for buffer-backed address_spaces
*/
-int block_write_full_page(struct page *page, get_block_t *get_block,
- struct writeback_control *wbc)
+int block_write_full_folio(struct folio *folio, struct writeback_control *wbc,
+ void *get_block)
{
- struct folio *folio = page_folio(page);
struct inode * const inode = folio->mapping->host;
loff_t i_size = i_size_read(inode);
/* Is the folio fully inside i_size? */
if (folio_pos(folio) + folio_size(folio) <= i_size)
- return __block_write_full_folio(inode, folio, get_block, wbc,
- end_buffer_async_write);
+ return __block_write_full_folio(inode, folio, get_block, wbc);
/* Is the folio fully outside i_size? (truncate in progress) */
if (folio_pos(folio) >= i_size) {
@@ -2747,10 +2728,8 @@ int block_write_full_page(struct page *page, get_block_t *get_block,
*/
folio_zero_segment(folio, offset_in_folio(folio, i_size),
folio_size(folio));
- return __block_write_full_folio(inode, folio, get_block, wbc,
- end_buffer_async_write);
+ return __block_write_full_folio(inode, folio, get_block, wbc);
}
-EXPORT_SYMBOL(block_write_full_page);
sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
get_block_t *get_block)
@@ -2883,7 +2862,7 @@ EXPORT_SYMBOL(sync_dirty_buffer);
* are unused, and releases them if so.
*
* Exclusion against try_to_free_buffers may be obtained by either
- * locking the folio or by holding its mapping's private_lock.
+ * locking the folio or by holding its mapping's i_private_lock.
*
* If the folio is dirty but all the buffers are clean then we need to
* be sure to mark the folio clean as well. This is because the folio
@@ -2894,7 +2873,7 @@ EXPORT_SYMBOL(sync_dirty_buffer);
* The same applies to regular filesystem folios: if all the buffers are
* clean then we set the folio clean and proceed. To do that, we require
* total exclusion from block_dirty_folio(). That is obtained with
- * private_lock.
+ * i_private_lock.
*
* try_to_free_buffers() is non-blocking.
*/
@@ -2946,7 +2925,7 @@ bool try_to_free_buffers(struct folio *folio)
goto out;
}
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
ret = drop_buffers(folio, &buffers_to_free);
/*
@@ -2959,13 +2938,13 @@ bool try_to_free_buffers(struct folio *folio)
* the folio's buffers clean. We discover that here and clean
* the folio also.
*
- * private_lock must be held over this entire operation in order
+ * i_private_lock must be held over this entire operation in order
* to synchronise against block_dirty_folio and prevent the
* dirty bit from being lost.
*/
if (ret)
folio_cancel_dirty(folio);
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
out:
if (buffers_to_free) {
struct buffer_head *bh = buffers_to_free;
diff --git a/fs/cachefiles/Kconfig b/fs/cachefiles/Kconfig
index 8df715640a48..c5a070550ee3 100644
--- a/fs/cachefiles/Kconfig
+++ b/fs/cachefiles/Kconfig
@@ -2,7 +2,7 @@
config CACHEFILES
tristate "Filesystem caching on files"
- depends on FSCACHE && BLOCK
+ depends on NETFS_SUPPORT && FSCACHE && BLOCK
help
This permits use of a mounted filesystem as a cache for other
filesystems - primarily networking filesystems - thus allowing fast
diff --git a/fs/cachefiles/cache.c b/fs/cachefiles/cache.c
index 7077f72e6f47..f449f7340aad 100644
--- a/fs/cachefiles/cache.c
+++ b/fs/cachefiles/cache.c
@@ -168,6 +168,8 @@ error_unsupported:
dput(root);
error_open_root:
cachefiles_end_secure(cache, saved_cred);
+ put_cred(cache->cache_cred);
+ cache->cache_cred = NULL;
error_getsec:
fscache_relinquish_cache(cache_cookie);
cache->cache = NULL;
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index aa4efcabb5e3..6465e2574230 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -77,6 +77,7 @@ static const struct cachefiles_daemon_cmd cachefiles_daemon_cmds[] = {
{ "tag", cachefiles_daemon_tag },
#ifdef CONFIG_CACHEFILES_ONDEMAND
{ "copen", cachefiles_ondemand_copen },
+ { "restore", cachefiles_ondemand_restore },
#endif
{ "", NULL }
};
@@ -355,14 +356,24 @@ static __poll_t cachefiles_daemon_poll(struct file *file,
struct poll_table_struct *poll)
{
struct cachefiles_cache *cache = file->private_data;
+ XA_STATE(xas, &cache->reqs, 0);
+ struct cachefiles_req *req;
__poll_t mask;
poll_wait(file, &cache->daemon_pollwq, poll);
mask = 0;
if (cachefiles_in_ondemand_mode(cache)) {
- if (!xa_empty(&cache->reqs))
- mask |= EPOLLIN;
+ if (!xa_empty(&cache->reqs)) {
+ rcu_read_lock();
+ xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) {
+ if (!cachefiles_ondemand_is_reopening_read(req)) {
+ mask |= EPOLLIN;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ }
} else {
if (test_bit(CACHEFILES_STATE_CHANGED, &cache->flags))
mask |= EPOLLIN;
@@ -805,6 +816,7 @@ static void cachefiles_daemon_unbind(struct cachefiles_cache *cache)
cachefiles_put_directory(cache->graveyard);
cachefiles_put_directory(cache->store);
mntput(cache->mnt);
+ put_cred(cache->cache_cred);
kfree(cache->rootdirname);
kfree(cache->secctx);
diff --git a/fs/cachefiles/error_inject.c b/fs/cachefiles/error_inject.c
index 18de8a876b02..1715d5ca2b2d 100644
--- a/fs/cachefiles/error_inject.c
+++ b/fs/cachefiles/error_inject.c
@@ -19,7 +19,6 @@ static struct ctl_table cachefiles_sysctls[] = {
.mode = 0644,
.proc_handler = proc_douintvec,
},
- {}
};
int __init cachefiles_register_error_injection(void)
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 40052bdb3365..35ba2117a6f6 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -31,6 +31,11 @@ struct cachefiles_object *cachefiles_alloc_object(struct fscache_cookie *cookie)
if (!object)
return NULL;
+ if (cachefiles_ondemand_init_obj_info(object, volume)) {
+ kmem_cache_free(cachefiles_object_jar, object);
+ return NULL;
+ }
+
refcount_set(&object->ref, 1);
spin_lock_init(&object->lock);
@@ -88,7 +93,7 @@ void cachefiles_put_object(struct cachefiles_object *object,
ASSERTCMP(object->file, ==, NULL);
kfree(object->d_name);
-
+ cachefiles_ondemand_deinit_obj_info(object);
cache = object->volume->cache->cache;
fscache_put_cookie(object->cookie, fscache_cookie_put_object);
object->cookie = NULL;
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index 2ad58c465208..d33169f0018b 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -44,6 +44,19 @@ struct cachefiles_volume {
struct dentry *fanout[256]; /* Fanout subdirs */
};
+enum cachefiles_object_state {
+ CACHEFILES_ONDEMAND_OBJSTATE_CLOSE, /* Anonymous fd closed by daemon or initial state */
+ CACHEFILES_ONDEMAND_OBJSTATE_OPEN, /* Anonymous fd associated with object is available */
+ CACHEFILES_ONDEMAND_OBJSTATE_REOPENING, /* Object that was closed and is being reopened. */
+};
+
+struct cachefiles_ondemand_info {
+ struct work_struct ondemand_work;
+ int ondemand_id;
+ enum cachefiles_object_state state;
+ struct cachefiles_object *object;
+};
+
/*
* Backing file state.
*/
@@ -61,7 +74,7 @@ struct cachefiles_object {
unsigned long flags;
#define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */
#ifdef CONFIG_CACHEFILES_ONDEMAND
- int ondemand_id;
+ struct cachefiles_ondemand_info *ondemand;
#endif
};
@@ -233,7 +246,7 @@ extern bool cachefiles_begin_operation(struct netfs_cache_resources *cres,
enum fscache_want_state want_state);
extern int __cachefiles_prepare_write(struct cachefiles_object *object,
struct file *file,
- loff_t *_start, size_t *_len,
+ loff_t *_start, size_t *_len, size_t upper_len,
bool no_space_allocated_yet);
extern int __cachefiles_write(struct cachefiles_object *object,
struct file *file,
@@ -290,12 +303,42 @@ extern ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
extern int cachefiles_ondemand_copen(struct cachefiles_cache *cache,
char *args);
+extern int cachefiles_ondemand_restore(struct cachefiles_cache *cache,
+ char *args);
+
extern int cachefiles_ondemand_init_object(struct cachefiles_object *object);
extern void cachefiles_ondemand_clean_object(struct cachefiles_object *object);
extern int cachefiles_ondemand_read(struct cachefiles_object *object,
loff_t pos, size_t len);
+extern int cachefiles_ondemand_init_obj_info(struct cachefiles_object *obj,
+ struct cachefiles_volume *volume);
+extern void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *obj);
+
+#define CACHEFILES_OBJECT_STATE_FUNCS(_state, _STATE) \
+static inline bool \
+cachefiles_ondemand_object_is_##_state(const struct cachefiles_object *object) \
+{ \
+ return object->ondemand->state == CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \
+} \
+ \
+static inline void \
+cachefiles_ondemand_set_object_##_state(struct cachefiles_object *object) \
+{ \
+ object->ondemand->state = CACHEFILES_ONDEMAND_OBJSTATE_##_STATE; \
+}
+
+CACHEFILES_OBJECT_STATE_FUNCS(open, OPEN);
+CACHEFILES_OBJECT_STATE_FUNCS(close, CLOSE);
+CACHEFILES_OBJECT_STATE_FUNCS(reopening, REOPENING);
+
+static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req)
+{
+ return cachefiles_ondemand_object_is_reopening(req->object) &&
+ req->msg.opcode == CACHEFILES_OP_READ;
+}
+
#else
static inline ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
char __user *_buffer, size_t buflen)
@@ -317,6 +360,20 @@ static inline int cachefiles_ondemand_read(struct cachefiles_object *object,
{
return -EOPNOTSUPP;
}
+
+static inline int cachefiles_ondemand_init_obj_info(struct cachefiles_object *obj,
+ struct cachefiles_volume *volume)
+{
+ return 0;
+}
+static inline void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *obj)
+{
+}
+
+static inline bool cachefiles_ondemand_is_reopening_read(struct cachefiles_req *req)
+{
+ return false;
+}
#endif
/*
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
index 009d23cd435b..1d685357e67f 100644
--- a/fs/cachefiles/io.c
+++ b/fs/cachefiles/io.c
@@ -259,7 +259,8 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret)
_enter("%ld", ret);
- kiocb_end_write(iocb);
+ if (ki->was_async)
+ kiocb_end_write(iocb);
if (ret < 0)
trace_cachefiles_io_error(object, inode, ret,
@@ -319,8 +320,6 @@ int __cachefiles_write(struct cachefiles_object *object,
ki->iocb.ki_complete = cachefiles_write_complete;
atomic_long_add(ki->b_writing, &cache->b_writing);
- kiocb_start_write(&ki->iocb);
-
get_file(ki->iocb.ki_filp);
cachefiles_grab_object(object, cachefiles_obj_get_ioreq);
@@ -518,18 +517,26 @@ cachefiles_prepare_ondemand_read(struct netfs_cache_resources *cres,
*/
int __cachefiles_prepare_write(struct cachefiles_object *object,
struct file *file,
- loff_t *_start, size_t *_len,
+ loff_t *_start, size_t *_len, size_t upper_len,
bool no_space_allocated_yet)
{
struct cachefiles_cache *cache = object->volume->cache;
loff_t start = *_start, pos;
- size_t len = *_len, down;
+ size_t len = *_len;
int ret;
/* Round to DIO size */
- down = start - round_down(start, PAGE_SIZE);
- *_start = start - down;
- *_len = round_up(down + len, PAGE_SIZE);
+ start = round_down(*_start, PAGE_SIZE);
+ if (start != *_start || *_len > upper_len) {
+ /* Probably asked to cache a streaming write written into the
+ * pagecache when the cookie was temporarily out of service to
+ * culling.
+ */
+ fscache_count_dio_misfit();
+ return -ENOBUFS;
+ }
+
+ *_len = round_up(len, PAGE_SIZE);
/* We need to work out whether there's sufficient disk space to perform
* the write - but we can skip that check if we have space already
@@ -540,7 +547,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
pos = cachefiles_inject_read_error();
if (pos == 0)
- pos = vfs_llseek(file, *_start, SEEK_DATA);
+ pos = vfs_llseek(file, start, SEEK_DATA);
if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
if (pos == -ENXIO)
goto check_space; /* Unallocated tail */
@@ -548,7 +555,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
cachefiles_trace_seek_error);
return pos;
}
- if ((u64)pos >= (u64)*_start + *_len)
+ if ((u64)pos >= (u64)start + *_len)
goto check_space; /* Unallocated region */
/* We have a block that's at least partially filled - if we're low on
@@ -561,13 +568,13 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
pos = cachefiles_inject_read_error();
if (pos == 0)
- pos = vfs_llseek(file, *_start, SEEK_HOLE);
+ pos = vfs_llseek(file, start, SEEK_HOLE);
if (pos < 0 && pos >= (loff_t)-MAX_ERRNO) {
trace_cachefiles_io_error(object, file_inode(file), pos,
cachefiles_trace_seek_error);
return pos;
}
- if ((u64)pos >= (u64)*_start + *_len)
+ if ((u64)pos >= (u64)start + *_len)
return 0; /* Fully allocated */
/* Partially allocated, but insufficient space: cull. */
@@ -575,7 +582,7 @@ int __cachefiles_prepare_write(struct cachefiles_object *object,
ret = cachefiles_inject_remove_error();
if (ret == 0)
ret = vfs_fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
- *_start, *_len);
+ start, *_len);
if (ret < 0) {
trace_cachefiles_io_error(object, file_inode(file), ret,
cachefiles_trace_fallocate_error);
@@ -592,8 +599,8 @@ check_space:
}
static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
- loff_t *_start, size_t *_len, loff_t i_size,
- bool no_space_allocated_yet)
+ loff_t *_start, size_t *_len, size_t upper_len,
+ loff_t i_size, bool no_space_allocated_yet)
{
struct cachefiles_object *object = cachefiles_cres_object(cres);
struct cachefiles_cache *cache = object->volume->cache;
@@ -609,7 +616,7 @@ static int cachefiles_prepare_write(struct netfs_cache_resources *cres,
cachefiles_begin_secure(cache, &saved_cred);
ret = __cachefiles_prepare_write(object, cachefiles_cres_file(cres),
- _start, _len,
+ _start, _len, upper_len,
no_space_allocated_yet);
cachefiles_end_secure(cache, saved_cred);
return ret;
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 7bf7a5fcc045..7ade836beb58 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -305,6 +305,8 @@ try_again:
/* do the multiway lock magic */
trap = lock_rename(cache->graveyard, dir);
+ if (IS_ERR(trap))
+ return PTR_ERR(trap);
/* do some checks before getting the grave dentry */
if (rep->d_parent != dir || IS_DEADDIR(d_inode(rep))) {
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index 0254ed39f68c..4ba42f1fa3b4 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -9,21 +9,19 @@ static int cachefiles_ondemand_fd_release(struct inode *inode,
{
struct cachefiles_object *object = file->private_data;
struct cachefiles_cache *cache = object->volume->cache;
- int object_id = object->ondemand_id;
+ struct cachefiles_ondemand_info *info = object->ondemand;
+ int object_id = info->ondemand_id;
struct cachefiles_req *req;
XA_STATE(xas, &cache->reqs, 0);
xa_lock(&cache->reqs);
- object->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED;
+ info->ondemand_id = CACHEFILES_ONDEMAND_ID_CLOSED;
+ cachefiles_ondemand_set_object_close(object);
- /*
- * Flush all pending READ requests since their completion depends on
- * anon_fd.
- */
- xas_for_each(&xas, req, ULONG_MAX) {
+ /* Only flush CACHEFILES_REQ_NEW marked req to avoid race with daemon_read */
+ xas_for_each_marked(&xas, req, ULONG_MAX, CACHEFILES_REQ_NEW) {
if (req->msg.object_id == object_id &&
- req->msg.opcode == CACHEFILES_OP_READ) {
- req->error = -EIO;
+ req->msg.opcode == CACHEFILES_OP_CLOSE) {
complete(&req->done);
xas_store(&xas, NULL);
}
@@ -52,7 +50,7 @@ static ssize_t cachefiles_ondemand_fd_write_iter(struct kiocb *kiocb,
return -ENOBUFS;
cachefiles_begin_secure(cache, &saved_cred);
- ret = __cachefiles_prepare_write(object, file, &pos, &len, true);
+ ret = __cachefiles_prepare_write(object, file, &pos, &len, len, true);
cachefiles_end_secure(cache, saved_cred);
if (ret < 0)
return ret;
@@ -176,11 +174,37 @@ int cachefiles_ondemand_copen(struct cachefiles_cache *cache, char *args)
set_bit(FSCACHE_COOKIE_NO_DATA_TO_READ, &cookie->flags);
trace_cachefiles_ondemand_copen(req->object, id, size);
+ cachefiles_ondemand_set_object_open(req->object);
+ wake_up_all(&cache->daemon_pollwq);
+
out:
complete(&req->done);
return ret;
}
+int cachefiles_ondemand_restore(struct cachefiles_cache *cache, char *args)
+{
+ struct cachefiles_req *req;
+
+ XA_STATE(xas, &cache->reqs, 0);
+
+ if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
+ return -EOPNOTSUPP;
+
+ /*
+ * Reset the requests to CACHEFILES_REQ_NEW state, so that the
+ * requests have been processed halfway before the crash of the
+ * user daemon could be reprocessed after the recovery.
+ */
+ xas_lock(&xas);
+ xas_for_each(&xas, req, ULONG_MAX)
+ xas_set_mark(&xas, CACHEFILES_REQ_NEW);
+ xas_unlock(&xas);
+
+ wake_up_all(&cache->daemon_pollwq);
+ return 0;
+}
+
static int cachefiles_ondemand_get_fd(struct cachefiles_req *req)
{
struct cachefiles_object *object;
@@ -218,8 +242,7 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req)
load = (void *)req->msg.data;
load->fd = fd;
- req->msg.object_id = object_id;
- object->ondemand_id = object_id;
+ object->ondemand->ondemand_id = object_id;
cachefiles_get_unbind_pincount(cache);
trace_cachefiles_ondemand_open(object, &req->msg, load);
@@ -234,6 +257,43 @@ err:
return ret;
}
+static void ondemand_object_worker(struct work_struct *work)
+{
+ struct cachefiles_ondemand_info *info =
+ container_of(work, struct cachefiles_ondemand_info, ondemand_work);
+
+ cachefiles_ondemand_init_object(info->object);
+}
+
+/*
+ * If there are any inflight or subsequent READ requests on the
+ * closed object, reopen it.
+ * Skip read requests whose related object is reopening.
+ */
+static struct cachefiles_req *cachefiles_ondemand_select_req(struct xa_state *xas,
+ unsigned long xa_max)
+{
+ struct cachefiles_req *req;
+ struct cachefiles_object *object;
+ struct cachefiles_ondemand_info *info;
+
+ xas_for_each_marked(xas, req, xa_max, CACHEFILES_REQ_NEW) {
+ if (req->msg.opcode != CACHEFILES_OP_READ)
+ return req;
+ object = req->object;
+ info = object->ondemand;
+ if (cachefiles_ondemand_object_is_close(object)) {
+ cachefiles_ondemand_set_object_reopening(object);
+ queue_work(fscache_wq, &info->ondemand_work);
+ continue;
+ }
+ if (cachefiles_ondemand_object_is_reopening(object))
+ continue;
+ return req;
+ }
+ return NULL;
+}
+
ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
char __user *_buffer, size_t buflen)
{
@@ -244,16 +304,16 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
int ret = 0;
XA_STATE(xas, &cache->reqs, cache->req_id_next);
+ xa_lock(&cache->reqs);
/*
* Cyclically search for a request that has not ever been processed,
* to prevent requests from being processed repeatedly, and make
* request distribution fair.
*/
- xa_lock(&cache->reqs);
- req = xas_find_marked(&xas, UINT_MAX, CACHEFILES_REQ_NEW);
+ req = cachefiles_ondemand_select_req(&xas, ULONG_MAX);
if (!req && cache->req_id_next > 0) {
xas_set(&xas, 0);
- req = xas_find_marked(&xas, cache->req_id_next - 1, CACHEFILES_REQ_NEW);
+ req = cachefiles_ondemand_select_req(&xas, cache->req_id_next - 1);
}
if (!req) {
xa_unlock(&cache->reqs);
@@ -273,14 +333,18 @@ ssize_t cachefiles_ondemand_daemon_read(struct cachefiles_cache *cache,
xa_unlock(&cache->reqs);
id = xas.xa_index;
- msg->msg_id = id;
if (msg->opcode == CACHEFILES_OP_OPEN) {
ret = cachefiles_ondemand_get_fd(req);
- if (ret)
+ if (ret) {
+ cachefiles_ondemand_set_object_close(req->object);
goto error;
+ }
}
+ msg->msg_id = id;
+ msg->object_id = req->object->ondemand->ondemand_id;
+
if (copy_to_user(_buffer, msg, n) != 0) {
ret = -EFAULT;
goto err_put_fd;
@@ -313,19 +377,23 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
void *private)
{
struct cachefiles_cache *cache = object->volume->cache;
- struct cachefiles_req *req;
+ struct cachefiles_req *req = NULL;
XA_STATE(xas, &cache->reqs, 0);
int ret;
if (!test_bit(CACHEFILES_ONDEMAND_MODE, &cache->flags))
return 0;
- if (test_bit(CACHEFILES_DEAD, &cache->flags))
- return -EIO;
+ if (test_bit(CACHEFILES_DEAD, &cache->flags)) {
+ ret = -EIO;
+ goto out;
+ }
req = kzalloc(sizeof(*req) + data_len, GFP_KERNEL);
- if (!req)
- return -ENOMEM;
+ if (!req) {
+ ret = -ENOMEM;
+ goto out;
+ }
req->object = object;
init_completion(&req->done);
@@ -363,8 +431,9 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
/* coupled with the barrier in cachefiles_flush_reqs() */
smp_mb();
- if (opcode != CACHEFILES_OP_OPEN && object->ondemand_id <= 0) {
- WARN_ON_ONCE(object->ondemand_id == 0);
+ if (opcode == CACHEFILES_OP_CLOSE &&
+ !cachefiles_ondemand_object_is_open(object)) {
+ WARN_ON_ONCE(object->ondemand->ondemand_id == 0);
xas_unlock(&xas);
ret = -EIO;
goto out;
@@ -387,7 +456,15 @@ static int cachefiles_ondemand_send_req(struct cachefiles_object *object,
wake_up_all(&cache->daemon_pollwq);
wait_for_completion(&req->done);
ret = req->error;
+ kfree(req);
+ return ret;
out:
+ /* Reset the object to close state in error handling path.
+ * If error occurs after creating the anonymous fd,
+ * cachefiles_ondemand_fd_release() will set object to close.
+ */
+ if (opcode == CACHEFILES_OP_OPEN)
+ cachefiles_ondemand_set_object_close(object);
kfree(req);
return ret;
}
@@ -430,18 +507,10 @@ static int cachefiles_ondemand_init_close_req(struct cachefiles_req *req,
void *private)
{
struct cachefiles_object *object = req->object;
- int object_id = object->ondemand_id;
- /*
- * It's possible that object id is still 0 if the cookie looking up
- * phase failed before OPEN request has ever been sent. Also avoid
- * sending CLOSE request for CACHEFILES_ONDEMAND_ID_CLOSED, which means
- * anon_fd has already been closed.
- */
- if (object_id <= 0)
+ if (!cachefiles_ondemand_object_is_open(object))
return -ENOENT;
- req->msg.object_id = object_id;
trace_cachefiles_ondemand_close(object, &req->msg);
return 0;
}
@@ -457,16 +526,7 @@ static int cachefiles_ondemand_init_read_req(struct cachefiles_req *req,
struct cachefiles_object *object = req->object;
struct cachefiles_read *load = (void *)req->msg.data;
struct cachefiles_read_ctx *read_ctx = private;
- int object_id = object->ondemand_id;
-
- /* Stop enqueuing requests when daemon has closed anon_fd. */
- if (object_id <= 0) {
- WARN_ON_ONCE(object_id == 0);
- pr_info_once("READ: anonymous fd closed prematurely.\n");
- return -EIO;
- }
- req->msg.object_id = object_id;
load->off = read_ctx->off;
load->len = read_ctx->len;
trace_cachefiles_ondemand_read(object, &req->msg, load);
@@ -479,13 +539,16 @@ int cachefiles_ondemand_init_object(struct cachefiles_object *object)
struct fscache_volume *volume = object->volume->vcookie;
size_t volume_key_size, cookie_key_size, data_len;
+ if (!object->ondemand)
+ return 0;
+
/*
* CacheFiles will firstly check the cache file under the root cache
* directory. If the coherency check failed, it will fallback to
* creating a new tmpfile as the cache file. Reuse the previously
* allocated object ID if any.
*/
- if (object->ondemand_id > 0)
+ if (cachefiles_ondemand_object_is_open(object))
return 0;
volume_key_size = volume->key[0] + 1;
@@ -503,6 +566,28 @@ void cachefiles_ondemand_clean_object(struct cachefiles_object *object)
cachefiles_ondemand_init_close_req, NULL);
}
+int cachefiles_ondemand_init_obj_info(struct cachefiles_object *object,
+ struct cachefiles_volume *volume)
+{
+ if (!cachefiles_in_ondemand_mode(volume->cache))
+ return 0;
+
+ object->ondemand = kzalloc(sizeof(struct cachefiles_ondemand_info),
+ GFP_KERNEL);
+ if (!object->ondemand)
+ return -ENOMEM;
+
+ object->ondemand->object = object;
+ INIT_WORK(&object->ondemand->ondemand_work, ondemand_object_worker);
+ return 0;
+}
+
+void cachefiles_ondemand_deinit_obj_info(struct cachefiles_object *object)
+{
+ kfree(object->ondemand);
+ object->ondemand = NULL;
+}
+
int cachefiles_ondemand_read(struct cachefiles_object *object,
loff_t pos, size_t len)
{
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
index 94df854147d3..7249d70e1a43 100644
--- a/fs/ceph/Kconfig
+++ b/fs/ceph/Kconfig
@@ -7,6 +7,7 @@ config CEPH_FS
select CRYPTO_AES
select CRYPTO
select NETFS_SUPPORT
+ select FS_ENCRYPTION_ALGS if FS_ENCRYPTION
default n
help
Choose Y or M here to include support for mounting the
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 85be3bf18cdf..1340d77124ae 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -159,27 +159,7 @@ static void ceph_invalidate_folio(struct folio *folio, size_t offset,
ceph_put_snap_context(snapc);
}
- folio_wait_fscache(folio);
-}
-
-static bool ceph_release_folio(struct folio *folio, gfp_t gfp)
-{
- struct inode *inode = folio->mapping->host;
- struct ceph_client *cl = ceph_inode_to_client(inode);
-
- doutc(cl, "%llx.%llx idx %lu (%sdirty)\n", ceph_vinop(inode),
- folio->index, folio_test_dirty(folio) ? "" : "not ");
-
- if (folio_test_private(folio))
- return false;
-
- if (folio_test_fscache(folio)) {
- if (current_is_kswapd() || !(gfp & __GFP_FS))
- return false;
- folio_wait_fscache(folio);
- }
- ceph_fscache_note_page_release(inode);
- return true;
+ netfs_invalidate_folio(folio, offset, length);
}
static void ceph_netfs_expand_readahead(struct netfs_io_request *rreq)
@@ -357,6 +337,7 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
u64 len = subreq->len;
bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
u64 off = subreq->start;
+ int extent_cnt;
if (ceph_inode_is_shutdown(inode)) {
err = -EIO;
@@ -370,8 +351,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
- CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
- NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
+ CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq,
+ ci->i_truncate_size, false);
if (IS_ERR(req)) {
err = PTR_ERR(req);
req = NULL;
@@ -379,7 +360,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
}
if (sparse) {
- err = ceph_alloc_sparse_ext_map(&req->r_ops[0]);
+ extent_cnt = __ceph_sparse_read_ext_count(inode, len);
+ err = ceph_alloc_sparse_ext_map(&req->r_ops[0], extent_cnt);
if (err)
goto out;
}
@@ -509,7 +491,6 @@ static void ceph_netfs_free_request(struct netfs_io_request *rreq)
const struct netfs_request_ops ceph_netfs_ops = {
.init_request = ceph_init_request,
.free_request = ceph_netfs_free_request,
- .begin_cache_operation = ceph_begin_cache_operation,
.issue_read = ceph_netfs_issue_read,
.expand_readahead = ceph_netfs_expand_readahead,
.clamp_length = ceph_netfs_clamp_length,
@@ -907,8 +888,8 @@ static void writepages_finish(struct ceph_osd_request *req)
doutc(cl, "unlocking %p\n", page);
if (remove_page)
- generic_error_remove_page(inode->i_mapping,
- page);
+ generic_error_remove_folio(inode->i_mapping,
+ page_folio(page));
unlock_page(page);
}
@@ -1586,7 +1567,7 @@ const struct address_space_operations ceph_aops = {
.write_end = ceph_write_end,
.dirty_folio = ceph_dirty_folio,
.invalidate_folio = ceph_invalidate_folio,
- .release_folio = ceph_release_folio,
+ .release_folio = netfs_release_folio,
.direct_IO = noop_direct_IO,
};
diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h
index dc502daac49a..20efac020394 100644
--- a/fs/ceph/cache.h
+++ b/fs/ceph/cache.h
@@ -43,38 +43,19 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
}
}
-static inline void ceph_fscache_unpin_writeback(struct inode *inode,
+static inline int ceph_fscache_unpin_writeback(struct inode *inode,
struct writeback_control *wbc)
{
- fscache_unpin_writeback(wbc, ceph_fscache_cookie(ceph_inode(inode)));
+ return netfs_unpin_writeback(inode, wbc);
}
-static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
- struct folio *folio)
-{
- struct ceph_inode_info *ci = ceph_inode(mapping->host);
-
- return fscache_dirty_folio(mapping, folio, ceph_fscache_cookie(ci));
-}
-
-static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
-{
- struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode));
-
- return fscache_begin_read_operation(&rreq->cache_resources, cookie);
-}
+#define ceph_fscache_dirty_folio netfs_dirty_folio
static inline bool ceph_is_cache_enabled(struct inode *inode)
{
return fscache_cookie_enabled(ceph_fscache_cookie(ceph_inode(inode)));
}
-static inline void ceph_fscache_note_page_release(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
-
- fscache_note_page_release(ceph_fscache_cookie(ci));
-}
#else /* CONFIG_CEPH_FSCACHE */
static inline int ceph_fscache_register_fs(struct ceph_fs_client* fsc,
struct fs_context *fc)
@@ -119,30 +100,18 @@ static inline void ceph_fscache_resize(struct inode *inode, loff_t to)
{
}
-static inline void ceph_fscache_unpin_writeback(struct inode *inode,
- struct writeback_control *wbc)
+static inline int ceph_fscache_unpin_writeback(struct inode *inode,
+ struct writeback_control *wbc)
{
+ return 0;
}
-static inline int ceph_fscache_dirty_folio(struct address_space *mapping,
- struct folio *folio)
-{
- return filemap_dirty_folio(mapping, folio);
-}
+#define ceph_fscache_dirty_folio filemap_dirty_folio
static inline bool ceph_is_cache_enabled(struct inode *inode)
{
return false;
}
-
-static inline int ceph_begin_cache_operation(struct netfs_io_request *rreq)
-{
- return -ENOBUFS;
-}
-
-static inline void ceph_fscache_note_page_release(struct inode *inode)
-{
-}
#endif /* CONFIG_CEPH_FSCACHE */
#endif
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2c0b8dc3dd0d..7fb4aae97412 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1452,7 +1452,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap,
if (flushing & CEPH_CAP_XATTR_EXCL) {
arg->old_xattr_buf = __ceph_build_xattrs_blob(ci);
arg->xattr_version = ci->i_xattrs.version;
- arg->xattr_buf = ci->i_xattrs.blob;
+ arg->xattr_buf = ceph_buffer_get(ci->i_xattrs.blob);
} else {
arg->xattr_buf = NULL;
arg->old_xattr_buf = NULL;
@@ -1553,6 +1553,7 @@ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci)
encode_cap_msg(msg, arg);
ceph_con_send(&arg->session->s_con, msg);
ceph_buffer_put(arg->old_xattr_buf);
+ ceph_buffer_put(arg->xattr_buf);
if (arg->wake)
wake_up_all(&ci->i_cap_wq);
}
@@ -2155,6 +2156,30 @@ retry:
ceph_cap_string(cap->implemented),
ceph_cap_string(revoking));
+ /* completed revocation? going down and there are no caps? */
+ if (revoking) {
+ if ((revoking & cap_used) == 0) {
+ doutc(cl, "completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /*
+ * If the "i_wrbuffer_ref" was increased by mmap or generic
+ * cache write just before the ceph_check_caps() is called,
+ * the Fb capability revoking will fail this time. Then we
+ * must wait for the BDI's delayed work to flush the dirty
+ * pages and to release the "i_wrbuffer_ref", which will cost
+ * at most 5 seconds. That means the MDS needs to wait at
+ * most 5 seconds to finished the Fb capability's revocation.
+ *
+ * Let's queue a writeback for it.
+ */
+ if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
+ (revoking & CEPH_CAP_FILE_BUFFER))
+ queue_writeback = true;
+ }
+
if (cap == ci->i_auth_cap &&
(cap->issued & CEPH_CAP_FILE_WR)) {
/* request larger max_size from MDS? */
@@ -2182,30 +2207,6 @@ retry:
}
}
- /* completed revocation? going down and there are no caps? */
- if (revoking) {
- if ((revoking & cap_used) == 0) {
- doutc(cl, "completed revocation of %s\n",
- ceph_cap_string(cap->implemented & ~cap->issued));
- goto ack;
- }
-
- /*
- * If the "i_wrbuffer_ref" was increased by mmap or generic
- * cache write just before the ceph_check_caps() is called,
- * the Fb capability revoking will fail this time. Then we
- * must wait for the BDI's delayed work to flush the dirty
- * pages and to release the "i_wrbuffer_ref", which will cost
- * at most 5 seconds. That means the MDS needs to wait at
- * most 5 seconds to finished the Fb capability's revocation.
- *
- * Let's queue a writeback for it.
- */
- if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
- (revoking & CEPH_CAP_FILE_BUFFER))
- queue_writeback = true;
- }
-
/* want more caps from mds? */
if (want & ~cap->mds_wanted) {
if (want & ~(cap->mds_wanted | cap->issued))
@@ -3215,7 +3216,6 @@ static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
enum put_cap_refs_mode {
PUT_CAP_REFS_SYNC = 0,
- PUT_CAP_REFS_NO_CHECK,
PUT_CAP_REFS_ASYNC,
};
@@ -3331,11 +3331,6 @@ void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had)
__ceph_put_cap_refs(ci, had, PUT_CAP_REFS_ASYNC);
}
-void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci, int had)
-{
- __ceph_put_cap_refs(ci, had, PUT_CAP_REFS_NO_CHECK);
-}
-
/*
* Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
* context. Adjust per-snap dirty page accounting as appropriate.
@@ -4777,7 +4772,22 @@ int ceph_drop_caps_for_unlink(struct inode *inode)
if (__ceph_caps_dirty(ci)) {
struct ceph_mds_client *mdsc =
ceph_inode_to_fs_client(inode)->mdsc;
- __cap_delay_requeue_front(mdsc, ci);
+
+ doutc(mdsc->fsc->client, "%p %llx.%llx\n", inode,
+ ceph_vinop(inode));
+ spin_lock(&mdsc->cap_unlink_delay_lock);
+ ci->i_ceph_flags |= CEPH_I_FLUSH;
+ if (!list_empty(&ci->i_cap_delay_list))
+ list_del_init(&ci->i_cap_delay_list);
+ list_add_tail(&ci->i_cap_delay_list,
+ &mdsc->cap_unlink_delay_list);
+ spin_unlock(&mdsc->cap_unlink_delay_lock);
+
+ /*
+ * Fire the work immediately, because the MDS maybe
+ * waiting for caps release.
+ */
+ ceph_queue_cap_unlink_work(mdsc);
}
}
spin_unlock(&ci->i_ceph_lock);
@@ -4887,13 +4897,15 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
struct inode *dir,
int mds, int drop, int unless)
{
- struct dentry *parent = NULL;
struct ceph_mds_request_release *rel = *p;
struct ceph_dentry_info *di = ceph_dentry(dentry);
struct ceph_client *cl;
int force = 0;
int ret;
+ /* This shouldn't happen */
+ BUG_ON(!dir);
+
/*
* force an record for the directory caps if we have a dentry lease.
* this is racy (can't take i_ceph_lock and d_lock together), but it
@@ -4903,14 +4915,9 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry,
spin_lock(&dentry->d_lock);
if (di->lease_session && di->lease_session->s_mds == mds)
force = 1;
- if (!dir) {
- parent = dget(dentry->d_parent);
- dir = d_inode(parent);
- }
spin_unlock(&dentry->d_lock);
ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
- dput(parent);
cl = ceph_inode_to_client(dir);
spin_lock(&dentry->d_lock);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 91709934c8b1..0e9f56eaba1e 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -174,7 +174,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
/*
* When possible, we try to satisfy a readdir by peeking at the
* dcache. We make this work by carefully ordering dentries on
- * d_child when we initially get results back from the MDS, and
+ * d_children when we initially get results back from the MDS, and
* falling back to a "normal" sync readdir if any dentries in the dir
* are dropped.
*
@@ -1593,10 +1593,12 @@ struct ceph_lease_walk_control {
unsigned long dir_lease_ttl;
};
+static int __dir_lease_check(const struct dentry *, struct ceph_lease_walk_control *);
+static int __dentry_lease_check(const struct dentry *);
+
static unsigned long
__dentry_leases_walk(struct ceph_mds_client *mdsc,
- struct ceph_lease_walk_control *lwc,
- int (*check)(struct dentry*, void*))
+ struct ceph_lease_walk_control *lwc)
{
struct ceph_dentry_info *di, *tmp;
struct dentry *dentry, *last = NULL;
@@ -1624,7 +1626,10 @@ __dentry_leases_walk(struct ceph_mds_client *mdsc,
goto next;
}
- ret = check(dentry, lwc);
+ if (lwc->dir_lease)
+ ret = __dir_lease_check(dentry, lwc);
+ else
+ ret = __dentry_lease_check(dentry);
if (ret & TOUCH) {
/* move it into tail of dir lease list */
__dentry_dir_lease_touch(mdsc, di);
@@ -1681,7 +1686,7 @@ next:
return freed;
}
-static int __dentry_lease_check(struct dentry *dentry, void *arg)
+static int __dentry_lease_check(const struct dentry *dentry)
{
struct ceph_dentry_info *di = ceph_dentry(dentry);
int ret;
@@ -1696,9 +1701,9 @@ static int __dentry_lease_check(struct dentry *dentry, void *arg)
return DELETE;
}
-static int __dir_lease_check(struct dentry *dentry, void *arg)
+static int __dir_lease_check(const struct dentry *dentry,
+ struct ceph_lease_walk_control *lwc)
{
- struct ceph_lease_walk_control *lwc = arg;
struct ceph_dentry_info *di = ceph_dentry(dentry);
int ret = __dir_lease_try_check(dentry);
@@ -1737,7 +1742,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc)
lwc.dir_lease = false;
lwc.nr_to_scan = CEPH_CAPS_PER_RELEASE * 2;
- freed = __dentry_leases_walk(mdsc, &lwc, __dentry_lease_check);
+ freed = __dentry_leases_walk(mdsc, &lwc);
if (!lwc.nr_to_scan) /* more invalid leases */
return -EAGAIN;
@@ -1747,7 +1752,7 @@ int ceph_trim_dentries(struct ceph_mds_client *mdsc)
lwc.dir_lease = true;
lwc.expire_dir_lease = freed < count;
lwc.dir_lease_ttl = mdsc->fsc->mount_options->caps_wanted_delay_max * HZ;
- freed +=__dentry_leases_walk(mdsc, &lwc, __dir_lease_check);
+ freed +=__dentry_leases_walk(mdsc, &lwc);
if (!lwc.nr_to_scan) /* more to check */
return -EAGAIN;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 726af69d4d62..a79f163ae4ed 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -286,8 +286,6 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb,
doutc(cl, "%llx.%llx parent %llx hash %x err=%d", vino.ino,
vino.snap, sfh->parent_ino, sfh->hash, err);
}
- if (IS_ERR(inode))
- return ERR_CAST(inode);
/* see comments in ceph_get_parent() */
return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode);
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3b5aae29e944..abe8028d95bf 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -12,6 +12,7 @@
#include <linux/falloc.h>
#include <linux/iversion.h>
#include <linux/ktime.h>
+#include <linux/splice.h>
#include "super.h"
#include "mds_client.h"
@@ -1028,6 +1029,7 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
struct ceph_osd_req_op *op;
u64 read_off = off;
u64 read_len = len;
+ int extent_cnt;
/* determine new offset/length if encrypted */
ceph_fscrypt_adjust_off_and_len(inode, &read_off, &read_len);
@@ -1067,7 +1069,8 @@ ssize_t __ceph_sync_read(struct inode *inode, loff_t *ki_pos,
op = &req->r_ops[0];
if (sparse) {
- ret = ceph_alloc_sparse_ext_map(op);
+ extent_cnt = __ceph_sparse_read_ext_count(inode, read_len);
+ ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
if (ret) {
ceph_osdc_put_request(req);
break;
@@ -1464,6 +1467,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
ssize_t len;
struct ceph_osd_req_op *op;
int readop = sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ;
+ int extent_cnt;
if (write)
size = min_t(u64, size, fsc->mount_options->wsize);
@@ -1527,7 +1531,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
osd_req_op_extent_osd_data_bvecs(req, 0, bvecs, num_pages, len);
op = &req->r_ops[0];
if (sparse) {
- ret = ceph_alloc_sparse_ext_map(op);
+ extent_cnt = __ceph_sparse_read_ext_count(inode, size);
+ ret = ceph_alloc_sparse_ext_map(op, extent_cnt);
if (ret) {
ceph_osdc_put_request(req);
break;
@@ -3010,8 +3015,8 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
* {read,write}_iter, which will get caps again.
*/
put_rd_wr_caps(src_ci, src_got, dst_ci, dst_got);
- ret = do_splice_direct(src_file, &src_off, dst_file,
- &dst_off, src_objlen, flags);
+ ret = splice_file_range(src_file, &src_off, dst_file, &dst_off,
+ src_objlen);
/* Abort on short copies or on error */
if (ret < (long)src_objlen) {
doutc(cl, "Failed partial copy (%zd)\n", ret);
@@ -3065,8 +3070,8 @@ out_caps:
*/
if (len && (len < src_ci->i_layout.object_size)) {
doutc(cl, "Final partial copy of %zu bytes\n", len);
- bytes = do_splice_direct(src_file, &src_off, dst_file,
- &dst_off, len, flags);
+ bytes = splice_file_range(src_file, &src_off, dst_file,
+ &dst_off, len);
if (bytes > 0)
ret += bytes;
else
@@ -3089,8 +3094,8 @@ static ssize_t ceph_copy_file_range(struct file *src_file, loff_t src_off,
len, flags);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
- ret = generic_copy_file_range(src_file, src_off, dst_file,
- dst_off, len, flags);
+ ret = splice_copy_file_range(src_file, src_off, dst_file,
+ dst_off, len);
return ret;
}
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 0679240f06db..7b2e77517f23 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -78,6 +78,8 @@ struct inode *ceph_new_inode(struct inode *dir, struct dentry *dentry,
if (!inode)
return ERR_PTR(-ENOMEM);
+ inode->i_blkbits = CEPH_FSCRYPT_BLOCK_SHIFT;
+
if (!S_ISLNK(*mode)) {
err = ceph_pre_init_acls(dir, mode, as_ctx);
if (err < 0)
@@ -574,7 +576,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
doutc(fsc->client, "%p\n", &ci->netfs.inode);
/* Set parameters for the netfs library */
- netfs_inode_init(&ci->netfs, &ceph_netfs_ops);
+ netfs_inode_init(&ci->netfs, &ceph_netfs_ops, false);
spin_lock_init(&ci->i_ceph_lock);
@@ -694,7 +696,7 @@ void ceph_evict_inode(struct inode *inode)
percpu_counter_dec(&mdsc->metric.total_inodes);
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_FSCACHE_WB)
+ if (inode->i_state & I_PINNING_NETFS_WB)
ceph_fscache_unuse_cookie(inode, true);
clear_inode(inode);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d95eb525519a..3ab9c268a8bb 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1089,7 +1089,7 @@ void ceph_mdsc_release_request(struct kref *kref)
struct ceph_mds_request *req = container_of(kref,
struct ceph_mds_request,
r_kref);
- ceph_mdsc_release_dir_caps_no_check(req);
+ ceph_mdsc_release_dir_caps_async(req);
destroy_reply_info(&req->r_reply_info);
if (req->r_request)
ceph_msg_put(req->r_request);
@@ -1534,7 +1534,8 @@ static int encode_metric_spec(void **p, void *end)
* session message, specialization for CEPH_SESSION_REQUEST_OPEN
* to include additional client metadata fields.
*/
-static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq)
+static struct ceph_msg *
+create_session_full_msg(struct ceph_mds_client *mdsc, int op, u64 seq)
{
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
@@ -1578,6 +1579,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
size = METRIC_BYTES(count);
extra_bytes += 2 + 4 + 4 + size;
+ /* flags, mds auth caps and oldest_client_tid */
+ extra_bytes += 4 + 4 + 8;
+
/* Allocate the message */
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
GFP_NOFS, false);
@@ -1589,16 +1593,16 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
end = p + msg->front.iov_len;
h = p;
- h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN);
+ h->op = cpu_to_le32(op);
h->seq = cpu_to_le64(seq);
/*
* Serialize client metadata into waiting buffer space, using
* the format that userspace expects for map<string, string>
*
- * ClientSession messages with metadata are v4
+ * ClientSession messages with metadata are v7
*/
- msg->hdr.version = cpu_to_le16(4);
+ msg->hdr.version = cpu_to_le16(7);
msg->hdr.compat_version = cpu_to_le16(1);
/* The write pointer, following the session_head structure */
@@ -1634,6 +1638,15 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
return ERR_PTR(ret);
}
+ /* version == 5, flags */
+ ceph_encode_32(&p, 0);
+
+ /* version == 6, mds auth caps */
+ ceph_encode_32(&p, 0);
+
+ /* version == 7, oldest_client_tid */
+ ceph_encode_64(&p, mdsc->oldest_tid);
+
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
@@ -1663,7 +1676,8 @@ static int __open_session(struct ceph_mds_client *mdsc,
session->s_renew_requested = jiffies;
/* send connect message */
- msg = create_session_open_msg(mdsc, session->s_seq);
+ msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_OPEN,
+ session->s_seq);
if (IS_ERR(msg))
return PTR_ERR(msg);
ceph_con_send(&session->s_con, msg);
@@ -2028,10 +2042,10 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
doutc(cl, "to mds%d (%s)\n", session->s_mds,
ceph_mds_state_name(state));
- msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+ msg = create_session_full_msg(mdsc, CEPH_SESSION_REQUEST_RENEWCAPS,
++session->s_renew_seq);
- if (!msg)
- return -ENOMEM;
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
ceph_con_send(&session->s_con, msg);
return 0;
}
@@ -2128,7 +2142,7 @@ static bool drop_negative_children(struct dentry *dentry)
goto out;
spin_lock(&dentry->d_lock);
- list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+ hlist_for_each_entry(child, &dentry->d_children, d_sib) {
if (d_really_is_positive(child)) {
all_negative = false;
break;
@@ -2470,6 +2484,50 @@ void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr)
}
}
+void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ if (mdsc->stopping)
+ return;
+
+ if (queue_work(mdsc->fsc->cap_wq, &mdsc->cap_unlink_work)) {
+ doutc(cl, "caps unlink work queued\n");
+ } else {
+ doutc(cl, "failed to queue caps unlink work\n");
+ }
+}
+
+static void ceph_cap_unlink_work(struct work_struct *work)
+{
+ struct ceph_mds_client *mdsc =
+ container_of(work, struct ceph_mds_client, cap_unlink_work);
+ struct ceph_client *cl = mdsc->fsc->client;
+
+ doutc(cl, "begin\n");
+ spin_lock(&mdsc->cap_unlink_delay_lock);
+ while (!list_empty(&mdsc->cap_unlink_delay_list)) {
+ struct ceph_inode_info *ci;
+ struct inode *inode;
+
+ ci = list_first_entry(&mdsc->cap_unlink_delay_list,
+ struct ceph_inode_info,
+ i_cap_delay_list);
+ list_del_init(&ci->i_cap_delay_list);
+
+ inode = igrab(&ci->netfs.inode);
+ if (inode) {
+ spin_unlock(&mdsc->cap_unlink_delay_lock);
+ doutc(cl, "on %p %llx.%llx\n", inode,
+ ceph_vinop(inode));
+ ceph_check_caps(ci, CHECK_CAPS_FLUSH);
+ iput(inode);
+ spin_lock(&mdsc->cap_unlink_delay_lock);
+ }
+ }
+ spin_unlock(&mdsc->cap_unlink_delay_lock);
+ doutc(cl, "done\n");
+}
+
/*
* requests
*/
@@ -4128,12 +4186,12 @@ static void handle_session(struct ceph_mds_session *session,
pr_info_client(cl, "mds%d reconnect success\n",
session->s_mds);
+ session->s_features = features;
if (session->s_state == CEPH_MDS_SESSION_OPEN) {
pr_notice_client(cl, "mds%d is already opened\n",
session->s_mds);
} else {
session->s_state = CEPH_MDS_SESSION_OPEN;
- session->s_features = features;
renewed_caps(mdsc, session, 0);
if (test_bit(CEPHFS_FEATURE_METRIC_COLLECT,
&session->s_features))
@@ -4247,7 +4305,7 @@ void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req)
}
}
-void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
+void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req)
{
struct ceph_client *cl = req->r_mdsc->fsc->client;
int dcaps;
@@ -4255,8 +4313,7 @@ void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req)
dcaps = xchg(&req->r_dir_caps, 0);
if (dcaps) {
doutc(cl, "releasing r_dir_caps=%s\n", ceph_cap_string(dcaps));
- ceph_put_cap_refs_no_check_caps(ceph_inode(req->r_parent),
- dcaps);
+ ceph_put_cap_refs_async(ceph_inode(req->r_parent), dcaps);
}
}
@@ -4292,7 +4349,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
if (req->r_session->s_mds != session->s_mds)
continue;
- ceph_mdsc_release_dir_caps_no_check(req);
+ ceph_mdsc_release_dir_caps_async(req);
__send_request(session, req, true);
}
@@ -5346,6 +5403,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
INIT_LIST_HEAD(&mdsc->cap_delay_list);
INIT_LIST_HEAD(&mdsc->cap_wait_list);
spin_lock_init(&mdsc->cap_delay_lock);
+ INIT_LIST_HEAD(&mdsc->cap_unlink_delay_list);
+ spin_lock_init(&mdsc->cap_unlink_delay_lock);
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
mdsc->last_cap_flush_tid = 1;
@@ -5354,6 +5413,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
spin_lock_init(&mdsc->cap_dirty_lock);
init_waitqueue_head(&mdsc->cap_flushing_wq);
INIT_WORK(&mdsc->cap_reclaim_work, ceph_cap_reclaim_work);
+ INIT_WORK(&mdsc->cap_unlink_work, ceph_cap_unlink_work);
err = ceph_metric_init(&mdsc->metric);
if (err)
goto err_mdsmap;
@@ -5627,6 +5687,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
ceph_cleanup_global_and_empty_realms(mdsc);
cancel_work_sync(&mdsc->cap_reclaim_work);
+ cancel_work_sync(&mdsc->cap_unlink_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
doutc(cl, "done\n");
@@ -5870,7 +5931,8 @@ static void mds_peer_reset(struct ceph_connection *con)
pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n",
s->s_mds);
- if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO)
+ if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO &&
+ ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT)
send_mds_reconnect(mdsc, s);
}
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 2e6ddaa13d72..03f8ff00874f 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -462,6 +462,8 @@ struct ceph_mds_client {
unsigned long last_renew_caps; /* last time we renewed our caps */
struct list_head cap_delay_list; /* caps with delayed release */
spinlock_t cap_delay_lock; /* protects cap_delay_list */
+ struct list_head cap_unlink_delay_list; /* caps with delayed release for unlink */
+ spinlock_t cap_unlink_delay_lock; /* protects cap_unlink_delay_list */
struct list_head snap_flush_list; /* cap_snaps ready to flush */
spinlock_t snap_flush_lock;
@@ -475,6 +477,8 @@ struct ceph_mds_client {
struct work_struct cap_reclaim_work;
atomic_t cap_reclaim_pending;
+ struct work_struct cap_unlink_work;
+
/*
* Cap reservations
*
@@ -552,7 +556,7 @@ extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
extern void ceph_mdsc_release_dir_caps(struct ceph_mds_request *req);
-extern void ceph_mdsc_release_dir_caps_no_check(struct ceph_mds_request *req);
+extern void ceph_mdsc_release_dir_caps_async(struct ceph_mds_request *req);
static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
{
kref_get(&req->r_kref);
@@ -574,6 +578,7 @@ extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session);
extern void ceph_queue_cap_reclaim_work(struct ceph_mds_client *mdsc);
extern void ceph_reclaim_caps_nr(struct ceph_mds_client *mdsc, int nr);
+extern void ceph_queue_cap_unlink_work(struct ceph_mds_client *mdsc);
extern int ceph_iterate_session_caps(struct ceph_mds_session *session,
int (*cb)(struct inode *, int mds, void *),
void *arg);
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
index fae97c25ce58..8109aba66e02 100644
--- a/fs/ceph/mdsmap.c
+++ b/fs/ceph/mdsmap.c
@@ -380,10 +380,11 @@ struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p,
ceph_decode_skip_8(p, end, bad_ext);
/* required_client_features */
ceph_decode_skip_set(p, end, 64, bad_ext);
+ /* bal_rank_mask */
+ ceph_decode_skip_string(p, end, bad_ext);
+ }
+ if (mdsmap_ev >= 18) {
ceph_decode_64_safe(p, end, m->m_max_xattr_size, bad_ext);
- } else {
- /* This forces the usage of the (sync) SETXATTR Op */
- m->m_max_xattr_size = 0;
}
bad_ext:
doutc(cl, "m_enabled: %d, m_damaged: %d, m_num_laggy: %d\n",
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
index 89f1931f1ba6..1f2171dd01bf 100644
--- a/fs/ceph/mdsmap.h
+++ b/fs/ceph/mdsmap.h
@@ -27,7 +27,11 @@ struct ceph_mdsmap {
u32 m_session_timeout; /* seconds */
u32 m_session_autoclose; /* seconds */
u64 m_max_file_size;
- u64 m_max_xattr_size; /* maximum size for xattrs blob */
+ /*
+ * maximum size for xattrs blob.
+ * Zeroed by default to force the usage of the (sync) SETXATTR Op.
+ */
+ u64 m_max_xattr_size;
u32 m_max_mds; /* expected up:active mds number */
u32 m_num_active_mds; /* actual up:active mds number */
u32 possible_max_rank; /* possible max rank index */
diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c
index 9d36c3532de1..06ee397e0c3a 100644
--- a/fs/ceph/quota.c
+++ b/fs/ceph/quota.c
@@ -197,10 +197,10 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
}
/*
- * This function walks through the snaprealm for an inode and returns the
- * ceph_snap_realm for the first snaprealm that has quotas set (max_files,
+ * This function walks through the snaprealm for an inode and set the
+ * realmp with the first snaprealm that has quotas set (max_files,
* max_bytes, or any, depending on the 'which_quota' argument). If the root is
- * reached, return the root ceph_snap_realm instead.
+ * reached, set the realmp with the root ceph_snap_realm instead.
*
* Note that the caller is responsible for calling ceph_put_snap_realm() on the
* returned realm.
@@ -211,10 +211,9 @@ void ceph_cleanup_quotarealms_inodes(struct ceph_mds_client *mdsc)
* this function will return -EAGAIN; otherwise, the snaprealms walk-through
* will be restarted.
*/
-static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
- struct inode *inode,
- enum quota_get_realm which_quota,
- bool retry)
+static int get_quota_realm(struct ceph_mds_client *mdsc, struct inode *inode,
+ enum quota_get_realm which_quota,
+ struct ceph_snap_realm **realmp, bool retry)
{
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_inode_info *ci = NULL;
@@ -222,8 +221,10 @@ static struct ceph_snap_realm *get_quota_realm(struct ceph_mds_client *mdsc,
struct inode *in;
bool has_quota;
+ if (realmp)
+ *realmp = NULL;
if (ceph_snap(inode) != CEPH_NOSNAP)
- return NULL;
+ return 0;
restart:
realm = ceph_inode(inode)->i_snap_realm;
@@ -250,7 +251,7 @@ restart:
break;
ceph_put_snap_realm(mdsc, realm);
if (!retry)
- return ERR_PTR(-EAGAIN);
+ return -EAGAIN;
goto restart;
}
@@ -259,8 +260,11 @@ restart:
iput(in);
next = realm->parent;
- if (has_quota || !next)
- return realm;
+ if (has_quota || !next) {
+ if (realmp)
+ *realmp = realm;
+ return 0;
+ }
ceph_get_snap_realm(mdsc, next);
ceph_put_snap_realm(mdsc, realm);
@@ -269,7 +273,7 @@ restart:
if (realm)
ceph_put_snap_realm(mdsc, realm);
- return NULL;
+ return 0;
}
bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
@@ -277,6 +281,7 @@ bool ceph_quota_is_same_realm(struct inode *old, struct inode *new)
struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb);
struct ceph_snap_realm *old_realm, *new_realm;
bool is_same;
+ int ret;
restart:
/*
@@ -286,9 +291,9 @@ restart:
* dropped and we can then restart the whole operation.
*/
down_read(&mdsc->snap_rwsem);
- old_realm = get_quota_realm(mdsc, old, QUOTA_GET_ANY, true);
- new_realm = get_quota_realm(mdsc, new, QUOTA_GET_ANY, false);
- if (PTR_ERR(new_realm) == -EAGAIN) {
+ get_quota_realm(mdsc, old, QUOTA_GET_ANY, &old_realm, true);
+ ret = get_quota_realm(mdsc, new, QUOTA_GET_ANY, &new_realm, false);
+ if (ret == -EAGAIN) {
up_read(&mdsc->snap_rwsem);
if (old_realm)
ceph_put_snap_realm(mdsc, old_realm);
@@ -492,8 +497,8 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf)
bool is_updated = false;
down_read(&mdsc->snap_rwsem);
- realm = get_quota_realm(mdsc, d_inode(fsc->sb->s_root),
- QUOTA_GET_MAX_BYTES, true);
+ get_quota_realm(mdsc, d_inode(fsc->sb->s_root), QUOTA_GET_MAX_BYTES,
+ &realm, true);
up_read(&mdsc->snap_rwsem);
if (!realm)
return false;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index fe0f64a0acb2..b63b4cd9b5b6 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -3,6 +3,7 @@
#define _FS_CEPH_SUPER_H
#include <linux/ceph/ceph_debug.h>
+#include <linux/ceph/osd_client.h>
#include <asm/unaligned.h>
#include <linux/backing-dev.h>
@@ -1254,8 +1255,6 @@ extern void ceph_take_cap_refs(struct ceph_inode_info *ci, int caps,
extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
extern void ceph_put_cap_refs_async(struct ceph_inode_info *ci, int had);
-extern void ceph_put_cap_refs_no_check_caps(struct ceph_inode_info *ci,
- int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc);
extern void __ceph_remove_capsnap(struct inode *inode,
@@ -1407,6 +1406,19 @@ static inline void __ceph_update_quota(struct ceph_inode_info *ci,
ceph_adjust_quota_realms_count(&ci->netfs.inode, has_quota);
}
+static inline int __ceph_sparse_read_ext_count(struct inode *inode, u64 len)
+{
+ int cnt = 0;
+
+ if (IS_ENCRYPTED(inode)) {
+ cnt = len >> CEPH_FSCRYPT_BLOCK_SHIFT;
+ if (cnt > CEPH_SPARSE_EXT_ARRAY_INITIAL)
+ cnt = 0;
+ }
+
+ return cnt;
+}
+
extern void ceph_handle_quota(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct ceph_msg *msg);
diff --git a/fs/coda/cache.c b/fs/coda/cache.c
index 3b8c4513118f..970f0022ec52 100644
--- a/fs/coda/cache.c
+++ b/fs/coda/cache.c
@@ -93,13 +93,13 @@ static void coda_flag_children(struct dentry *parent, int flag)
struct dentry *de;
spin_lock(&parent->d_lock);
- list_for_each_entry(de, &parent->d_subdirs, d_child) {
+ hlist_for_each_entry(de, &parent->d_children, d_sib) {
+ struct inode *inode = d_inode_rcu(de);
/* don't know what to do with negative dentries */
- if (d_inode(de) )
- coda_flag_inode(d_inode(de), flag);
+ if (inode)
+ coda_flag_inode(inode, flag);
}
spin_unlock(&parent->d_lock);
- return;
}
void coda_flag_inode_children(struct inode *inode, int flag)
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 16acc58311ea..148856a582a9 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -79,14 +79,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
if (ret)
goto finish_write;
- file_start_write(host_file);
inode_lock(coda_inode);
ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos, 0);
coda_inode->i_size = file_inode(host_file)->i_size;
coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
inode_set_mtime_to_ts(coda_inode, inode_set_ctime_current(coda_inode));
inode_unlock(coda_inode);
- file_end_write(host_file);
finish_write:
venus_access_intent(coda_inode->i_sb, coda_i2f(coda_inode),
diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c
index a247c14aaab7..9f2d5743e2c8 100644
--- a/fs/coda/sysctl.c
+++ b/fs/coda/sysctl.c
@@ -36,7 +36,6 @@ static struct ctl_table coda_table[] = {
.mode = 0600,
.proc_handler = proc_dointvec
},
- {}
};
void coda_sysctl_init(void)
diff --git a/fs/coredump.c b/fs/coredump.c
index 9d235fa14ab9..be6403b4b14b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -872,6 +872,9 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
loff_t pos;
ssize_t n;
+ if (!page)
+ return 0;
+
if (cprm->to_skip) {
if (!__dump_skip(cprm, cprm->to_skip))
return 0;
@@ -884,7 +887,6 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
pos = file->f_pos;
bvec_set_page(&bvec, page, PAGE_SIZE, 0);
iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
- iov_iter_set_copy_mc(&iter);
n = __kernel_write_iter(cprm->file, &iter, &pos);
if (n != PAGE_SIZE)
return 0;
@@ -895,10 +897,44 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
return 1;
}
+/*
+ * If we might get machine checks from kernel accesses during the
+ * core dump, let's get those errors early rather than during the
+ * IO. This is not performance-critical enough to warrant having
+ * all the machine check logic in the iovec paths.
+ */
+#ifdef copy_mc_to_kernel
+
+#define dump_page_alloc() alloc_page(GFP_KERNEL)
+#define dump_page_free(x) __free_page(x)
+static struct page *dump_page_copy(struct page *src, struct page *dst)
+{
+ void *buf = kmap_local_page(src);
+ size_t left = copy_mc_to_kernel(page_address(dst), buf, PAGE_SIZE);
+ kunmap_local(buf);
+ return left ? NULL : dst;
+}
+
+#else
+
+/* We just want to return non-NULL; it's never used. */
+#define dump_page_alloc() ERR_PTR(-EINVAL)
+#define dump_page_free(x) ((void)(x))
+static inline struct page *dump_page_copy(struct page *src, struct page *dst)
+{
+ return src;
+}
+#endif
+
int dump_user_range(struct coredump_params *cprm, unsigned long start,
unsigned long len)
{
unsigned long addr;
+ struct page *dump_page;
+
+ dump_page = dump_page_alloc();
+ if (!dump_page)
+ return 0;
for (addr = start; addr < start + len; addr += PAGE_SIZE) {
struct page *page;
@@ -912,14 +948,17 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
*/
page = get_dump_page(addr);
if (page) {
- int stop = !dump_emit_page(cprm, page);
+ int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page));
put_page(page);
- if (stop)
+ if (stop) {
+ dump_page_free(dump_page);
return 0;
+ }
} else {
dump_skip(cprm, PAGE_SIZE);
}
}
+ dump_page_free(dump_page);
return 1;
}
#endif
@@ -981,7 +1020,6 @@ static struct ctl_table coredump_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
static int __init init_fs_coredump_sysctls(void)
diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig
index 2d0c8922f635..5aff5934baa1 100644
--- a/fs/crypto/Kconfig
+++ b/fs/crypto/Kconfig
@@ -11,7 +11,7 @@ config FS_ENCRYPTION
feature is similar to ecryptfs, but it is more memory
efficient since it avoids caching the encrypted and
decrypted pages in the page cache. Currently Ext4,
- F2FS and UBIFS make use of this feature.
+ F2FS, UBIFS, and CephFS make use of this feature.
# Filesystems supporting encryption must select this if FS_ENCRYPTION. This
# allows the algorithms to be built as modules when all the filesystems are,
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index f34a9b0b9e92..0edf0b58daa7 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -1002,9 +1002,9 @@ static int try_to_lock_encrypted_files(struct super_block *sb,
* FS_IOC_REMOVE_ENCRYPTION_KEY_ALL_USERS (all_users=true) always removes the
* key itself.
*
- * To "remove the key itself", first we wipe the actual master key secret, so
- * that no more inodes can be unlocked with it. Then we try to evict all cached
- * inodes that had been unlocked with the key.
+ * To "remove the key itself", first we transition the key to the "incompletely
+ * removed" state, so that no more inodes can be unlocked with it. Then we try
+ * to evict all cached inodes that had been unlocked with the key.
*
* If all inodes were evicted, then we unlink the fscrypt_master_key from the
* keyring. Otherwise it remains in the keyring in the "incompletely removed"
diff --git a/fs/dax.c b/fs/dax.c
index 3380b43cb6bb..423fc1607dfa 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1128,7 +1128,7 @@ static int dax_iomap_copy_around(loff_t pos, uint64_t length, size_t align_size,
/* zero the edges if srcmap is a HOLE or IOMAP_UNWRITTEN */
bool zero_edge = srcmap->flags & IOMAP_F_SHARED ||
srcmap->type == IOMAP_UNWRITTEN;
- void *saddr = 0;
+ void *saddr = NULL;
int ret = 0;
if (!zero_edge) {
diff --git a/fs/dcache.c b/fs/dcache.c
index c82ae731df9a..6ebccba33336 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -51,8 +51,8 @@
* - d_lru
* - d_count
* - d_unhashed()
- * - d_parent and d_subdirs
- * - childrens' d_child and d_parent
+ * - d_parent and d_chilren
+ * - childrens' d_sib and d_parent
* - d_u.d_alias, d_inode
*
* Ordering:
@@ -191,7 +191,6 @@ static struct ctl_table fs_dcache_sysctls[] = {
.mode = 0444,
.proc_handler = proc_nr_dentry,
},
- { }
};
static int __init init_fs_dcache_sysctls(void)
@@ -344,7 +343,7 @@ static inline void __d_set_inode_and_type(struct dentry *dentry,
dentry->d_inode = inode;
flags = READ_ONCE(dentry->d_flags);
- flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+ flags &= ~DCACHE_ENTRY_TYPE;
flags |= type_flags;
smp_store_release(&dentry->d_flags, flags);
}
@@ -353,7 +352,7 @@ static inline void __d_clear_type_and_inode(struct dentry *dentry)
{
unsigned flags = READ_ONCE(dentry->d_flags);
- flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU);
+ flags &= ~DCACHE_ENTRY_TYPE;
WRITE_ONCE(dentry->d_flags, flags);
dentry->d_inode = NULL;
if (dentry->d_flags & DCACHE_LRU_LIST)
@@ -428,7 +427,8 @@ static void d_lru_add(struct dentry *dentry)
this_cpu_inc(nr_dentry_unused);
if (d_is_negative(dentry))
this_cpu_inc(nr_dentry_negative);
- WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+ WARN_ON_ONCE(!list_lru_add_obj(
+ &dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}
static void d_lru_del(struct dentry *dentry)
@@ -438,7 +438,8 @@ static void d_lru_del(struct dentry *dentry)
this_cpu_dec(nr_dentry_unused);
if (d_is_negative(dentry))
this_cpu_dec(nr_dentry_negative);
- WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+ WARN_ON_ONCE(!list_lru_del_obj(
+ &dentry->d_sb->s_dentry_lru, &dentry->d_lru));
}
static void d_shrink_del(struct dentry *dentry)
@@ -537,7 +538,7 @@ void d_drop(struct dentry *dentry)
}
EXPORT_SYMBOL(d_drop);
-static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
+static inline void dentry_unlist(struct dentry *dentry)
{
struct dentry *next;
/*
@@ -545,12 +546,12 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
* attached to the dentry tree
*/
dentry->d_flags |= DCACHE_DENTRY_KILLED;
- if (unlikely(list_empty(&dentry->d_child)))
+ if (unlikely(hlist_unhashed(&dentry->d_sib)))
return;
- __list_del_entry(&dentry->d_child);
+ __hlist_del(&dentry->d_sib);
/*
* Cursors can move around the list of children. While we'd been
- * a normal list member, it didn't matter - ->d_child.next would've
+ * a normal list member, it didn't matter - ->d_sib.next would've
* been updated. However, from now on it won't be and for the
* things like d_walk() it might end up with a nasty surprise.
* Normally d_walk() doesn't care about cursors moving around -
@@ -558,29 +559,27 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent)
* of its own, we get through it without ever unlocking the parent.
* There is one exception, though - if we ascend from a child that
* gets killed as soon as we unlock it, the next sibling is found
- * using the value left in its ->d_child.next. And if _that_
+ * using the value left in its ->d_sib.next. And if _that_
* pointed to a cursor, and cursor got moved (e.g. by lseek())
* before d_walk() regains parent->d_lock, we'll end up skipping
* everything the cursor had been moved past.
*
- * Solution: make sure that the pointer left behind in ->d_child.next
+ * Solution: make sure that the pointer left behind in ->d_sib.next
* points to something that won't be moving around. I.e. skip the
* cursors.
*/
- while (dentry->d_child.next != &parent->d_subdirs) {
- next = list_entry(dentry->d_child.next, struct dentry, d_child);
+ while (dentry->d_sib.next) {
+ next = hlist_entry(dentry->d_sib.next, struct dentry, d_sib);
if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR)))
break;
- dentry->d_child.next = next->d_child.next;
+ dentry->d_sib.next = next->d_sib.next;
}
}
-static void __dentry_kill(struct dentry *dentry)
+static struct dentry *__dentry_kill(struct dentry *dentry)
{
struct dentry *parent = NULL;
bool can_free = true;
- if (!IS_ROOT(dentry))
- parent = dentry->d_parent;
/*
* The dentry is now unrecoverably dead to the world.
@@ -600,9 +599,6 @@ static void __dentry_kill(struct dentry *dentry)
}
/* if it was on the hash then remove it */
__d_drop(dentry);
- dentry_unlist(dentry, parent);
- if (parent)
- spin_unlock(&parent->d_lock);
if (dentry->d_inode)
dentry_unlink_inode(dentry);
else
@@ -611,80 +607,114 @@ static void __dentry_kill(struct dentry *dentry)
if (dentry->d_op && dentry->d_op->d_release)
dentry->d_op->d_release(dentry);
- spin_lock(&dentry->d_lock);
- if (dentry->d_flags & DCACHE_SHRINK_LIST) {
- dentry->d_flags |= DCACHE_MAY_FREE;
- can_free = false;
+ cond_resched();
+ /* now that it's negative, ->d_parent is stable */
+ if (!IS_ROOT(dentry)) {
+ parent = dentry->d_parent;
+ spin_lock(&parent->d_lock);
}
+ spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+ dentry_unlist(dentry);
+ if (dentry->d_flags & DCACHE_SHRINK_LIST)
+ can_free = false;
spin_unlock(&dentry->d_lock);
if (likely(can_free))
dentry_free(dentry);
- cond_resched();
-}
-
-static struct dentry *__lock_parent(struct dentry *dentry)
-{
- struct dentry *parent;
- rcu_read_lock();
- spin_unlock(&dentry->d_lock);
-again:
- parent = READ_ONCE(dentry->d_parent);
- spin_lock(&parent->d_lock);
- /*
- * We can't blindly lock dentry until we are sure
- * that we won't violate the locking order.
- * Any changes of dentry->d_parent must have
- * been done with parent->d_lock held, so
- * spin_lock() above is enough of a barrier
- * for checking if it's still our child.
- */
- if (unlikely(parent != dentry->d_parent)) {
+ if (parent && --parent->d_lockref.count) {
spin_unlock(&parent->d_lock);
- goto again;
+ return NULL;
}
- rcu_read_unlock();
- if (parent != dentry)
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- else
- parent = NULL;
return parent;
}
-static inline struct dentry *lock_parent(struct dentry *dentry)
+/*
+ * Lock a dentry for feeding it to __dentry_kill().
+ * Called under rcu_read_lock() and dentry->d_lock; the former
+ * guarantees that nothing we access will be freed under us.
+ * Note that dentry is *not* protected from concurrent dentry_kill(),
+ * d_delete(), etc.
+ *
+ * Return false if dentry is busy. Otherwise, return true and have
+ * that dentry's inode locked.
+ */
+
+static bool lock_for_kill(struct dentry *dentry)
{
- struct dentry *parent = dentry->d_parent;
- if (IS_ROOT(dentry))
- return NULL;
- if (likely(spin_trylock(&parent->d_lock)))
- return parent;
- return __lock_parent(dentry);
+ struct inode *inode = dentry->d_inode;
+
+ if (unlikely(dentry->d_lockref.count))
+ return false;
+
+ if (!inode || likely(spin_trylock(&inode->i_lock)))
+ return true;
+
+ do {
+ spin_unlock(&dentry->d_lock);
+ spin_lock(&inode->i_lock);
+ spin_lock(&dentry->d_lock);
+ if (likely(inode == dentry->d_inode))
+ break;
+ spin_unlock(&inode->i_lock);
+ inode = dentry->d_inode;
+ } while (inode);
+ if (likely(!dentry->d_lockref.count))
+ return true;
+ if (inode)
+ spin_unlock(&inode->i_lock);
+ return false;
}
-static inline bool retain_dentry(struct dentry *dentry)
+/*
+ * Decide if dentry is worth retaining. Usually this is called with dentry
+ * locked; if not locked, we are more limited and might not be able to tell
+ * without a lock. False in this case means "punt to locked path and recheck".
+ *
+ * In case we aren't locked, these predicates are not "stable". However, it is
+ * sufficient that at some point after we dropped the reference the dentry was
+ * hashed and the flags had the proper value. Other dentry users may have
+ * re-gotten a reference to the dentry and change that, but our work is done -
+ * we can leave the dentry around with a zero refcount.
+ */
+static inline bool retain_dentry(struct dentry *dentry, bool locked)
{
- WARN_ON(d_in_lookup(dentry));
+ unsigned int d_flags;
- /* Unreachable? Get rid of it */
+ smp_rmb();
+ d_flags = READ_ONCE(dentry->d_flags);
+
+ // Unreachable? Nobody would be able to look it up, no point retaining
if (unlikely(d_unhashed(dentry)))
return false;
- if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED))
+ // Same if it's disconnected
+ if (unlikely(d_flags & DCACHE_DISCONNECTED))
return false;
- if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) {
- if (dentry->d_op->d_delete(dentry))
+ // ->d_delete() might tell us not to bother, but that requires
+ // ->d_lock; can't decide without it
+ if (unlikely(d_flags & DCACHE_OP_DELETE)) {
+ if (!locked || dentry->d_op->d_delete(dentry))
return false;
}
- if (unlikely(dentry->d_flags & DCACHE_DONTCACHE))
+ // Explicitly told not to bother
+ if (unlikely(d_flags & DCACHE_DONTCACHE))
return false;
- /* retain; LRU fodder */
- dentry->d_lockref.count--;
- if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
+ // At this point it looks like we ought to keep it. We also might
+ // need to do something - put it on LRU if it wasn't there already
+ // and mark it referenced if it was on LRU, but not marked yet.
+ // Unfortunately, both actions require ->d_lock, so in lockless
+ // case we'd have to punt rather than doing those.
+ if (unlikely(!(d_flags & DCACHE_LRU_LIST))) {
+ if (!locked)
+ return false;
d_lru_add(dentry);
- else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED)))
+ } else if (unlikely(!(d_flags & DCACHE_REFERENCED))) {
+ if (!locked)
+ return false;
dentry->d_flags |= DCACHE_REFERENCED;
+ }
return true;
}
@@ -704,60 +734,11 @@ void d_mark_dontcache(struct inode *inode)
EXPORT_SYMBOL(d_mark_dontcache);
/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static struct dentry *dentry_kill(struct dentry *dentry)
- __releases(dentry->d_lock)
-{
- struct inode *inode = dentry->d_inode;
- struct dentry *parent = NULL;
-
- if (inode && unlikely(!spin_trylock(&inode->i_lock)))
- goto slow_positive;
-
- if (!IS_ROOT(dentry)) {
- parent = dentry->d_parent;
- if (unlikely(!spin_trylock(&parent->d_lock))) {
- parent = __lock_parent(dentry);
- if (likely(inode || !dentry->d_inode))
- goto got_locks;
- /* negative that became positive */
- if (parent)
- spin_unlock(&parent->d_lock);
- inode = dentry->d_inode;
- goto slow_positive;
- }
- }
- __dentry_kill(dentry);
- return parent;
-
-slow_positive:
- spin_unlock(&dentry->d_lock);
- spin_lock(&inode->i_lock);
- spin_lock(&dentry->d_lock);
- parent = lock_parent(dentry);
-got_locks:
- if (unlikely(dentry->d_lockref.count != 1)) {
- dentry->d_lockref.count--;
- } else if (likely(!retain_dentry(dentry))) {
- __dentry_kill(dentry);
- return parent;
- }
- /* we are keeping it, after all */
- if (inode)
- spin_unlock(&inode->i_lock);
- if (parent)
- spin_unlock(&parent->d_lock);
- spin_unlock(&dentry->d_lock);
- return NULL;
-}
-
-/*
* Try to do a lockless dput(), and return whether that was successful.
*
* If unsuccessful, we return false, having already taken the dentry lock.
+ * In that case refcount is guaranteed to be zero and we have already
+ * decided that it's not worth keeping around.
*
* The caller needs to hold the RCU read lock, so that the dentry is
* guaranteed to stay around even if the refcount goes down to zero!
@@ -765,18 +746,9 @@ got_locks:
static inline bool fast_dput(struct dentry *dentry)
{
int ret;
- unsigned int d_flags;
-
- /*
- * If we have a d_op->d_delete() operation, we sould not
- * let the dentry count go to zero, so use "put_or_lock".
- */
- if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
- return lockref_put_or_lock(&dentry->d_lockref);
/*
- * .. otherwise, we can try to just decrement the
- * lockref optimistically.
+ * try to decrement the lockref optimistically.
*/
ret = lockref_put_return(&dentry->d_lockref);
@@ -787,12 +759,12 @@ static inline bool fast_dput(struct dentry *dentry)
*/
if (unlikely(ret < 0)) {
spin_lock(&dentry->d_lock);
- if (dentry->d_lockref.count > 1) {
- dentry->d_lockref.count--;
+ if (WARN_ON_ONCE(dentry->d_lockref.count <= 0)) {
spin_unlock(&dentry->d_lock);
return true;
}
- return false;
+ dentry->d_lockref.count--;
+ goto locked;
}
/*
@@ -802,45 +774,18 @@ static inline bool fast_dput(struct dentry *dentry)
return true;
/*
- * Careful, careful. The reference count went down
- * to zero, but we don't hold the dentry lock, so
- * somebody else could get it again, and do another
- * dput(), and we need to not race with that.
- *
- * However, there is a very special and common case
- * where we don't care, because there is nothing to
- * do: the dentry is still hashed, it does not have
- * a 'delete' op, and it's referenced and already on
- * the LRU list.
- *
- * NOTE! Since we aren't locked, these values are
- * not "stable". However, it is sufficient that at
- * some point after we dropped the reference the
- * dentry was hashed and the flags had the proper
- * value. Other dentry users may have re-gotten
- * a reference to the dentry and change that, but
- * our work is done - we can leave the dentry
- * around with a zero refcount.
- *
- * Nevertheless, there are two cases that we should kill
- * the dentry anyway.
- * 1. free disconnected dentries as soon as their refcount
- * reached zero.
- * 2. free dentries if they should not be cached.
+ * Can we decide that decrement of refcount is all we needed without
+ * taking the lock? There's a very common case when it's all we need -
+ * dentry looks like it ought to be retained and there's nothing else
+ * to do.
*/
- smp_rmb();
- d_flags = READ_ONCE(dentry->d_flags);
- d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST |
- DCACHE_DISCONNECTED | DCACHE_DONTCACHE;
-
- /* Nothing to do? Dropping the reference was all we needed? */
- if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
+ if (retain_dentry(dentry, false))
return true;
/*
- * Not the fast normal case? Get the lock. We've already decremented
- * the refcount, but we'll need to re-check the situation after
- * getting the lock.
+ * Either not worth retaining or we can't tell without the lock.
+ * Get the lock, then. We've already decremented the refcount to 0,
+ * but we'll need to re-check the situation after getting the lock.
*/
spin_lock(&dentry->d_lock);
@@ -850,17 +795,11 @@ static inline bool fast_dput(struct dentry *dentry)
* else could have killed it and marked it dead. Either way, we
* don't need to do anything else.
*/
- if (dentry->d_lockref.count) {
+locked:
+ if (dentry->d_lockref.count || retain_dentry(dentry, true)) {
spin_unlock(&dentry->d_lock);
return true;
}
-
- /*
- * Re-get the reference we optimistically dropped. We hold the
- * lock, and we just tested that it was zero, so we can just
- * set it to 1.
- */
- dentry->d_lockref.count = 1;
return false;
}
@@ -893,39 +832,37 @@ static inline bool fast_dput(struct dentry *dentry)
*/
void dput(struct dentry *dentry)
{
- while (dentry) {
- might_sleep();
-
- rcu_read_lock();
- if (likely(fast_dput(dentry))) {
- rcu_read_unlock();
- return;
- }
-
- /* Slow case: now with the dentry lock held */
+ if (!dentry)
+ return;
+ might_sleep();
+ rcu_read_lock();
+ if (likely(fast_dput(dentry))) {
rcu_read_unlock();
-
- if (likely(retain_dentry(dentry))) {
+ return;
+ }
+ while (lock_for_kill(dentry)) {
+ rcu_read_unlock();
+ dentry = __dentry_kill(dentry);
+ if (!dentry)
+ return;
+ if (retain_dentry(dentry, true)) {
spin_unlock(&dentry->d_lock);
return;
}
-
- dentry = dentry_kill(dentry);
+ rcu_read_lock();
}
+ rcu_read_unlock();
+ spin_unlock(&dentry->d_lock);
}
EXPORT_SYMBOL(dput);
-static void __dput_to_list(struct dentry *dentry, struct list_head *list)
+static void to_shrink_list(struct dentry *dentry, struct list_head *list)
__must_hold(&dentry->d_lock)
{
- if (dentry->d_flags & DCACHE_SHRINK_LIST) {
- /* let the owner of the list it's on deal with it */
- --dentry->d_lockref.count;
- } else {
+ if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
if (dentry->d_flags & DCACHE_LRU_LIST)
d_lru_del(dentry);
- if (!--dentry->d_lockref.count)
- d_shrink_add(dentry, list);
+ d_shrink_add(dentry, list);
}
}
@@ -937,22 +874,10 @@ void dput_to_list(struct dentry *dentry, struct list_head *list)
return;
}
rcu_read_unlock();
- if (!retain_dentry(dentry))
- __dput_to_list(dentry, list);
+ to_shrink_list(dentry, list);
spin_unlock(&dentry->d_lock);
}
-/* This must be called with d_lock held */
-static inline void __dget_dlock(struct dentry *dentry)
-{
- dentry->d_lockref.count++;
-}
-
-static inline void __dget(struct dentry *dentry)
-{
- lockref_get(&dentry->d_lockref);
-}
-
struct dentry *dget_parent(struct dentry *dentry)
{
int gotref;
@@ -1002,7 +927,7 @@ static struct dentry * __d_find_any_alias(struct inode *inode)
if (hlist_empty(&inode->i_dentry))
return NULL;
alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias);
- __dget(alias);
+ lockref_get(&alias->d_lockref);
return alias;
}
@@ -1034,7 +959,7 @@ static struct dentry *__d_find_alias(struct inode *inode)
hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) {
spin_lock(&alias->d_lock);
if (!d_unhashed(alias)) {
- __dget_dlock(alias);
+ dget_dlock(alias);
spin_unlock(&alias->d_lock);
return alias;
}
@@ -1101,104 +1026,53 @@ struct dentry *d_find_alias_rcu(struct inode *inode)
*/
void d_prune_aliases(struct inode *inode)
{
+ LIST_HEAD(dispose);
struct dentry *dentry;
-restart:
+
spin_lock(&inode->i_lock);
hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {
spin_lock(&dentry->d_lock);
- if (!dentry->d_lockref.count) {
- struct dentry *parent = lock_parent(dentry);
- if (likely(!dentry->d_lockref.count)) {
- __dentry_kill(dentry);
- dput(parent);
- goto restart;
- }
- if (parent)
- spin_unlock(&parent->d_lock);
- }
+ if (!dentry->d_lockref.count)
+ to_shrink_list(dentry, &dispose);
spin_unlock(&dentry->d_lock);
}
spin_unlock(&inode->i_lock);
+ shrink_dentry_list(&dispose);
}
EXPORT_SYMBOL(d_prune_aliases);
-/*
- * Lock a dentry from shrink list.
- * Called under rcu_read_lock() and dentry->d_lock; the former
- * guarantees that nothing we access will be freed under us.
- * Note that dentry is *not* protected from concurrent dentry_kill(),
- * d_delete(), etc.
- *
- * Return false if dentry has been disrupted or grabbed, leaving
- * the caller to kick it off-list. Otherwise, return true and have
- * that dentry's inode and parent both locked.
- */
-static bool shrink_lock_dentry(struct dentry *dentry)
+static inline void shrink_kill(struct dentry *victim)
{
- struct inode *inode;
- struct dentry *parent;
-
- if (dentry->d_lockref.count)
- return false;
-
- inode = dentry->d_inode;
- if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
- spin_unlock(&dentry->d_lock);
- spin_lock(&inode->i_lock);
- spin_lock(&dentry->d_lock);
- if (unlikely(dentry->d_lockref.count))
- goto out;
- /* changed inode means that somebody had grabbed it */
- if (unlikely(inode != dentry->d_inode))
- goto out;
- }
-
- parent = dentry->d_parent;
- if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock)))
- return true;
-
- spin_unlock(&dentry->d_lock);
- spin_lock(&parent->d_lock);
- if (unlikely(parent != dentry->d_parent)) {
- spin_unlock(&parent->d_lock);
- spin_lock(&dentry->d_lock);
- goto out;
- }
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
- if (likely(!dentry->d_lockref.count))
- return true;
- spin_unlock(&parent->d_lock);
-out:
- if (inode)
- spin_unlock(&inode->i_lock);
- return false;
+ do {
+ rcu_read_unlock();
+ victim = __dentry_kill(victim);
+ rcu_read_lock();
+ } while (victim && lock_for_kill(victim));
+ rcu_read_unlock();
+ if (victim)
+ spin_unlock(&victim->d_lock);
}
void shrink_dentry_list(struct list_head *list)
{
while (!list_empty(list)) {
- struct dentry *dentry, *parent;
+ struct dentry *dentry;
dentry = list_entry(list->prev, struct dentry, d_lru);
spin_lock(&dentry->d_lock);
rcu_read_lock();
- if (!shrink_lock_dentry(dentry)) {
- bool can_free = false;
+ if (!lock_for_kill(dentry)) {
+ bool can_free;
rcu_read_unlock();
d_shrink_del(dentry);
- if (dentry->d_lockref.count < 0)
- can_free = dentry->d_flags & DCACHE_MAY_FREE;
+ can_free = dentry->d_flags & DCACHE_DENTRY_KILLED;
spin_unlock(&dentry->d_lock);
if (can_free)
dentry_free(dentry);
continue;
}
- rcu_read_unlock();
d_shrink_del(dentry);
- parent = dentry->d_parent;
- if (parent != dentry)
- __dput_to_list(parent, list);
- __dentry_kill(dentry);
+ shrink_kill(dentry);
}
}
@@ -1240,7 +1114,7 @@ static enum lru_status dentry_lru_isolate(struct list_head *item,
*
* This is guaranteed by the fact that all LRU management
* functions are intermediated by the LRU API calls like
- * list_lru_add and list_lru_del. List movement in this file
+ * list_lru_add_obj and list_lru_del_obj. List movement in this file
* only ever occur through this functions or through callbacks
* like this one, that are called from the LRU API.
*
@@ -1348,8 +1222,7 @@ enum d_walk_ret {
static void d_walk(struct dentry *parent, void *data,
enum d_walk_ret (*enter)(void *, struct dentry *))
{
- struct dentry *this_parent;
- struct list_head *next;
+ struct dentry *this_parent, *dentry;
unsigned seq = 0;
enum d_walk_ret ret;
bool retry = true;
@@ -1371,13 +1244,9 @@ again:
break;
}
repeat:
- next = this_parent->d_subdirs.next;
+ dentry = d_first_child(this_parent);
resume:
- while (next != &this_parent->d_subdirs) {
- struct list_head *tmp = next;
- struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
- next = tmp->next;
-
+ hlist_for_each_entry_from(dentry, d_sib) {
if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR))
continue;
@@ -1398,7 +1267,7 @@ resume:
continue;
}
- if (!list_empty(&dentry->d_subdirs)) {
+ if (!hlist_empty(&dentry->d_children)) {
spin_unlock(&this_parent->d_lock);
spin_release(&dentry->d_lock.dep_map, _RET_IP_);
this_parent = dentry;
@@ -1413,24 +1282,23 @@ resume:
rcu_read_lock();
ascend:
if (this_parent != parent) {
- struct dentry *child = this_parent;
- this_parent = child->d_parent;
+ dentry = this_parent;
+ this_parent = dentry->d_parent;
- spin_unlock(&child->d_lock);
+ spin_unlock(&dentry->d_lock);
spin_lock(&this_parent->d_lock);
/* might go back up the wrong parent if we have had a rename. */
if (need_seqretry(&rename_lock, seq))
goto rename_retry;
/* go into the first sibling still alive */
- do {
- next = child->d_child.next;
- if (next == &this_parent->d_subdirs)
- goto ascend;
- child = list_entry(next, struct dentry, d_child);
- } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
- rcu_read_unlock();
- goto resume;
+ hlist_for_each_entry_continue(dentry, d_sib) {
+ if (likely(!(dentry->d_flags & DCACHE_DENTRY_KILLED))) {
+ rcu_read_unlock();
+ goto resume;
+ }
+ }
+ goto ascend;
}
if (need_seqretry(&rename_lock, seq))
goto rename_retry;
@@ -1530,7 +1398,7 @@ out:
* Search the dentry child list of the specified parent,
* and move any unused dentries to the end of the unused
* list for prune_dcache(). We descend to the next level
- * whenever the d_subdirs list is non-empty and continue
+ * whenever the d_children list is non-empty and continue
* searching.
*
* It returns zero iff there are no unused children,
@@ -1560,13 +1428,11 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
if (dentry->d_flags & DCACHE_SHRINK_LIST) {
data->found++;
- } else {
- if (dentry->d_flags & DCACHE_LRU_LIST)
- d_lru_del(dentry);
- if (!dentry->d_lockref.count) {
- d_shrink_add(dentry, &data->dispose);
- data->found++;
- }
+ } else if (!dentry->d_lockref.count) {
+ to_shrink_list(dentry, &data->dispose);
+ data->found++;
+ } else if (dentry->d_lockref.count < 0) {
+ data->found++;
}
/*
* We can return to the caller if we have found some (this
@@ -1587,17 +1453,13 @@ static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry)
if (data->start == dentry)
goto out;
- if (dentry->d_flags & DCACHE_SHRINK_LIST) {
- if (!dentry->d_lockref.count) {
+ if (!dentry->d_lockref.count) {
+ if (dentry->d_flags & DCACHE_SHRINK_LIST) {
rcu_read_lock();
data->victim = dentry;
return D_WALK_QUIT;
}
- } else {
- if (dentry->d_flags & DCACHE_LRU_LIST)
- d_lru_del(dentry);
- if (!dentry->d_lockref.count)
- d_shrink_add(dentry, &data->dispose);
+ to_shrink_list(dentry, &data->dispose);
}
/*
* We can return to the caller if we have found some (this
@@ -1635,17 +1497,12 @@ void shrink_dcache_parent(struct dentry *parent)
data.victim = NULL;
d_walk(parent, &data, select_collect2);
if (data.victim) {
- struct dentry *parent;
spin_lock(&data.victim->d_lock);
- if (!shrink_lock_dentry(data.victim)) {
+ if (!lock_for_kill(data.victim)) {
spin_unlock(&data.victim->d_lock);
rcu_read_unlock();
} else {
- rcu_read_unlock();
- parent = data.victim->d_parent;
- if (parent != data.victim)
- __dput_to_list(parent, &data.dispose);
- __dentry_kill(data.victim);
+ shrink_kill(data.victim);
}
}
if (!list_empty(&data.dispose))
@@ -1657,7 +1514,7 @@ EXPORT_SYMBOL(shrink_dcache_parent);
static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
{
/* it has busy descendents; complain about those instead */
- if (!list_empty(&dentry->d_subdirs))
+ if (!hlist_empty(&dentry->d_children))
return D_WALK_CONTINUE;
/* root with refcount 1 is fine */
@@ -1707,8 +1564,7 @@ static enum d_walk_ret find_submount(void *_data, struct dentry *dentry)
{
struct dentry **victim = _data;
if (d_mountpoint(dentry)) {
- __dget_dlock(dentry);
- *victim = dentry;
+ *victim = dget_dlock(dentry);
return D_WALK_QUIT;
}
return D_WALK_CONTINUE;
@@ -1814,9 +1670,9 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
dentry->d_fsdata = NULL;
INIT_HLIST_BL_NODE(&dentry->d_hash);
INIT_LIST_HEAD(&dentry->d_lru);
- INIT_LIST_HEAD(&dentry->d_subdirs);
+ INIT_HLIST_HEAD(&dentry->d_children);
INIT_HLIST_NODE(&dentry->d_u.d_alias);
- INIT_LIST_HEAD(&dentry->d_child);
+ INIT_HLIST_NODE(&dentry->d_sib);
d_set_d_op(dentry, dentry->d_sb->s_d_op);
if (dentry->d_op && dentry->d_op->d_init) {
@@ -1853,9 +1709,8 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
* don't need child lock because it is not subject
* to concurrency here
*/
- __dget_dlock(parent);
- dentry->d_parent = parent;
- list_add(&dentry->d_child, &parent->d_subdirs);
+ dentry->d_parent = dget_dlock(parent);
+ hlist_add_head(&dentry->d_sib, &parent->d_children);
spin_unlock(&parent->d_lock);
return dentry;
@@ -1895,9 +1750,15 @@ struct dentry *d_alloc_cursor(struct dentry * parent)
*/
struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name)
{
+ static const struct dentry_operations anon_ops = {
+ .d_dname = simple_dname
+ };
struct dentry *dentry = __d_alloc(sb, name);
- if (likely(dentry))
+ if (likely(dentry)) {
dentry->d_flags |= DCACHE_NORCU;
+ if (!sb->s_d_op)
+ d_set_d_op(dentry, &anon_ops);
+ }
return dentry;
}
@@ -1941,22 +1802,6 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op)
}
EXPORT_SYMBOL(d_set_d_op);
-
-/*
- * d_set_fallthru - Mark a dentry as falling through to a lower layer
- * @dentry - The dentry to mark
- *
- * Mark a dentry as falling through to the lower layer (as set with
- * d_pin_lower()). This flag may be recorded on the medium.
- */
-void d_set_fallthru(struct dentry *dentry)
-{
- spin_lock(&dentry->d_lock);
- dentry->d_flags |= DCACHE_FALLTHRU;
- spin_unlock(&dentry->d_lock);
-}
-EXPORT_SYMBOL(d_set_fallthru);
-
static unsigned d_flags_for_inode(struct inode *inode)
{
unsigned add_flags = DCACHE_REGULAR_TYPE;
@@ -2075,75 +1920,55 @@ struct dentry *d_make_root(struct inode *root_inode)
}
EXPORT_SYMBOL(d_make_root);
-static struct dentry *__d_instantiate_anon(struct dentry *dentry,
- struct inode *inode,
- bool disconnected)
-{
- struct dentry *res;
- unsigned add_flags;
-
- security_d_instantiate(dentry, inode);
- spin_lock(&inode->i_lock);
- res = __d_find_any_alias(inode);
- if (res) {
- spin_unlock(&inode->i_lock);
- dput(dentry);
- goto out_iput;
- }
-
- /* attach a disconnected dentry */
- add_flags = d_flags_for_inode(inode);
-
- if (disconnected)
- add_flags |= DCACHE_DISCONNECTED;
-
- spin_lock(&dentry->d_lock);
- __d_set_inode_and_type(dentry, inode, add_flags);
- hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry);
- if (!disconnected) {
- hlist_bl_lock(&dentry->d_sb->s_roots);
- hlist_bl_add_head(&dentry->d_hash, &dentry->d_sb->s_roots);
- hlist_bl_unlock(&dentry->d_sb->s_roots);
- }
- spin_unlock(&dentry->d_lock);
- spin_unlock(&inode->i_lock);
-
- return dentry;
-
- out_iput:
- iput(inode);
- return res;
-}
-
-struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode)
-{
- return __d_instantiate_anon(dentry, inode, true);
-}
-EXPORT_SYMBOL(d_instantiate_anon);
-
static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected)
{
- struct dentry *tmp;
- struct dentry *res;
+ struct super_block *sb;
+ struct dentry *new, *res;
if (!inode)
return ERR_PTR(-ESTALE);
if (IS_ERR(inode))
return ERR_CAST(inode);
- res = d_find_any_alias(inode);
+ sb = inode->i_sb;
+
+ res = d_find_any_alias(inode); /* existing alias? */
if (res)
- goto out_iput;
+ goto out;
- tmp = d_alloc_anon(inode->i_sb);
- if (!tmp) {
+ new = d_alloc_anon(sb);
+ if (!new) {
res = ERR_PTR(-ENOMEM);
- goto out_iput;
+ goto out;
}
- return __d_instantiate_anon(tmp, inode, disconnected);
+ security_d_instantiate(new, inode);
+ spin_lock(&inode->i_lock);
+ res = __d_find_any_alias(inode); /* recheck under lock */
+ if (likely(!res)) { /* still no alias, attach a disconnected dentry */
+ unsigned add_flags = d_flags_for_inode(inode);
+
+ if (disconnected)
+ add_flags |= DCACHE_DISCONNECTED;
+
+ spin_lock(&new->d_lock);
+ __d_set_inode_and_type(new, inode, add_flags);
+ hlist_add_head(&new->d_u.d_alias, &inode->i_dentry);
+ if (!disconnected) {
+ hlist_bl_lock(&sb->s_roots);
+ hlist_bl_add_head(&new->d_hash, &sb->s_roots);
+ hlist_bl_unlock(&sb->s_roots);
+ }
+ spin_unlock(&new->d_lock);
+ spin_unlock(&inode->i_lock);
+ inode = NULL; /* consumed by new->d_inode */
+ res = new;
+ } else {
+ spin_unlock(&inode->i_lock);
+ dput(new);
+ }
-out_iput:
+ out:
iput(inode);
return res;
}
@@ -2727,7 +2552,7 @@ retry:
/* we can't take ->d_lock here; it's OK, though. */
new->d_flags |= DCACHE_PAR_LOOKUP;
new->d_wait = wq;
- hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b);
+ hlist_bl_add_head(&new->d_u.d_in_lookup_hash, b);
hlist_bl_unlock(b);
return new;
mismatch:
@@ -2851,7 +2676,7 @@ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode)
spin_unlock(&alias->d_lock);
alias = NULL;
} else {
- __dget_dlock(alias);
+ dget_dlock(alias);
__d_rehash(alias);
spin_unlock(&alias->d_lock);
}
@@ -2993,11 +2818,15 @@ static void __d_move(struct dentry *dentry, struct dentry *target,
} else {
target->d_parent = old_parent;
swap_names(dentry, target);
- list_move(&target->d_child, &target->d_parent->d_subdirs);
+ if (!hlist_unhashed(&target->d_sib))
+ __hlist_del(&target->d_sib);
+ hlist_add_head(&target->d_sib, &target->d_parent->d_children);
__d_rehash(target);
fsnotify_update_flags(target);
}
- list_move(&dentry->d_child, &dentry->d_parent->d_subdirs);
+ if (!hlist_unhashed(&dentry->d_sib))
+ __hlist_del(&dentry->d_sib);
+ hlist_add_head(&dentry->d_sib, &dentry->d_parent->d_children);
__d_rehash(dentry);
fsnotify_update_flags(dentry);
fscrypt_handle_d_move(dentry);
@@ -3080,8 +2909,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2)
* Note: If ever the locking in lock_rename() changes, then please
* remember to update this too...
*/
-static int __d_unalias(struct inode *inode,
- struct dentry *dentry, struct dentry *alias)
+static int __d_unalias(struct dentry *dentry, struct dentry *alias)
{
struct mutex *m1 = NULL;
struct rw_semaphore *m2 = NULL;
@@ -3162,7 +2990,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
inode->i_sb->s_id);
} else if (!IS_ROOT(new)) {
struct dentry *old_parent = dget(new->d_parent);
- int err = __d_unalias(inode, dentry, new);
+ int err = __d_unalias(dentry, new);
write_sequnlock(&rename_lock);
if (err) {
dput(new);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 6d7c1a49581f..c6f4a9a98b85 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -1100,17 +1100,35 @@ static ssize_t read_file_blob(struct file *file, char __user *user_buf,
return r;
}
+static ssize_t write_file_blob(struct file *file, const char __user *user_buf,
+ size_t count, loff_t *ppos)
+{
+ struct debugfs_blob_wrapper *blob = file->private_data;
+ struct dentry *dentry = F_DENTRY(file);
+ ssize_t r;
+
+ r = debugfs_file_get(dentry);
+ if (unlikely(r))
+ return r;
+ r = simple_write_to_buffer(blob->data, blob->size, ppos, user_buf,
+ count);
+
+ debugfs_file_put(dentry);
+ return r;
+}
+
static const struct file_operations fops_blob = {
.read = read_file_blob,
+ .write = write_file_blob,
.open = simple_open,
.llseek = default_llseek,
};
/**
- * debugfs_create_blob - create a debugfs file that is used to read a binary blob
+ * debugfs_create_blob - create a debugfs file that is used to read and write
+ * a binary blob
* @name: a pointer to a string containing the name of the file to create.
- * @mode: the read permission that the file should have (other permissions are
- * masked out)
+ * @mode: the permission that the file should have
* @parent: a pointer to the parent dentry for this file. This should be a
* directory dentry if set. If this parameter is %NULL, then the
* file will be created in the root of the debugfs filesystem.
@@ -1119,7 +1137,7 @@ static const struct file_operations fops_blob = {
*
* This function creates a file in debugfs with the given name that exports
* @blob->data as a binary blob. If the @mode variable is so set it can be
- * read from. Writing is not supported.
+ * read from and written to.
*
* This function will return a pointer to a dentry if it succeeds. This
* pointer must be passed to the debugfs_remove() function when the file is
@@ -1134,7 +1152,7 @@ struct dentry *debugfs_create_blob(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_blob_wrapper *blob)
{
- return debugfs_create_file_unsafe(name, mode & 0444, parent, blob, &fops_blob);
+ return debugfs_create_file_unsafe(name, mode & 0644, parent, blob, &fops_blob);
}
EXPORT_SYMBOL_GPL(debugfs_create_blob);
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c830261aa883..b20e565b9c5e 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -69,7 +69,6 @@ static struct ctl_table pty_table[] = {
.data = &pty_count,
.proc_handler = proc_dointvec,
},
- {}
};
struct pts_mount_opts {
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 20533266ade6..60456263a338 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1114,7 +1114,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
loff_t offset = iocb->ki_pos;
const loff_t end = offset + count;
struct dio *dio;
- struct dio_submit sdio = { 0, };
+ struct dio_submit sdio = { NULL, };
struct buffer_head map_bh = { 0, };
struct blk_plug plug;
unsigned long align = offset | iov_iter_alignment(iter);
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 42f332f46359..4fa11d9ddbb6 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -443,14 +443,14 @@ static int table_seq_show(struct seq_file *seq, void *iter_ptr)
break;
case 3:
if (ri->header) {
- seq_puts(seq, "version rsb 1.1 lvb 1.1 lkb 1.1\n");
+ seq_puts(seq, "rsb ptr nodeid first_lkid flags !root_list_empty !recover_list_empty recover_locks_count len\n");
ri->header = 0;
}
print_format3(ri->rsb, seq);
break;
case 4:
if (ri->header) {
- seq_puts(seq, "version 4 rsb 2\n");
+ seq_puts(seq, "rsb ptr nodeid master_nodeid dir_nodeid our_nodeid toss_time flags len str|hex name\n");
ri->header = 0;
}
print_format4(ri->rsb, seq);
@@ -748,7 +748,7 @@ static int table_open4(struct inode *inode, struct file *file)
struct seq_file *seq;
int ret;
- ret = seq_open(file, &format5_seq_ops);
+ ret = seq_open(file, &format4_seq_ops);
if (ret)
return ret;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 67f8dd8a05ef..6296c62c10fa 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1817,8 +1817,8 @@ static int dlm_tcp_bind(struct socket *sock)
memcpy(&src_addr, &dlm_local_addr[0], sizeof(src_addr));
make_sockaddr(&src_addr, 0, &addr_len);
- result = sock->ops->bind(sock, (struct sockaddr *)&src_addr,
- addr_len);
+ result = kernel_bind(sock, (struct sockaddr *)&src_addr,
+ addr_len);
if (result < 0) {
/* This *may* not indicate a critical error */
log_print("could not bind for connect: %d", result);
@@ -1830,7 +1830,7 @@ static int dlm_tcp_bind(struct socket *sock)
static int dlm_tcp_connect(struct connection *con, struct socket *sock,
struct sockaddr *addr, int addr_len)
{
- return sock->ops->connect(sock, addr, addr_len, O_NONBLOCK);
+ return kernel_connect(sock, addr, addr_len, O_NONBLOCK);
}
static int dlm_tcp_listen_validate(void)
@@ -1862,8 +1862,8 @@ static int dlm_tcp_listen_bind(struct socket *sock)
/* Bind to our port */
make_sockaddr(&dlm_local_addr[0], dlm_config.ci_tcp_port, &addr_len);
- return sock->ops->bind(sock, (struct sockaddr *)&dlm_local_addr[0],
- addr_len);
+ return kernel_bind(sock, (struct sockaddr *)&dlm_local_addr[0],
+ addr_len);
}
static const struct dlm_proto_ops dlm_tcp_ops = {
@@ -1888,12 +1888,12 @@ static int dlm_sctp_connect(struct connection *con, struct socket *sock,
int ret;
/*
- * Make sock->ops->connect() function return in specified time,
+ * Make kernel_connect() function return in specified time,
* since O_NONBLOCK argument in connect() function does not work here,
* then, we should restore the default value of this attribute.
*/
sock_set_sndtimeo(sock->sk, 5);
- ret = sock->ops->connect(sock, addr, addr_len, 0);
+ ret = kernel_connect(sock, addr, addr_len, 0);
sock_set_sndtimeo(sock->sk, 0);
return ret;
}
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index e6b4c1a21446..d814c5121367 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -140,11 +140,12 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
op->info.optype = DLM_PLOCK_OP_LOCK;
op->info.pid = fl->fl_pid;
op->info.ex = (fl->fl_type == F_WRLCK);
- op->info.wait = IS_SETLKW(cmd);
+ op->info.wait = !!(fl->fl_flags & FL_SLEEP);
op->info.fsid = ls->ls_global_id;
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
+ op->info.owner = (__u64)(long)fl->fl_owner;
/* async handling */
if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
op_data = kzalloc(sizeof(*op_data), GFP_NOFS);
@@ -154,9 +155,6 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
goto out;
}
- /* fl_owner is lockd which doesn't distinguish
- processes on the nfs client */
- op->info.owner = (__u64) fl->fl_pid;
op_data->callback = fl->fl_lmops->lm_grant;
locks_init_lock(&op_data->flc);
locks_copy_lock(&op_data->flc, fl);
@@ -168,8 +166,6 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
send_op(op);
rv = FILE_LOCK_DEFERRED;
goto out;
- } else {
- op->info.owner = (__u64)(long) fl->fl_owner;
}
send_op(op);
@@ -326,10 +322,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
- if (fl->fl_lmops && fl->fl_lmops->lm_grant)
- op->info.owner = (__u64) fl->fl_pid;
- else
- op->info.owner = (__u64)(long) fl->fl_owner;
+ op->info.owner = (__u64)(long)fl->fl_owner;
if (fl->fl_flags & FL_CLOSE) {
op->info.flags |= DLM_PLOCK_FL_CLOSE;
@@ -389,7 +382,7 @@ int dlm_posix_cancel(dlm_lockspace_t *lockspace, u64 number, struct file *file,
info.number = number;
info.start = fl->fl_start;
info.end = fl->fl_end;
- info.owner = (__u64)fl->fl_pid;
+ info.owner = (__u64)(long)fl->fl_owner;
rv = do_lock_cancel(&info);
switch (rv) {
@@ -450,10 +443,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
op->info.number = number;
op->info.start = fl->fl_start;
op->info.end = fl->fl_end;
- if (fl->fl_lmops && fl->fl_lmops->lm_grant)
- op->info.owner = (__u64) fl->fl_pid;
- else
- op->info.owner = (__u64)(long) fl->fl_owner;
+ op->info.owner = (__u64)(long)fl->fl_owner;
send_op(op);
wait_event(recv_wq, (op->done != 0));
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index b0e8774c435a..5ed1e4cf6c0b 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -78,6 +78,14 @@ static struct inode *__ecryptfs_get_inode(struct inode *lower_inode,
if (lower_inode->i_sb != ecryptfs_superblock_to_lower(sb))
return ERR_PTR(-EXDEV);
+
+ /* Reject dealing with casefold directories. */
+ if (IS_CASEFOLDED(lower_inode)) {
+ pr_err_ratelimited("%s: Can't handle casefolded directory.\n",
+ __func__);
+ return ERR_PTR(-EREMOTE);
+ }
+
if (!igrab(lower_inode))
return ERR_PTR(-ESTALE);
inode = iget5_locked(sb, (unsigned long)lower_inode,
@@ -599,6 +607,8 @@ ecryptfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
target_inode = d_inode(new_dentry);
trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ if (IS_ERR(trap))
+ return PTR_ERR(trap);
dget(lower_new_dentry);
rc = -EINVAL;
if (lower_old_dentry->d_parent != lower_old_dir_dentry)
diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c
index 91290fe4a70b..586446e02ef7 100644
--- a/fs/efivarfs/inode.c
+++ b/fs/efivarfs/inode.c
@@ -77,6 +77,7 @@ bool efivarfs_valid_name(const char *str, int len)
static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
+ struct efivarfs_fs_info *info = dir->i_sb->s_fs_info;
struct inode *inode = NULL;
struct efivar_entry *var;
int namelen, i = 0, err = 0;
@@ -118,7 +119,7 @@ static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir,
inode->i_private = var;
kmemleak_ignore(var);
- err = efivar_entry_add(var, &efivarfs_list);
+ err = efivar_entry_add(var, &info->efivarfs_list);
if (err)
goto out;
diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h
index c66647f5c0bd..f7206158ee81 100644
--- a/fs/efivarfs/internal.h
+++ b/fs/efivarfs/internal.h
@@ -16,6 +16,9 @@ struct efivarfs_mount_opts {
struct efivarfs_fs_info {
struct efivarfs_mount_opts mount_opts;
+ struct list_head efivarfs_list;
+ struct super_block *sb;
+ struct notifier_block nb;
};
struct efi_variable {
@@ -33,8 +36,9 @@ struct efivar_entry {
struct kobject kobj;
};
-int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
- void *data, bool duplicates, struct list_head *head);
+int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
+ struct list_head *),
+ void *data, struct list_head *head);
int efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
void __efivar_entry_add(struct efivar_entry *entry, struct list_head *head);
@@ -64,6 +68,4 @@ extern struct inode *efivarfs_get_inode(struct super_block *sb,
const struct inode *dir, int mode, dev_t dev,
bool is_removable);
-extern struct list_head efivarfs_list;
-
#endif /* EFIVAR_FS_INTERNAL_H */
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 77240953a92e..bb14462f6d99 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -15,10 +15,29 @@
#include <linux/slab.h>
#include <linux/magic.h>
#include <linux/statfs.h>
+#include <linux/notifier.h>
+#include <linux/printk.h>
#include "internal.h"
-LIST_HEAD(efivarfs_list);
+static int efivarfs_ops_notifier(struct notifier_block *nb, unsigned long event,
+ void *data)
+{
+ struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info, nb);
+
+ switch (event) {
+ case EFIVAR_OPS_RDONLY:
+ sfi->sb->s_flags |= SB_RDONLY;
+ break;
+ case EFIVAR_OPS_RDWR:
+ sfi->sb->s_flags &= ~SB_RDONLY;
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
+}
static void efivarfs_evict_inode(struct inode *inode)
{
@@ -166,7 +185,8 @@ static struct dentry *efivarfs_alloc_dentry(struct dentry *parent, char *name)
}
static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
- unsigned long name_size, void *data)
+ unsigned long name_size, void *data,
+ struct list_head *list)
{
struct super_block *sb = (struct super_block *)data;
struct efivar_entry *entry;
@@ -221,7 +241,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
}
__efivar_entry_get(entry, NULL, &size, NULL);
- __efivar_entry_add(entry, &efivarfs_list);
+ __efivar_entry_add(entry, list);
/* copied by the above to local storage in the dentry. */
kfree(name);
@@ -291,13 +311,11 @@ static int efivarfs_parse_param(struct fs_context *fc, struct fs_parameter *para
static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct efivarfs_fs_info *sfi = sb->s_fs_info;
struct inode *inode = NULL;
struct dentry *root;
int err;
- if (!efivar_is_available())
- return -EOPNOTSUPP;
-
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_blocksize = PAGE_SIZE;
sb->s_blocksize_bits = PAGE_SHIFT;
@@ -319,13 +337,13 @@ static int efivarfs_fill_super(struct super_block *sb, struct fs_context *fc)
if (!root)
return -ENOMEM;
- INIT_LIST_HEAD(&efivarfs_list);
-
- err = efivar_init(efivarfs_callback, (void *)sb, true, &efivarfs_list);
+ sfi->sb = sb;
+ sfi->nb.notifier_call = efivarfs_ops_notifier;
+ err = blocking_notifier_chain_register(&efivar_ops_nh, &sfi->nb);
if (err)
- efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL);
+ return err;
- return err;
+ return efivar_init(efivarfs_callback, sb, &sfi->efivarfs_list);
}
static int efivarfs_get_tree(struct fs_context *fc)
@@ -333,19 +351,35 @@ static int efivarfs_get_tree(struct fs_context *fc)
return get_tree_single(fc, efivarfs_fill_super);
}
+static int efivarfs_reconfigure(struct fs_context *fc)
+{
+ if (!efivar_supports_writes() && !(fc->sb_flags & SB_RDONLY)) {
+ pr_err("Firmware does not support SetVariableRT. Can not remount with rw\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static const struct fs_context_operations efivarfs_context_ops = {
.get_tree = efivarfs_get_tree,
.parse_param = efivarfs_parse_param,
+ .reconfigure = efivarfs_reconfigure,
};
static int efivarfs_init_fs_context(struct fs_context *fc)
{
struct efivarfs_fs_info *sfi;
+ if (!efivar_is_available())
+ return -EOPNOTSUPP;
+
sfi = kzalloc(sizeof(*sfi), GFP_KERNEL);
if (!sfi)
return -ENOMEM;
+ INIT_LIST_HEAD(&sfi->efivarfs_list);
+
sfi->mount_opts.uid = GLOBAL_ROOT_UID;
sfi->mount_opts.gid = GLOBAL_ROOT_GID;
@@ -356,13 +390,14 @@ static int efivarfs_init_fs_context(struct fs_context *fc)
static void efivarfs_kill_sb(struct super_block *sb)
{
- kill_litter_super(sb);
+ struct efivarfs_fs_info *sfi = sb->s_fs_info;
- if (!efivar_is_available())
- return;
+ blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb);
+ kill_litter_super(sb);
/* Remove all entries and destroy */
- efivar_entry_iter(efivarfs_destroy, &efivarfs_list, NULL);
+ efivar_entry_iter(efivarfs_destroy, &sfi->efivarfs_list, NULL);
+ kfree(sfi);
}
static struct file_system_type efivarfs_type = {
diff --git a/fs/efivarfs/vars.c b/fs/efivarfs/vars.c
index 9e4f47808bd5..4d722af1014f 100644
--- a/fs/efivarfs/vars.c
+++ b/fs/efivarfs/vars.c
@@ -361,7 +361,6 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid,
* efivar_init - build the initial list of EFI variables
* @func: callback function to invoke for every variable
* @data: function-specific data to pass to @func
- * @duplicates: error if we encounter duplicates on @head?
* @head: initialised head of variable list
*
* Get every EFI variable from the firmware and invoke @func. @func
@@ -369,10 +368,11 @@ static void dup_variable_bug(efi_char16_t *str16, efi_guid_t *vendor_guid,
*
* Returns 0 on success, or a kernel error code on failure.
*/
-int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
- void *data, bool duplicates, struct list_head *head)
+int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *,
+ struct list_head *),
+ void *data, struct list_head *head)
{
- unsigned long variable_name_size = 1024;
+ unsigned long variable_name_size = 512;
efi_char16_t *variable_name;
efi_status_t status;
efi_guid_t vendor_guid;
@@ -389,12 +389,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
goto free;
/*
- * Per EFI spec, the maximum storage allocated for both
- * the variable name and variable data is 1024 bytes.
+ * A small set of old UEFI implementations reject sizes
+ * above a certain threshold, the lowest seen in the wild
+ * is 512.
*/
do {
- variable_name_size = 1024;
+ variable_name_size = 512;
status = efivar_get_next_variable(&variable_name_size,
variable_name,
@@ -412,15 +413,14 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
* we'll ever see a different variable name,
* and may end up looping here forever.
*/
- if (duplicates &&
- variable_is_present(variable_name, &vendor_guid,
+ if (variable_is_present(variable_name, &vendor_guid,
head)) {
dup_variable_bug(variable_name, &vendor_guid,
variable_name_size);
status = EFI_NOT_FOUND;
} else {
err = func(variable_name, vendor_guid,
- variable_name_size, data);
+ variable_name_size, data, head);
if (err)
status = EFI_NOT_FOUND;
}
@@ -431,9 +431,13 @@ int efivar_init(int (*func)(efi_char16_t *, efi_guid_t, unsigned long, void *),
break;
case EFI_NOT_FOUND:
break;
+ case EFI_BUFFER_TOO_SMALL:
+ pr_warn("efivars: Variable name size exceeds maximum (%lu > 512)\n",
+ variable_name_size);
+ status = EFI_NOT_FOUND;
+ break;
default:
- printk(KERN_WARNING "efivars: get_next_variable: status=%lx\n",
- status);
+ pr_warn("efivars: get_next_variable: status=%lx\n", status);
status = EFI_NOT_FOUND;
break;
}
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 1d318f85232d..fffd3919343e 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -114,8 +114,11 @@ config EROFS_FS_ZIP_DEFLATE
config EROFS_FS_ONDEMAND
bool "EROFS fscache-based on-demand read support"
- depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
- default n
+ depends on EROFS_FS
+ select NETFS_SUPPORT
+ select FSCACHE
+ select CACHEFILES
+ select CACHEFILES_ONDEMAND
help
This permits EROFS to use fscache-backed data blobs with on-demand
read support.
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 279933e007d2..7cc5841577b2 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -11,13 +11,12 @@
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
-
unsigned short pageofs_in, pageofs_out;
unsigned int inputsize, outputsize;
- /* indicate the algorithm will be used for decompression */
- unsigned int alg;
+ unsigned int alg; /* the algorithm for decompression */
bool inplace_io, partial_decoding, fillgaps;
+ gfp_t gfp; /* allocation flags for extra temporary buffers */
};
struct z_erofs_decompressor {
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index c98aeda8abb2..3d9721b3faa8 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -447,5 +447,6 @@ const struct file_operations erofs_file_fops = {
.llseek = generic_file_llseek,
.read_iter = erofs_file_read_iter,
.mmap = erofs_file_mmap,
+ .get_unmapped_area = thp_get_unmapped_area,
.splice_read = filemap_splice_read,
};
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 021be5feb1bc..2ec9b2bb628d 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -111,8 +111,9 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
victim = availables[--top];
get_page(victim);
} else {
- victim = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ victim = erofs_allocpage(pagepool, rq->gfp);
+ if (!victim)
+ return -ENOMEM;
set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
}
rq->out[i] = victim;
@@ -121,11 +122,11 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
}
static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
- void *inpage, unsigned int *inputmargin, int *maptype,
- bool may_inplace)
+ void *inpage, void *out, unsigned int *inputmargin,
+ int *maptype, bool may_inplace)
{
struct z_erofs_decompress_req *rq = ctx->rq;
- unsigned int omargin, total, i, j;
+ unsigned int omargin, total, i;
struct page **in;
void *src, *tmp;
@@ -135,12 +136,13 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize))
goto docopy;
- for (i = 0; i < ctx->inpages; ++i) {
- DBG_BUGON(rq->in[i] == NULL);
- for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j)
- if (rq->out[j] == rq->in[i])
- goto docopy;
- }
+ for (i = 0; i < ctx->inpages; ++i)
+ if (rq->out[ctx->outpages - ctx->inpages + i] !=
+ rq->in[i])
+ goto docopy;
+ kunmap_local(inpage);
+ *maptype = 3;
+ return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT);
}
if (ctx->inpages <= 1) {
@@ -148,7 +150,6 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
return inpage;
}
kunmap_local(inpage);
- might_sleep();
src = erofs_vm_map_ram(rq->in, ctx->inpages);
if (!src)
return ERR_PTR(-ENOMEM);
@@ -204,12 +205,12 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
}
static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
- u8 *out)
+ u8 *dst)
{
struct z_erofs_decompress_req *rq = ctx->rq;
bool support_0padding = false, may_inplace = false;
unsigned int inputmargin;
- u8 *headpage, *src;
+ u8 *out, *headpage, *src;
int ret, maptype;
DBG_BUGON(*rq->in == NULL);
@@ -230,11 +231,12 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
}
inputmargin = rq->pageofs_in;
- src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin,
+ src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin,
&maptype, may_inplace);
if (IS_ERR(src))
return PTR_ERR(src);
+ out = dst + rq->pageofs_out;
/* legacy format could compress extra data in a pcluster. */
if (rq->partial_decoding || !support_0padding)
ret = LZ4_decompress_safe_partial(src + inputmargin, out,
@@ -246,15 +248,9 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
if (ret != rq->outputsize) {
erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
ret, rq->inputsize, inputmargin, rq->outputsize);
-
- print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
- 16, 1, src + inputmargin, rq->inputsize, true);
- print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
- 16, 1, out, rq->outputsize, true);
-
if (ret >= 0)
memset(out + ret, 0, rq->outputsize - ret);
- ret = -EIO;
+ ret = -EFSCORRUPTED;
} else {
ret = 0;
}
@@ -265,7 +261,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
vm_unmap_ram(src, ctx->inpages);
} else if (maptype == 2) {
erofs_put_pcpubuf(src);
- } else {
+ } else if (maptype != 3) {
DBG_BUGON(1);
return -EFAULT;
}
@@ -308,7 +304,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
}
dstmap_out:
- ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out);
+ ret = z_erofs_lz4_decompress_mem(&ctx, dst);
if (!dst_maptype)
kunmap_local(dst);
else if (dst_maptype == 2)
@@ -319,43 +315,59 @@ dstmap_out:
static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
struct page **pagepool)
{
- const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
- const unsigned int outpages =
+ const unsigned int nrpages_in =
+ PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT;
+ const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int righthalf = min_t(unsigned int, rq->outputsize,
- PAGE_SIZE - rq->pageofs_out);
- const unsigned int lefthalf = rq->outputsize - righthalf;
- const unsigned int interlaced_offset =
- rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out;
- u8 *src;
-
- if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) {
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
+ const unsigned int bs = rq->sb->s_blocksize;
+ unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt;
+ u8 *kin;
- if (rq->out[0] == *rq->in) {
- DBG_BUGON(rq->pageofs_out);
- return 0;
+ if (rq->outputsize > rq->inputsize)
+ return -EOPNOTSUPP;
+ if (rq->alg == Z_EROFS_COMPRESSION_INTERLACED) {
+ cur = bs - (rq->pageofs_out & (bs - 1));
+ pi = (rq->pageofs_in + rq->inputsize - cur) & ~PAGE_MASK;
+ cur = min(cur, rq->outputsize);
+ if (cur && rq->out[0]) {
+ kin = kmap_local_page(rq->in[nrpages_in - 1]);
+ if (rq->out[0] == rq->in[nrpages_in - 1]) {
+ memmove(kin + rq->pageofs_out, kin + pi, cur);
+ flush_dcache_page(rq->out[0]);
+ } else {
+ memcpy_to_page(rq->out[0], rq->pageofs_out,
+ kin + pi, cur);
+ }
+ kunmap_local(kin);
+ }
+ rq->outputsize -= cur;
}
- src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in;
- if (rq->out[0])
- memcpy_to_page(rq->out[0], rq->pageofs_out,
- src + interlaced_offset, righthalf);
-
- if (outpages > inpages) {
- DBG_BUGON(!rq->out[outpages - 1]);
- if (rq->out[outpages - 1] != rq->in[inpages - 1]) {
- memcpy_to_page(rq->out[outpages - 1], 0, src +
- (interlaced_offset ? 0 : righthalf),
- lefthalf);
- } else if (!interlaced_offset) {
- memmove(src, src + righthalf, lefthalf);
- flush_dcache_page(rq->in[inpages - 1]);
- }
+ for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) {
+ insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize);
+ rq->outputsize -= insz;
+ if (!rq->in[ni])
+ continue;
+ kin = kmap_local_page(rq->in[ni]);
+ pi = 0;
+ do {
+ no = (rq->pageofs_out + cur + pi) >> PAGE_SHIFT;
+ po = (rq->pageofs_out + cur + pi) & ~PAGE_MASK;
+ DBG_BUGON(no >= nrpages_out);
+ cnt = min(insz - pi, PAGE_SIZE - po);
+ if (rq->out[no] == rq->in[ni]) {
+ memmove(kin + po,
+ kin + rq->pageofs_in + pi, cnt);
+ flush_dcache_page(rq->out[no]);
+ } else if (rq->out[no]) {
+ memcpy_to_page(rq->out[no], po,
+ kin + rq->pageofs_in + pi, cnt);
+ }
+ pi += cnt;
+ } while (pi < insz);
+ kunmap_local(kin);
}
- kunmap_local(src);
+ DBG_BUGON(ni > nrpages_in);
return 0;
}
@@ -398,7 +410,7 @@ int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb)
int size, ret = 0;
if (!erofs_sb_has_compr_cfgs(sbi)) {
- sbi->available_compr_algs = Z_EROFS_COMPRESSION_LZ4;
+ sbi->available_compr_algs = 1 << Z_EROFS_COMPRESSION_LZ4;
return z_erofs_load_lz4_config(sb, dsb, NULL, 0);
}
diff --git a/fs/erofs/decompressor_deflate.c b/fs/erofs/decompressor_deflate.c
index daf3c1bdeab8..b98872058abe 100644
--- a/fs/erofs/decompressor_deflate.c
+++ b/fs/erofs/decompressor_deflate.c
@@ -70,7 +70,7 @@ int __init z_erofs_deflate_init(void)
return 0;
out_failed:
- pr_err("failed to allocate zlib workspace\n");
+ erofs_err(NULL, "failed to allocate zlib workspace");
z_erofs_deflate_exit();
return -ENOMEM;
}
@@ -95,7 +95,7 @@ int z_erofs_load_deflate_config(struct super_block *sb,
}
int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+ struct page **pgpl)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -158,8 +158,12 @@ again:
strm->z.avail_out = min_t(u32, outsz, PAGE_SIZE - pofs);
outsz -= strm->z.avail_out;
if (!rq->out[no]) {
- rq->out[no] = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
+ if (!rq->out[no]) {
+ kout = NULL;
+ err = -ENOMEM;
+ break;
+ }
set_page_private(rq->out[no],
Z_EROFS_SHORTLIVED_PAGE);
}
@@ -211,8 +215,11 @@ again:
DBG_BUGON(erofs_page_is_managed(EROFS_SB(sb),
rq->in[j]));
- tmppage = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ tmppage = erofs_allocpage(pgpl, rq->gfp);
+ if (!tmppage) {
+ err = -ENOMEM;
+ goto failed;
+ }
set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
copy_highpage(tmppage, rq->in[j]);
rq->in[j] = tmppage;
@@ -230,7 +237,7 @@ again:
break;
}
}
-
+failed:
if (zlib_inflateEnd(&strm->z) != Z_OK && !err)
err = -EIO;
if (kout)
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
index 2dd14f99c1dc..6ca357d83cfa 100644
--- a/fs/erofs/decompressor_lzma.c
+++ b/fs/erofs/decompressor_lzma.c
@@ -148,7 +148,7 @@ again:
}
int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
- struct page **pagepool)
+ struct page **pgpl)
{
const unsigned int nrpages_out =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
@@ -215,8 +215,11 @@ again:
PAGE_SIZE - pageofs);
outlen -= strm->buf.out_size;
if (!rq->out[no] && rq->fillgaps) { /* deduped */
- rq->out[no] = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ rq->out[no] = erofs_allocpage(pgpl, rq->gfp);
+ if (!rq->out[no]) {
+ err = -ENOMEM;
+ break;
+ }
set_page_private(rq->out[no],
Z_EROFS_SHORTLIVED_PAGE);
}
@@ -258,8 +261,11 @@ again:
DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
rq->in[j]));
- tmppage = erofs_allocpage(pagepool,
- GFP_KERNEL | __GFP_NOFAIL);
+ tmppage = erofs_allocpage(pgpl, rq->gfp);
+ if (!tmppage) {
+ err = -ENOMEM;
+ goto failed;
+ }
set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
copy_highpage(tmppage, rq->in[j]);
rq->in[j] = tmppage;
@@ -277,6 +283,7 @@ again:
break;
}
}
+failed:
if (no < nrpages_out && strm->buf.out)
kunmap(rq->out[no]);
if (ni < nrpages_in)
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
index 87ff35bff8d5..89a7c2453aae 100644
--- a/fs/erofs/fscache.c
+++ b/fs/erofs/fscache.c
@@ -165,10 +165,10 @@ static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
{
int ret;
- struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private;
+ struct erofs_fscache *ctx = folio->mapping->host->i_private;
struct erofs_fscache_request *req;
- req = erofs_fscache_req_alloc(folio_mapping(folio),
+ req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
if (IS_ERR(req)) {
folio_unlock(folio);
@@ -276,7 +276,7 @@ static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
struct erofs_fscache_request *req;
int ret;
- req = erofs_fscache_req_alloc(folio_mapping(folio),
+ req = erofs_fscache_req_alloc(folio->mapping,
folio_pos(folio), folio_size(folio));
if (IS_ERR(req)) {
folio_unlock(folio);
@@ -381,11 +381,12 @@ static int erofs_fscache_init_domain(struct super_block *sb)
goto out;
if (!erofs_pseudo_mnt) {
- erofs_pseudo_mnt = kern_mount(&erofs_fs_type);
- if (IS_ERR(erofs_pseudo_mnt)) {
- err = PTR_ERR(erofs_pseudo_mnt);
+ struct vfsmount *mnt = kern_mount(&erofs_fs_type);
+ if (IS_ERR(mnt)) {
+ err = PTR_ERR(mnt);
goto out;
}
+ erofs_pseudo_mnt = mnt;
}
domain->volume = sbi->volume;
@@ -459,7 +460,7 @@ static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
inode->i_blkbits = EROFS_SB(sb)->blkszbits;
inode->i_private = ctx;
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 14a79d3226ab..36e638e8b53a 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -60,7 +60,7 @@ static void *erofs_read_inode(struct erofs_buf *buf,
} else {
const unsigned int gotten = sb->s_blocksize - *ofs;
- copied = kmalloc(vi->inode_isize, GFP_NOFS);
+ copied = kmalloc(vi->inode_isize, GFP_KERNEL);
if (!copied) {
err = -ENOMEM;
goto err_out;
@@ -259,8 +259,10 @@ static int erofs_fill_inode(struct inode *inode)
if (erofs_inode_is_data_compressed(vi->datalayout)) {
#ifdef CONFIG_EROFS_FS_ZIP
- if (!erofs_is_fscache_mode(inode->i_sb) &&
- inode->i_sb->s_blocksize_bits == PAGE_SHIFT) {
+ if (!erofs_is_fscache_mode(inode->i_sb)) {
+ DO_ONCE_LITE_IF(inode->i_sb->s_blocksize != PAGE_SIZE,
+ erofs_info, inode->i_sb,
+ "EXPERIMENTAL EROFS subpage compressed block support in use. Use at your own risk!");
inode->i_mapping->a_ops = &z_erofs_aops;
err = 0;
goto out_unlock;
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index d4f631d39f0f..f0110a78acb2 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -130,24 +130,24 @@ static void *erofs_find_target_block(struct erofs_buf *target,
/* string comparison without already matched prefix */
diff = erofs_dirnamecmp(name, &dname, &matched);
- if (!diff) {
- *_ndirents = 0;
- goto out;
- } else if (diff > 0) {
- head = mid + 1;
- startprfx = matched;
-
- if (!IS_ERR(candidate))
- erofs_put_metabuf(target);
- *target = buf;
- candidate = de;
- *_ndirents = ndirents;
- } else {
+ if (diff < 0) {
erofs_put_metabuf(&buf);
-
back = mid - 1;
endprfx = matched;
+ continue;
+ }
+
+ if (!IS_ERR(candidate))
+ erofs_put_metabuf(target);
+ *target = buf;
+ if (!diff) {
+ *_ndirents = 0;
+ return de;
}
+ head = mid + 1;
+ startprfx = matched;
+ candidate = de;
+ *_ndirents = ndirents;
continue;
}
out: /* free if the candidate is valid */
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 3789d6224513..5f60f163bd56 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -27,7 +27,10 @@ void _erofs_err(struct super_block *sb, const char *func, const char *fmt, ...)
vaf.fmt = fmt;
vaf.va = &args;
- pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
+ if (sb)
+ pr_err("(device %s): %s: %pV", sb->s_id, func, &vaf);
+ else
+ pr_err("%s: %pV", func, &vaf);
va_end(args);
}
@@ -41,7 +44,10 @@ void _erofs_info(struct super_block *sb, const char *func, const char *fmt, ...)
vaf.fmt = fmt;
vaf.va = &args;
- pr_info("(device %s): %pV", sb->s_id, &vaf);
+ if (sb)
+ pr_info("(device %s): %pV", sb->s_id, &vaf);
+ else
+ pr_info("%pV", &vaf);
va_end(args);
}
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index 5dea308764b4..e146d09151af 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -81,7 +81,7 @@ struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
repeat:
xa_lock(&sbi->managed_pslots);
pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
- NULL, grp, GFP_NOFS);
+ NULL, grp, GFP_KERNEL);
if (pre) {
if (xa_is_err(pre)) {
pre = ERR_PTR(xa_err(pre));
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index a7e6847f6f8f..ff0aa72b0db3 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -56,6 +56,9 @@ struct z_erofs_pcluster {
/* L: total number of bvecs */
unsigned int vcnt;
+ /* I: pcluster size (compressed size) in bytes */
+ unsigned int pclustersize;
+
/* I: page offset of start position of decompression */
unsigned short pageofs_out;
@@ -70,14 +73,6 @@ struct z_erofs_pcluster {
struct rcu_head rcu;
};
- union {
- /* I: physical cluster size in pages */
- unsigned short pclusterpages;
-
- /* I: tailpacking inline compressed size */
- unsigned short tailpacking_size;
- };
-
/* I: compression algorithm format */
unsigned char algorithmformat;
@@ -87,6 +82,9 @@ struct z_erofs_pcluster {
/* L: indicate several pageofs_outs or not */
bool multibases;
+ /* L: whether extra buffer allocations are best-effort */
+ bool besteffort;
+
/* A: compressed bvecs (can be cached or inplaced pages) */
struct z_erofs_bvec compressed_bvecs[];
};
@@ -115,9 +113,7 @@ static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
{
- if (z_erofs_is_inline_pcluster(pcl))
- return 1;
- return pcl->pclusterpages;
+ return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT;
}
/*
@@ -237,7 +233,7 @@ static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
struct page *nextpage = *candidate_bvpage;
if (!nextpage) {
- nextpage = erofs_allocpage(pagepool, GFP_NOFS);
+ nextpage = erofs_allocpage(pagepool, GFP_KERNEL);
if (!nextpage)
return -ENOMEM;
set_page_private(nextpage, Z_EROFS_SHORTLIVED_PAGE);
@@ -298,21 +294,21 @@ static int z_erofs_create_pcluster_pool(void)
return 0;
}
-static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
+static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size)
{
- int i;
+ unsigned int nrpages = PAGE_ALIGN(size) >> PAGE_SHIFT;
+ struct z_erofs_pcluster_slab *pcs = pcluster_pool;
- for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
- struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+ for (; pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
struct z_erofs_pcluster *pcl;
if (nrpages > pcs->maxpages)
continue;
- pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
+ pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL);
if (!pcl)
return ERR_PTR(-ENOMEM);
- pcl->pclusterpages = nrpages;
+ pcl->pclustersize = size;
return pcl;
}
return ERR_PTR(-EINVAL);
@@ -559,6 +555,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
{
struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
struct z_erofs_pcluster *pcl = fe->pcl;
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
bool shouldalloc = z_erofs_should_alloc_cache(fe);
bool standalone = true;
/*
@@ -569,22 +566,22 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
unsigned int i;
- if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
+ if (i_blocksize(fe->inode) != PAGE_SIZE ||
+ fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
return;
- for (i = 0; i < pcl->pclusterpages; ++i) {
- struct page *page;
+ for (i = 0; i < pclusterpages; ++i) {
+ struct page *page, *newpage;
void *t; /* mark pages just found for debugging */
- struct page *newpage = NULL;
- /* the compressed page was loaded before */
+ /* Inaccurate check w/o locking to avoid unneeded lookups */
if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
page = find_get_page(mc, pcl->obj.index + i);
-
if (page) {
t = (void *)((unsigned long)page | 1);
+ newpage = NULL;
} else {
/* I/O is needed, no possible to decompress directly */
standalone = false;
@@ -592,9 +589,8 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
continue;
/*
- * try to use cached I/O if page allocation
- * succeeds or fallback to in-place I/O instead
- * to avoid any direct reclaim.
+ * Try cached I/O if allocation succeeds or fallback to
+ * in-place I/O instead to avoid any direct reclaim.
*/
newpage = erofs_allocpage(&fe->pagepool, gfp);
if (!newpage)
@@ -602,9 +598,13 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe)
set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE);
t = (void *)((unsigned long)newpage | 1);
}
-
- if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t))
+ spin_lock(&pcl->obj.lockref.lock);
+ if (!pcl->compressed_bvecs[i].page) {
+ pcl->compressed_bvecs[i].page = t;
+ spin_unlock(&pcl->obj.lockref.lock);
continue;
+ }
+ spin_unlock(&pcl->obj.lockref.lock);
if (page)
put_page(page);
@@ -626,6 +626,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
{
struct z_erofs_pcluster *const pcl =
container_of(grp, struct z_erofs_pcluster, obj);
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
int i;
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
@@ -633,7 +634,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
* refcount of workgroup is now freezed as 0,
* therefore no need to worry about available decompression users.
*/
- for (i = 0; i < pcl->pclusterpages; ++i) {
+ for (i = 0; i < pclusterpages; ++i) {
struct page *page = pcl->compressed_bvecs[i].page;
if (!page)
@@ -657,6 +658,7 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
{
struct z_erofs_pcluster *pcl = folio_get_private(folio);
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
bool ret;
int i;
@@ -669,7 +671,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp)
goto out;
DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
- for (i = 0; i < pcl->pclusterpages; ++i) {
+ for (i = 0; i < pclusterpages; ++i) {
if (pcl->compressed_bvecs[i].page == &folio->page) {
WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
ret = true;
@@ -697,7 +699,7 @@ static void z_erofs_cache_invalidate_folio(struct folio *folio,
DBG_BUGON(stop > folio_size(folio) || stop < length);
if (offset == 0 && stop == folio_size(folio))
- while (!z_erofs_cache_release_folio(folio, GFP_NOFS))
+ while (!z_erofs_cache_release_folio(folio, 0))
cond_resched();
}
@@ -716,36 +718,30 @@ int erofs_init_managed_cache(struct super_block *sb)
set_nlink(inode, 1);
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &z_erofs_cache_aops;
- mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL);
EROFS_SB(sb)->managed_cache = inode;
return 0;
}
-static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
- struct z_erofs_bvec *bvec)
-{
- struct z_erofs_pcluster *const pcl = fe->pcl;
-
- while (fe->icur > 0) {
- if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
- NULL, bvec->page)) {
- pcl->compressed_bvecs[fe->icur] = *bvec;
- return true;
- }
- }
- return false;
-}
-
/* callers must be with pcluster lock held */
static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
struct z_erofs_bvec *bvec, bool exclusive)
{
+ struct z_erofs_pcluster *pcl = fe->pcl;
int ret;
if (exclusive) {
/* give priority for inplaceio to use file pages first */
- if (z_erofs_try_inplace_io(fe, bvec))
+ spin_lock(&pcl->obj.lockref.lock);
+ while (fe->icur > 0) {
+ if (pcl->compressed_bvecs[--fe->icur].page)
+ continue;
+ pcl->compressed_bvecs[fe->icur] = *bvec;
+ spin_unlock(&pcl->obj.lockref.lock);
return 0;
+ }
+ spin_unlock(&pcl->obj.lockref.lock);
+
/* otherwise, check if it can be used as a bvpage */
if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
!fe->candidate_bvpage)
@@ -778,20 +774,20 @@ static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
{
struct erofs_map_blocks *map = &fe->map;
+ struct super_block *sb = fe->inode->i_sb;
bool ztailpacking = map->m_flags & EROFS_MAP_META;
struct z_erofs_pcluster *pcl;
struct erofs_workgroup *grp;
int err;
if (!(map->m_flags & EROFS_MAP_ENCODED) ||
- (!ztailpacking && !(map->m_pa >> PAGE_SHIFT))) {
+ (!ztailpacking && !erofs_blknr(sb, map->m_pa))) {
DBG_BUGON(1);
return -EFSCORRUPTED;
}
/* no available pcluster, let's allocate one */
- pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
- map->m_plen >> PAGE_SHIFT);
+ pcl = z_erofs_alloc_pcluster(map->m_plen);
if (IS_ERR(pcl))
return PTR_ERR(pcl);
@@ -815,10 +811,8 @@ static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
if (ztailpacking) {
pcl->obj.index = 0; /* which indicates ztailpacking */
- pcl->pageofs_in = erofs_blkoff(fe->inode->i_sb, map->m_pa);
- pcl->tailpacking_size = map->m_plen;
} else {
- pcl->obj.index = map->m_pa >> PAGE_SHIFT;
+ pcl->obj.index = erofs_blknr(sb, map->m_pa);
grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
if (IS_ERR(grp)) {
@@ -893,6 +887,7 @@ static int z_erofs_pcluster_begin(struct z_erofs_decompress_frontend *fe)
}
get_page(map->buf.page);
WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, map->buf.page);
+ fe->pcl->pageofs_in = map->m_pa & ~PAGE_MASK;
fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
/* file-backed inplace I/O pages are traversed in reverse order */
@@ -968,17 +963,17 @@ static int z_erofs_read_fragment(struct super_block *sb, struct page *page,
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page)
+ struct page *page, bool ra)
{
struct inode *const inode = fe->inode;
struct erofs_map_blocks *const map = &fe->map;
const loff_t offset = page_offset(page);
+ const unsigned int bs = i_blocksize(inode);
bool tight = true, exclusive;
unsigned int cur, end, len, split;
int err = 0;
z_erofs_onlinepage_init(page);
-
split = 0;
end = PAGE_SIZE;
repeat:
@@ -1018,6 +1013,7 @@ repeat:
err = z_erofs_pcluster_begin(fe);
if (err)
goto out;
+ fe->pcl->besteffort |= !ra;
}
/*
@@ -1027,7 +1023,7 @@ repeat:
* for inplace I/O or bvpage (should be processed in a strict order.)
*/
tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
- exclusive = (!cur && ((split <= 1) || tight));
+ exclusive = (!cur && ((split <= 1) || (tight && bs == PAGE_SIZE)));
if (cur)
tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
@@ -1206,34 +1202,27 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
struct page *page = bvec->page;
- /* compressed pages ought to be present before decompressing */
+ /* compressed data ought to be valid before decompressing */
if (!page) {
- DBG_BUGON(1);
+ err = -EIO;
continue;
}
be->compressed_pages[i] = page;
- if (z_erofs_is_inline_pcluster(pcl)) {
+ if (z_erofs_is_inline_pcluster(pcl) ||
+ erofs_page_is_managed(EROFS_SB(be->sb), page)) {
if (!PageUptodate(page))
err = -EIO;
continue;
}
DBG_BUGON(z_erofs_page_is_invalidated(page));
- if (!z_erofs_is_shortlived_page(page)) {
- if (erofs_page_is_managed(EROFS_SB(be->sb), page)) {
- if (!PageUptodate(page))
- err = -EIO;
- continue;
- }
- z_erofs_do_decompressed_bvec(be, bvec);
- *overlapped = true;
- }
+ if (z_erofs_is_shortlived_page(page))
+ continue;
+ z_erofs_do_decompressed_bvec(be, bvec);
+ *overlapped = true;
}
-
- if (err)
- return err;
- return 0;
+ return err;
}
static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
@@ -1242,10 +1231,9 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
struct z_erofs_pcluster *pcl = be->pcl;
unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
- const struct z_erofs_decompressor *decompressor =
+ const struct z_erofs_decompressor *decomp =
&erofs_decompressors[pcl->algorithmformat];
- unsigned int i, inputsize;
- int err2;
+ int i, err2;
struct page *page;
bool overlapped;
@@ -1279,29 +1267,24 @@ static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
err2 = z_erofs_parse_in_bvecs(be, &overlapped);
if (err2)
err = err2;
- if (err)
- goto out;
-
- if (z_erofs_is_inline_pcluster(pcl))
- inputsize = pcl->tailpacking_size;
- else
- inputsize = pclusterpages * PAGE_SIZE;
-
- err = decompressor->decompress(&(struct z_erofs_decompress_req) {
+ if (!err)
+ err = decomp->decompress(&(struct z_erofs_decompress_req) {
.sb = be->sb,
.in = be->compressed_pages,
.out = be->decompressed_pages,
.pageofs_in = pcl->pageofs_in,
.pageofs_out = pcl->pageofs_out,
- .inputsize = inputsize,
+ .inputsize = pcl->pclustersize,
.outputsize = pcl->length,
.alg = pcl->algorithmformat,
.inplace_io = overlapped,
.partial_decoding = pcl->partial,
.fillgaps = pcl->multibases,
+ .gfp = pcl->besteffort ?
+ GFP_KERNEL | __GFP_NOFAIL :
+ GFP_NOWAIT | __GFP_NORETRY
}, be->pagepool);
-out:
/* must handle all compressed pages before actual file pages */
if (z_erofs_is_inline_pcluster(pcl)) {
page = pcl->compressed_bvecs[0].page;
@@ -1309,12 +1292,11 @@ out:
put_page(page);
} else {
for (i = 0; i < pclusterpages; ++i) {
- page = pcl->compressed_bvecs[i].page;
+ /* consider shortlived pages added when decompressing */
+ page = be->compressed_pages[i];
- if (erofs_page_is_managed(sbi, page))
+ if (!page || erofs_page_is_managed(sbi, page))
continue;
-
- /* recycle all individual short-lived pages */
(void)z_erofs_put_shortlivedpage(be->pagepool, page);
WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
}
@@ -1343,6 +1325,7 @@ out:
pcl->length = 0;
pcl->partial = true;
pcl->multibases = false;
+ pcl->besteffort = false;
pcl->bvset.nextpage = NULL;
pcl->vcnt = 0;
@@ -1436,86 +1419,86 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
z_erofs_decompressqueue_work(&io->u.work);
}
-static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
- unsigned int nr,
- struct page **pagepool,
- struct address_space *mc)
+static void z_erofs_fill_bio_vec(struct bio_vec *bvec,
+ struct z_erofs_decompress_frontend *f,
+ struct z_erofs_pcluster *pcl,
+ unsigned int nr,
+ struct address_space *mc)
{
- const pgoff_t index = pcl->obj.index;
gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
-
+ struct z_erofs_bvec zbv;
struct address_space *mapping;
- struct page *oldpage, *page;
- int justfound;
+ struct page *page;
+ int justfound, bs = i_blocksize(f->inode);
+ /* Except for inplace pages, the entire page can be used for I/Os */
+ bvec->bv_offset = 0;
+ bvec->bv_len = PAGE_SIZE;
repeat:
- page = READ_ONCE(pcl->compressed_bvecs[nr].page);
- oldpage = page;
-
- if (!page)
- goto out_allocpage;
-
+ spin_lock(&pcl->obj.lockref.lock);
+ zbv = pcl->compressed_bvecs[nr];
+ page = zbv.page;
justfound = (unsigned long)page & 1UL;
page = (struct page *)((unsigned long)page & ~1UL);
+ pcl->compressed_bvecs[nr].page = page;
+ spin_unlock(&pcl->obj.lockref.lock);
+ if (!page)
+ goto out_allocpage;
+ bvec->bv_page = page;
+ DBG_BUGON(z_erofs_is_shortlived_page(page));
/*
- * preallocated cached pages, which is used to avoid direct reclaim
- * otherwise, it will go inplace I/O path instead.
+ * Handle preallocated cached pages. We tried to allocate such pages
+ * without triggering direct reclaim. If allocation failed, inplace
+ * file-backed pages will be used instead.
*/
if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
- WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
set_page_private(page, 0);
tocache = true;
goto out_tocache;
}
- mapping = READ_ONCE(page->mapping);
+ mapping = READ_ONCE(page->mapping);
/*
- * file-backed online pages in plcuster are all locked steady,
- * therefore it is impossible for `mapping' to be NULL.
+ * File-backed pages for inplace I/Os are all locked steady,
+ * therefore it is impossible for `mapping` to be NULL.
*/
- if (mapping && mapping != mc)
- /* ought to be unmanaged pages */
- goto out;
-
- /* directly return for shortlived page as well */
- if (z_erofs_is_shortlived_page(page))
- goto out;
+ if (mapping && mapping != mc) {
+ if (zbv.offset < 0)
+ bvec->bv_offset = round_up(-zbv.offset, bs);
+ bvec->bv_len = round_up(zbv.end, bs) - bvec->bv_offset;
+ return;
+ }
lock_page(page);
-
/* only true if page reclaim goes wrong, should never happen */
DBG_BUGON(justfound && PagePrivate(page));
- /* the page is still in manage cache */
+ /* the cached page is still in managed cache */
if (page->mapping == mc) {
- WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
-
+ /*
+ * The cached page is still available but without a valid
+ * `->private` pcluster hint. Let's reconnect them.
+ */
if (!PagePrivate(page)) {
- /*
- * impossible to be !PagePrivate(page) for
- * the current restriction as well if
- * the page is already in compressed_bvecs[].
- */
DBG_BUGON(!justfound);
-
- justfound = 0;
- set_page_private(page, (unsigned long)pcl);
- SetPagePrivate(page);
+ /* compressed_bvecs[] already takes a ref */
+ attach_page_private(page, pcl);
+ put_page(page);
}
- /* no need to submit io if it is already up-to-date */
+ /* no need to submit if it is already up-to-date */
if (PageUptodate(page)) {
unlock_page(page);
- page = NULL;
+ bvec->bv_page = NULL;
}
- goto out;
+ return;
}
/*
- * the managed page has been truncated, it's unsafe to
- * reuse this one, let's allocate a new cache-managed page.
+ * It has been truncated, so it's unsafe to reuse this one. Let's
+ * allocate a new page for compressed data.
*/
DBG_BUGON(page->mapping);
DBG_BUGON(!justfound);
@@ -1524,25 +1507,27 @@ repeat:
unlock_page(page);
put_page(page);
out_allocpage:
- page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
- if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page,
- oldpage, page)) {
- erofs_pagepool_add(pagepool, page);
+ page = erofs_allocpage(&f->pagepool, gfp | __GFP_NOFAIL);
+ spin_lock(&pcl->obj.lockref.lock);
+ if (pcl->compressed_bvecs[nr].page) {
+ erofs_pagepool_add(&f->pagepool, page);
+ spin_unlock(&pcl->obj.lockref.lock);
cond_resched();
goto repeat;
}
+ pcl->compressed_bvecs[nr].page = page;
+ spin_unlock(&pcl->obj.lockref.lock);
+ bvec->bv_page = page;
out_tocache:
- if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
- /* turn into temporary page if fails (1 ref) */
+ if (!tocache || bs != PAGE_SIZE ||
+ add_to_page_cache_lru(page, mc, pcl->obj.index + nr, gfp)) {
+ /* turn into a temporary shortlived page (1 ref) */
set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
- goto out;
+ return;
}
attach_page_private(page, pcl);
- /* drop a refcount added by allocpage (then we have 2 refs here) */
+ /* drop a refcount added by allocpage (then 2 refs in total here) */
put_page(page);
-
-out: /* the only exit (for tracing and debugging) */
- return page;
}
static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb,
@@ -1597,7 +1582,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
qtail[JQ_BYPASS] = &pcl->next;
}
-static void z_erofs_decompressqueue_endio(struct bio *bio)
+static void z_erofs_submissionqueue_endio(struct bio *bio)
{
struct z_erofs_decompressqueue *q = bio->bi_private;
blk_status_t err = bio->bi_status;
@@ -1609,7 +1594,6 @@ static void z_erofs_decompressqueue_endio(struct bio *bio)
DBG_BUGON(PageUptodate(page));
DBG_BUGON(z_erofs_page_is_invalidated(page));
-
if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
if (!err)
SetPageUptodate(page);
@@ -1632,17 +1616,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
z_erofs_next_pcluster_t owned_head = f->owned_head;
/* bio is NULL initially, so no need to initialize last_{index,bdev} */
- pgoff_t last_index;
+ erofs_off_t last_pa;
struct block_device *last_bdev;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
unsigned long pflags;
int memstall = 0;
- /*
- * if managed cache is enabled, bypass jobqueue is needed,
- * no need to read from device for all pclusters in this queue.
- */
+ /* No need to read from device for pclusters in the bypass queue. */
q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg);
@@ -1655,7 +1636,8 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
do {
struct erofs_map_dev mdev;
struct z_erofs_pcluster *pcl;
- pgoff_t cur, end;
+ erofs_off_t cur, end;
+ struct bio_vec bvec;
unsigned int i = 0;
bool bypass = true;
@@ -1674,18 +1656,14 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
};
(void)erofs_map_dev(sb, &mdev);
- cur = erofs_blknr(sb, mdev.m_pa);
- end = cur + pcl->pclusterpages;
-
+ cur = mdev.m_pa;
+ end = cur + pcl->pclustersize;
do {
- struct page *page;
-
- page = pickup_page_for_submission(pcl, i++,
- &f->pagepool, mc);
- if (!page)
+ z_erofs_fill_bio_vec(&bvec, f, pcl, i++, mc);
+ if (!bvec.bv_page)
continue;
- if (bio && (cur != last_index + 1 ||
+ if (bio && (cur != last_pa ||
last_bdev != mdev.m_bdev)) {
submit_bio_retry:
submit_bio(bio);
@@ -1696,7 +1674,8 @@ submit_bio_retry:
bio = NULL;
}
- if (unlikely(PageWorkingset(page)) && !memstall) {
+ if (unlikely(PageWorkingset(bvec.bv_page)) &&
+ !memstall) {
psi_memstall_enter(&pflags);
memstall = 1;
}
@@ -1704,23 +1683,25 @@ submit_bio_retry:
if (!bio) {
bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
REQ_OP_READ, GFP_NOIO);
- bio->bi_end_io = z_erofs_decompressqueue_endio;
-
- last_bdev = mdev.m_bdev;
- bio->bi_iter.bi_sector = (sector_t)cur <<
- (sb->s_blocksize_bits - 9);
+ bio->bi_end_io = z_erofs_submissionqueue_endio;
+ bio->bi_iter.bi_sector = cur >> 9;
bio->bi_private = q[JQ_SUBMIT];
if (readahead)
bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
+ last_bdev = mdev.m_bdev;
}
- if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
+ if (cur + bvec.bv_len > end)
+ bvec.bv_len = end - cur;
+ DBG_BUGON(bvec.bv_len < sb->s_blocksize);
+ if (!bio_add_page(bio, bvec.bv_page, bvec.bv_len,
+ bvec.bv_offset))
goto submit_bio_retry;
- last_index = cur;
+ last_pa = cur + bvec.bv_len;
bypass = false;
- } while (++cur < end);
+ } while ((cur += bvec.bv_len) < end);
if (!bypass)
qtail[JQ_SUBMIT] = &pcl->next;
@@ -1814,7 +1795,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
if (PageUptodate(page))
unlock_page(page);
else
- (void)z_erofs_do_read_page(f, page);
+ (void)z_erofs_do_read_page(f, page, !!rac);
put_page(page);
}
@@ -1835,7 +1816,7 @@ static int z_erofs_read_folio(struct file *file, struct folio *folio)
f.headoffset = (erofs_off_t)folio->index << PAGE_SHIFT;
z_erofs_pcluster_readmore(&f, NULL, true);
- err = z_erofs_do_read_page(&f, &folio->page);
+ err = z_erofs_do_read_page(&f, &folio->page, false);
z_erofs_pcluster_readmore(&f, NULL, false);
z_erofs_pcluster_end(&f);
@@ -1876,7 +1857,7 @@ static void z_erofs_readahead(struct readahead_control *rac)
folio = head;
head = folio_get_private(folio);
- err = z_erofs_do_read_page(&f, &folio->page);
+ err = z_erofs_do_read_page(&f, &folio->page, true);
if (err && err != -EINTR)
erofs_err(inode->i_sb, "readahead error at folio %lu @ nid %llu",
folio->index, EROFS_I(inode)->nid);
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 7b55111fd533..e313c936351d 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -82,29 +82,26 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m,
}
static unsigned int decode_compactedbits(unsigned int lobits,
- unsigned int lomask,
u8 *in, unsigned int pos, u8 *type)
{
const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7);
- const unsigned int lo = v & lomask;
+ const unsigned int lo = v & ((1 << lobits) - 1);
*type = (v >> lobits) & 3;
return lo;
}
-static int get_compacted_la_distance(unsigned int lclusterbits,
+static int get_compacted_la_distance(unsigned int lobits,
unsigned int encodebits,
unsigned int vcnt, u8 *in, int i)
{
- const unsigned int lomask = (1 << lclusterbits) - 1;
unsigned int lo, d1 = 0;
u8 type;
DBG_BUGON(i >= vcnt);
do {
- lo = decode_compactedbits(lclusterbits, lomask,
- in, encodebits * i, &type);
+ lo = decode_compactedbits(lobits, in, encodebits * i, &type);
if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD)
return d1;
@@ -123,15 +120,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
{
struct erofs_inode *const vi = EROFS_I(m->inode);
const unsigned int lclusterbits = vi->z_logical_clusterbits;
- const unsigned int lomask = (1 << lclusterbits) - 1;
- unsigned int vcnt, base, lo, encodebits, nblk, eofs;
+ unsigned int vcnt, base, lo, lobits, encodebits, nblk, eofs;
int i;
u8 *in, type;
bool big_pcluster;
if (1 << amortizedshift == 4 && lclusterbits <= 14)
vcnt = 2;
- else if (1 << amortizedshift == 2 && lclusterbits == 12)
+ else if (1 << amortizedshift == 2 && lclusterbits <= 12)
vcnt = 16;
else
return -EOPNOTSUPP;
@@ -140,6 +136,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
m->nextpackoff = round_down(pos, vcnt << amortizedshift) +
(vcnt << amortizedshift);
big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
+ lobits = max(lclusterbits, ilog2(Z_EROFS_LI_D0_CBLKCNT) + 1U);
encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
eofs = erofs_blkoff(m->inode->i_sb, pos);
base = round_down(eofs, vcnt << amortizedshift);
@@ -147,15 +144,14 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
i = (eofs - base) >> amortizedshift;
- lo = decode_compactedbits(lclusterbits, lomask,
- in, encodebits * i, &type);
+ lo = decode_compactedbits(lobits, in, encodebits * i, &type);
m->type = type;
if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
m->clusterofs = 1 << lclusterbits;
/* figure out lookahead_distance: delta[1] if needed */
if (lookahead)
- m->delta[1] = get_compacted_la_distance(lclusterbits,
+ m->delta[1] = get_compacted_la_distance(lobits,
encodebits, vcnt, in, i);
if (lo & Z_EROFS_LI_D0_CBLKCNT) {
if (!big_pcluster) {
@@ -174,8 +170,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
* of which lo saves delta[1] rather than delta[0].
* Hence, get delta[0] by the previous lcluster indirectly.
*/
- lo = decode_compactedbits(lclusterbits, lomask,
- in, encodebits * (i - 1), &type);
+ lo = decode_compactedbits(lobits, in,
+ encodebits * (i - 1), &type);
if (type != Z_EROFS_LCLUSTER_TYPE_NONHEAD)
lo = 0;
else if (lo & Z_EROFS_LI_D0_CBLKCNT)
@@ -190,8 +186,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
nblk = 1;
while (i > 0) {
--i;
- lo = decode_compactedbits(lclusterbits, lomask,
- in, encodebits * i, &type);
+ lo = decode_compactedbits(lobits, in,
+ encodebits * i, &type);
if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD)
i -= lo;
@@ -202,8 +198,8 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
nblk = 0;
while (i > 0) {
--i;
- lo = decode_compactedbits(lclusterbits, lomask,
- in, encodebits * i, &type);
+ lo = decode_compactedbits(lobits, in,
+ encodebits * i, &type);
if (type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) {
if (lo & Z_EROFS_LI_D0_CBLKCNT) {
--i;
@@ -458,7 +454,7 @@ static int z_erofs_do_map_blocks(struct inode *inode,
.map = map,
};
int err = 0;
- unsigned int lclusterbits, endoff;
+ unsigned int lclusterbits, endoff, afmt;
unsigned long initial_lcn;
unsigned long long ofs, end;
@@ -547,17 +543,20 @@ static int z_erofs_do_map_blocks(struct inode *inode,
err = -EFSCORRUPTED;
goto unmap_out;
}
- if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
- map->m_algorithmformat =
- Z_EROFS_COMPRESSION_INTERLACED;
- else
- map->m_algorithmformat =
- Z_EROFS_COMPRESSION_SHIFTED;
- } else if (m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) {
- map->m_algorithmformat = vi->z_algorithmtype[1];
+ afmt = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER ?
+ Z_EROFS_COMPRESSION_INTERLACED :
+ Z_EROFS_COMPRESSION_SHIFTED;
} else {
- map->m_algorithmformat = vi->z_algorithmtype[0];
+ afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ?
+ vi->z_algorithmtype[1] : vi->z_algorithmtype[0];
+ if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) {
+ erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu",
+ afmt, vi->nid);
+ err = -EFSCORRUPTED;
+ goto unmap_out;
+ }
}
+ map->m_algorithmformat = afmt;
if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
((flags & EROFS_GET_BLOCKS_READMORE) &&
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 33a918f9566c..ad8186d47ba7 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -43,7 +43,17 @@ struct eventfd_ctx {
int id;
};
-__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask)
+/**
+ * eventfd_signal_mask - Increment the event counter
+ * @ctx: [in] Pointer to the eventfd context.
+ * @mask: [in] poll mask
+ *
+ * This function is supposed to be called by the kernel in paths that do not
+ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
+ * value, and we signal this as overflow condition by returning a EPOLLERR
+ * to poll(2).
+ */
+void eventfd_signal_mask(struct eventfd_ctx *ctx, __poll_t mask)
{
unsigned long flags;
@@ -56,45 +66,23 @@ __u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, __poll_t mask)
* safe context.
*/
if (WARN_ON_ONCE(current->in_eventfd))
- return 0;
+ return;
spin_lock_irqsave(&ctx->wqh.lock, flags);
current->in_eventfd = 1;
- if (ULLONG_MAX - ctx->count < n)
- n = ULLONG_MAX - ctx->count;
- ctx->count += n;
+ if (ctx->count < ULLONG_MAX)
+ ctx->count++;
if (waitqueue_active(&ctx->wqh))
wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
current->in_eventfd = 0;
spin_unlock_irqrestore(&ctx->wqh.lock, flags);
-
- return n;
-}
-
-/**
- * eventfd_signal - Adds @n to the eventfd counter.
- * @ctx: [in] Pointer to the eventfd context.
- * @n: [in] Value of the counter to be added to the eventfd internal counter.
- * The value cannot be negative.
- *
- * This function is supposed to be called by the kernel in paths that do not
- * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returning a EPOLLERR
- * to poll(2).
- *
- * Returns the amount by which the counter was incremented. This will be less
- * than @n if the counter has overflowed.
- */
-__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
-{
- return eventfd_signal_mask(ctx, n, 0);
}
-EXPORT_SYMBOL_GPL(eventfd_signal);
+EXPORT_SYMBOL_GPL(eventfd_signal_mask);
static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
if (ctx->id >= 0)
- ida_simple_remove(&eventfd_ida, ctx->id);
+ ida_free(&eventfd_ida, ctx->id);
kfree(ctx);
}
@@ -407,7 +395,7 @@ static int do_eventfd(unsigned int count, int flags)
init_waitqueue_head(&ctx->wqh);
ctx->count = count;
ctx->flags = flags;
- ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);
+ ctx->id = ida_alloc(&eventfd_ida, GFP_KERNEL);
flags &= EFD_SHARED_FCNTL_FLAGS;
flags |= O_RDWR;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 2877cc01cff1..3534d36a1474 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -322,7 +322,6 @@ static struct ctl_table epoll_table[] = {
.extra1 = &long_zero,
.extra2 = &long_max,
},
- { }
};
static void __init epoll_sysctls_init(void)
diff --git a/fs/exec.c b/fs/exec.c
index 4aa19b24f281..af4fbb61cd53 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -66,6 +66,7 @@
#include <linux/coredump.h>
#include <linux/time_namespace.h>
#include <linux/user_events.h>
+#include <linux/rseq.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -127,7 +128,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
struct filename *tmp = getname(library);
int error = PTR_ERR(tmp);
static const struct open_flags uselib_flags = {
- .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
+ .open_flag = O_LARGEFILE | O_RDONLY,
.acc_mode = MAY_READ | MAY_EXEC,
.intent = LOOKUP_OPEN,
.lookup_flags = LOOKUP_FOLLOW,
@@ -903,6 +904,10 @@ EXPORT_SYMBOL(transfer_args_to_stack);
#endif /* CONFIG_MMU */
+/*
+ * On success, caller must call do_close_execat() on the returned
+ * struct file to close it.
+ */
static struct file *do_open_execat(int fd, struct filename *name, int flags)
{
struct file *file;
@@ -947,6 +952,17 @@ exit:
return ERR_PTR(err);
}
+/**
+ * open_exec - Open a path name for execution
+ *
+ * @name: path name to open with the intent of executing it.
+ *
+ * Returns ERR_PTR on failure or allocated struct file on success.
+ *
+ * As this is a wrapper for the internal do_open_execat(), callers
+ * must call allow_write_access() before fput() on release. Also see
+ * do_close_execat().
+ */
struct file *open_exec(const char *name)
{
struct filename *filename = getname_kernel(name);
@@ -1408,6 +1424,9 @@ int begin_new_exec(struct linux_binprm * bprm)
out_unlock:
up_write(&me->signal->exec_update_lock);
+ if (!bprm->cred)
+ mutex_unlock(&me->signal->cred_guard_mutex);
+
out:
return retval;
}
@@ -1483,6 +1502,15 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
return -ENOMEM;
}
+/* Matches do_open_execat() */
+static void do_close_execat(struct file *file)
+{
+ if (!file)
+ return;
+ allow_write_access(file);
+ fput(file);
+}
+
static void free_bprm(struct linux_binprm *bprm)
{
if (bprm->mm) {
@@ -1494,10 +1522,7 @@ static void free_bprm(struct linux_binprm *bprm)
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
- if (bprm->file) {
- allow_write_access(bprm->file);
- fput(bprm->file);
- }
+ do_close_execat(bprm->file);
if (bprm->executable)
fput(bprm->executable);
/* If a binfmt changed the interp, free it. */
@@ -1507,12 +1532,23 @@ static void free_bprm(struct linux_binprm *bprm)
kfree(bprm);
}
-static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
+static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int flags)
{
- struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ struct linux_binprm *bprm;
+ struct file *file;
int retval = -ENOMEM;
- if (!bprm)
- goto out;
+
+ file = do_open_execat(fd, filename, flags);
+ if (IS_ERR(file))
+ return ERR_CAST(file);
+
+ bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ if (!bprm) {
+ do_close_execat(file);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ bprm->file = file;
if (fd == AT_FDCWD || filename->name[0] == '/') {
bprm->filename = filename->name;
@@ -1525,18 +1561,28 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
if (!bprm->fdpath)
goto out_free;
+ /*
+ * Record that a name derived from an O_CLOEXEC fd will be
+ * inaccessible after exec. This allows the code in exec to
+ * choose to fail when the executable is not mmaped into the
+ * interpreter and an open file descriptor is not passed to
+ * the interpreter. This makes for a better user experience
+ * than having the interpreter start and then immediately fail
+ * when it finds the executable is inaccessible.
+ */
+ if (get_close_on_exec(fd))
+ bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
+
bprm->filename = bprm->fdpath;
}
bprm->interp = bprm->filename;
retval = bprm_mm_init(bprm);
- if (retval)
- goto out_free;
- return bprm;
+ if (!retval)
+ return bprm;
out_free:
free_bprm(bprm);
-out:
return ERR_PTR(retval);
}
@@ -1578,16 +1624,16 @@ static void check_unsafe_exec(struct linux_binprm *bprm)
* will be able to manipulate the current directory, etc.
* It would be nice to force an unshare instead...
*/
- t = p;
n_fs = 1;
spin_lock(&p->fs->lock);
rcu_read_lock();
- while_each_thread(p, t) {
+ for_other_threads(p, t) {
if (t->fs == p->fs)
n_fs++;
}
rcu_read_unlock();
+ /* "users" and "in_exec" locked for copy_fs() */
if (p->fs->users > n_fs)
bprm->unsafe |= LSM_UNSAFE_SHARE;
else
@@ -1804,13 +1850,8 @@ static int exec_binprm(struct linux_binprm *bprm)
return 0;
}
-/*
- * sys_execve() executes a new program.
- */
-static int bprm_execve(struct linux_binprm *bprm,
- int fd, struct filename *filename, int flags)
+static int bprm_execve(struct linux_binprm *bprm)
{
- struct file *file;
int retval;
retval = prepare_bprm_creds(bprm);
@@ -1826,26 +1867,8 @@ static int bprm_execve(struct linux_binprm *bprm,
current->in_execve = 1;
sched_mm_cid_before_execve(current);
- file = do_open_execat(fd, filename, flags);
- retval = PTR_ERR(file);
- if (IS_ERR(file))
- goto out_unmark;
-
sched_exec();
- bprm->file = file;
- /*
- * Record that a name derived from an O_CLOEXEC fd will be
- * inaccessible after exec. This allows the code in exec to
- * choose to fail when the executable is not mmaped into the
- * interpreter and an open file descriptor is not passed to
- * the interpreter. This makes for a better user experience
- * than having the interpreter start and then immediately fail
- * when it finds the executable is inaccessible.
- */
- if (bprm->fdpath && get_close_on_exec(fd))
- bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
-
/* Set the unchanging part of bprm->cred */
retval = security_bprm_creds_for_exec(bprm);
if (retval)
@@ -1875,7 +1898,6 @@ out:
if (bprm->point_of_no_return && !fatal_signal_pending(current))
force_fatal_sig(SIGSEGV);
-out_unmark:
sched_mm_cid_after_execve(current);
current->fs->in_exec = 0;
current->in_execve = 0;
@@ -1910,7 +1932,7 @@ static int do_execveat_common(int fd, struct filename *filename,
* further execve() calls fail. */
current->flags &= ~PF_NPROC_EXCEEDED;
- bprm = alloc_bprm(fd, filename);
+ bprm = alloc_bprm(fd, filename, flags);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
@@ -1959,7 +1981,7 @@ static int do_execveat_common(int fd, struct filename *filename,
bprm->argc = 1;
}
- retval = bprm_execve(bprm, fd, filename, flags);
+ retval = bprm_execve(bprm);
out_free:
free_bprm(bprm);
@@ -1984,7 +2006,7 @@ int kernel_execve(const char *kernel_filename,
if (IS_ERR(filename))
return PTR_ERR(filename);
- bprm = alloc_bprm(fd, filename);
+ bprm = alloc_bprm(fd, filename, 0);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
@@ -2019,7 +2041,7 @@ int kernel_execve(const char *kernel_filename,
if (retval < 0)
goto out_free;
- retval = bprm_execve(bprm, fd, filename, 0);
+ retval = bprm_execve(bprm);
out_free:
free_bprm(bprm);
out_ret:
@@ -2165,7 +2187,6 @@ static struct ctl_table fs_exec_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
},
- { }
};
static int __init init_fs_exec_sysctls(void)
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index e918decb3735..0356c88252bd 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -5,42 +5,23 @@
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/bitmap.h>
#include <linux/buffer_head.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
-static const unsigned char free_bit[] = {
- 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,/* 0 ~ 19*/
- 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3,/* 20 ~ 39*/
- 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/* 40 ~ 59*/
- 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,/* 60 ~ 79*/
- 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2,/* 80 ~ 99*/
- 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3,/*100 ~ 119*/
- 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/*120 ~ 139*/
- 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5,/*140 ~ 159*/
- 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2,/*160 ~ 179*/
- 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3,/*180 ~ 199*/
- 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2,/*200 ~ 219*/
- 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4,/*220 ~ 239*/
- 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 /*240 ~ 254*/
-};
-
-static const unsigned char used_bit[] = {
- 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3,/* 0 ~ 19*/
- 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4,/* 20 ~ 39*/
- 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5,/* 40 ~ 59*/
- 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,/* 60 ~ 79*/
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4,/* 80 ~ 99*/
- 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6,/*100 ~ 119*/
- 4, 5, 5, 6, 5, 6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4,/*120 ~ 139*/
- 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,/*140 ~ 159*/
- 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5,/*160 ~ 179*/
- 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5,/*180 ~ 199*/
- 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6,/*200 ~ 219*/
- 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,/*220 ~ 239*/
- 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8 /*240 ~ 255*/
-};
+#if BITS_PER_LONG == 32
+#define __le_long __le32
+#define lel_to_cpu(A) le32_to_cpu(A)
+#define cpu_to_lel(A) cpu_to_le32(A)
+#elif BITS_PER_LONG == 64
+#define __le_long __le64
+#define lel_to_cpu(A) le64_to_cpu(A)
+#define cpu_to_lel(A) cpu_to_le64(A)
+#else
+#error "BITS_PER_LONG not 32 or 64"
+#endif
/*
* Allocation Bitmap Management Functions
@@ -200,32 +181,35 @@ unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu)
{
unsigned int i, map_i, map_b, ent_idx;
unsigned int clu_base, clu_free;
- unsigned char k, clu_mask;
+ unsigned long clu_bits, clu_mask;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ __le_long bitval;
WARN_ON(clu < EXFAT_FIRST_CLUSTER);
- ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
- clu_base = BITMAP_ENT_TO_CLUSTER(ent_idx & ~(BITS_PER_BYTE_MASK));
+ ent_idx = ALIGN_DOWN(CLUSTER_TO_BITMAP_ENT(clu), BITS_PER_LONG);
+ clu_base = BITMAP_ENT_TO_CLUSTER(ent_idx);
clu_mask = IGNORED_BITS_REMAINED(clu, clu_base);
map_i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
map_b = BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent_idx);
for (i = EXFAT_FIRST_CLUSTER; i < sbi->num_clusters;
- i += BITS_PER_BYTE) {
- k = *(sbi->vol_amap[map_i]->b_data + map_b);
+ i += BITS_PER_LONG) {
+ bitval = *(__le_long *)(sbi->vol_amap[map_i]->b_data + map_b);
if (clu_mask > 0) {
- k |= clu_mask;
+ bitval |= cpu_to_lel(clu_mask);
clu_mask = 0;
}
- if (k < 0xFF) {
- clu_free = clu_base + free_bit[k];
+ if (lel_to_cpu(bitval) != ULONG_MAX) {
+ clu_bits = lel_to_cpu(bitval);
+ clu_free = clu_base + ffz(clu_bits);
if (clu_free < sbi->num_clusters)
return clu_free;
}
- clu_base += BITS_PER_BYTE;
+ clu_base += BITS_PER_LONG;
+ map_b += sizeof(long);
- if (++map_b >= sb->s_blocksize ||
+ if (map_b >= sb->s_blocksize ||
clu_base >= sbi->num_clusters) {
if (++map_i >= sbi->map_sectors) {
clu_base = EXFAT_FIRST_CLUSTER;
@@ -244,25 +228,24 @@ int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count)
unsigned int count = 0;
unsigned int i, map_i = 0, map_b = 0;
unsigned int total_clus = EXFAT_DATA_CLUSTER_COUNT(sbi);
- unsigned int last_mask = total_clus & BITS_PER_BYTE_MASK;
- unsigned char clu_bits;
- const unsigned char last_bit_mask[] = {0, 0b00000001, 0b00000011,
- 0b00000111, 0b00001111, 0b00011111, 0b00111111, 0b01111111};
+ unsigned int last_mask = total_clus & (BITS_PER_LONG - 1);
+ unsigned long *bitmap, clu_bits;
total_clus &= ~last_mask;
- for (i = 0; i < total_clus; i += BITS_PER_BYTE) {
- clu_bits = *(sbi->vol_amap[map_i]->b_data + map_b);
- count += used_bit[clu_bits];
- if (++map_b >= (unsigned int)sb->s_blocksize) {
+ for (i = 0; i < total_clus; i += BITS_PER_LONG) {
+ bitmap = (void *)(sbi->vol_amap[map_i]->b_data + map_b);
+ count += hweight_long(*bitmap);
+ map_b += sizeof(long);
+ if (map_b >= (unsigned int)sb->s_blocksize) {
map_i++;
map_b = 0;
}
}
if (last_mask) {
- clu_bits = *(sbi->vol_amap[map_i]->b_data + map_b);
- clu_bits &= last_bit_mask[last_mask];
- count += used_bit[clu_bits];
+ bitmap = (void *)(sbi->vol_amap[map_i]->b_data + map_b);
+ clu_bits = lel_to_cpu(*(__le_long *)bitmap);
+ count += hweight_long(clu_bits & BITMAP_LAST_WORD_MASK(last_mask));
}
*ret_count = count;
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index a7a2c35d74fb..361595433480 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -135,8 +135,7 @@ enum {
#define BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent) (ent & BITS_PER_SECTOR_MASK(sb))
#define BITMAP_OFFSET_BYTE_IN_SECTOR(sb, ent) \
((ent / BITS_PER_BYTE) & ((sb)->s_blocksize - 1))
-#define BITS_PER_BYTE_MASK 0x7
-#define IGNORED_BITS_REMAINED(clu, clu_base) ((1 << ((clu) - (clu_base))) - 1)
+#define IGNORED_BITS_REMAINED(clu, clu_base) ((1UL << ((clu) - (clu_base))) - 1)
#define ES_ENTRY_NUM(name_len) (ES_IDX_LAST_FILENAME(name_len) + 1)
/* 19 entries = 1 file entry + 1 stream entry + 17 filename entries */
@@ -208,6 +207,7 @@ struct exfat_dir_entry {
unsigned char flags;
unsigned short attr;
loff_t size;
+ loff_t valid_size;
unsigned int num_subdirs;
struct timespec64 atime;
struct timespec64 mtime;
@@ -275,6 +275,7 @@ struct exfat_sb_info {
spinlock_t inode_hash_lock;
struct hlist_head inode_hashtable[EXFAT_HASH_SIZE];
+ struct rcu_head rcu;
};
#define EXFAT_CACHE_VALID 0
@@ -317,6 +318,7 @@ struct exfat_inode_info {
loff_t i_size_aligned;
/* on-disk position of directory entry or 0 */
loff_t i_pos;
+ loff_t valid_size;
/* hash by i_location */
struct hlist_node i_hash_fat;
/* protect bmap against truncate */
diff --git a/fs/exfat/file.c b/fs/exfat/file.c
index bfdfafe00993..cc00f1a7a1e1 100644
--- a/fs/exfat/file.c
+++ b/fs/exfat/file.c
@@ -11,37 +11,83 @@
#include <linux/fsnotify.h>
#include <linux/security.h>
#include <linux/msdos_fs.h>
+#include <linux/writeback.h>
#include "exfat_raw.h"
#include "exfat_fs.h"
static int exfat_cont_expand(struct inode *inode, loff_t size)
{
- struct address_space *mapping = inode->i_mapping;
- loff_t start = i_size_read(inode), count = size - i_size_read(inode);
- int err, err2;
+ int ret;
+ unsigned int num_clusters, new_num_clusters, last_clu;
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct super_block *sb = inode->i_sb;
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct exfat_chain clu;
- err = generic_cont_expand_simple(inode, size);
- if (err)
- return err;
+ ret = inode_newsize_ok(inode, size);
+ if (ret)
+ return ret;
+
+ num_clusters = EXFAT_B_TO_CLU_ROUND_UP(ei->i_size_ondisk, sbi);
+ new_num_clusters = EXFAT_B_TO_CLU_ROUND_UP(size, sbi);
+
+ if (new_num_clusters == num_clusters)
+ goto out;
+
+ if (num_clusters) {
+ exfat_chain_set(&clu, ei->start_clu, num_clusters, ei->flags);
+ ret = exfat_find_last_cluster(sb, &clu, &last_clu);
+ if (ret)
+ return ret;
+
+ clu.dir = last_clu + 1;
+ } else {
+ last_clu = EXFAT_EOF_CLUSTER;
+ clu.dir = EXFAT_EOF_CLUSTER;
+ }
+
+ clu.size = 0;
+ clu.flags = ei->flags;
+
+ ret = exfat_alloc_cluster(inode, new_num_clusters - num_clusters,
+ &clu, IS_DIRSYNC(inode));
+ if (ret)
+ return ret;
+
+ /* Append new clusters to chain */
+ if (num_clusters) {
+ if (clu.flags != ei->flags)
+ if (exfat_chain_cont_cluster(sb, ei->start_clu, num_clusters))
+ goto free_clu;
+
+ if (clu.flags == ALLOC_FAT_CHAIN)
+ if (exfat_ent_set(sb, last_clu, clu.dir))
+ goto free_clu;
+ } else
+ ei->start_clu = clu.dir;
+
+ ei->flags = clu.flags;
+out:
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
- mark_inode_dirty(inode);
+ /* Expanded range not zeroed, do not update valid_size */
+ i_size_write(inode, size);
- if (!IS_SYNC(inode))
- return 0;
+ ei->i_size_aligned = round_up(size, sb->s_blocksize);
+ ei->i_size_ondisk = ei->i_size_aligned;
+ inode->i_blocks = round_up(size, sbi->cluster_size) >> 9;
- err = filemap_fdatawrite_range(mapping, start, start + count - 1);
- err2 = sync_mapping_buffers(mapping);
- if (!err)
- err = err2;
- err2 = write_inode_now(inode, 1);
- if (!err)
- err = err2;
- if (err)
- return err;
+ if (IS_DIRSYNC(inode))
+ return write_inode_now(inode, 1);
+
+ mark_inode_dirty(inode);
+
+ return 0;
- return filemap_fdatawait_range(mapping, start, start + count - 1);
+free_clu:
+ exfat_free_cluster(inode, &clu);
+ return -EIO;
}
static bool exfat_allow_set_time(struct exfat_sb_info *sbi, struct inode *inode)
@@ -146,6 +192,9 @@ int __exfat_truncate(struct inode *inode)
ei->start_clu = EXFAT_EOF_CLUSTER;
}
+ if (i_size_read(inode) < ei->valid_size)
+ ei->valid_size = i_size_read(inode);
+
if (ei->type == TYPE_FILE)
ei->attr |= EXFAT_ATTR_ARCHIVE;
@@ -474,15 +523,124 @@ int exfat_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
return blkdev_issue_flush(inode->i_sb->s_bdev);
}
+static int exfat_file_zeroed_range(struct file *file, loff_t start, loff_t end)
+{
+ int err;
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ const struct address_space_operations *ops = mapping->a_ops;
+
+ while (start < end) {
+ u32 zerofrom, len;
+ struct page *page = NULL;
+
+ zerofrom = start & (PAGE_SIZE - 1);
+ len = PAGE_SIZE - zerofrom;
+ if (start + len > end)
+ len = end - start;
+
+ err = ops->write_begin(file, mapping, start, len, &page, NULL);
+ if (err)
+ goto out;
+
+ zero_user_segment(page, zerofrom, zerofrom + len);
+
+ err = ops->write_end(file, mapping, start, len, len, page, NULL);
+ if (err < 0)
+ goto out;
+ start += len;
+
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ }
+
+out:
+ return err;
+}
+
+static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ ssize_t ret;
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ loff_t pos = iocb->ki_pos;
+ loff_t valid_size;
+
+ inode_lock(inode);
+
+ valid_size = ei->valid_size;
+
+ ret = generic_write_checks(iocb, iter);
+ if (ret < 0)
+ goto unlock;
+
+ if (pos > valid_size) {
+ ret = exfat_file_zeroed_range(file, valid_size, pos);
+ if (ret < 0 && ret != -ENOSPC) {
+ exfat_err(inode->i_sb,
+ "write: fail to zero from %llu to %llu(%zd)",
+ valid_size, pos, ret);
+ }
+ if (ret < 0)
+ goto unlock;
+ }
+
+ ret = __generic_file_write_iter(iocb, iter);
+ if (ret < 0)
+ goto unlock;
+
+ inode_unlock(inode);
+
+ if (pos > valid_size)
+ pos = valid_size;
+
+ if (iocb_is_dsync(iocb) && iocb->ki_pos > pos) {
+ ssize_t err = vfs_fsync_range(file, pos, iocb->ki_pos - 1,
+ iocb->ki_flags & IOCB_SYNC);
+ if (err < 0)
+ return err;
+ }
+
+ return ret;
+
+unlock:
+ inode_unlock(inode);
+
+ return ret;
+}
+
+static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ int ret;
+ struct inode *inode = file_inode(file);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ loff_t start = ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ loff_t end = min_t(loff_t, i_size_read(inode),
+ start + vma->vm_end - vma->vm_start);
+
+ if ((vma->vm_flags & VM_WRITE) && ei->valid_size < end) {
+ ret = exfat_file_zeroed_range(file, ei->valid_size, end);
+ if (ret < 0) {
+ exfat_err(inode->i_sb,
+ "mmap: fail to zero from %llu to %llu(%d)",
+ start, end, ret);
+ return ret;
+ }
+ }
+
+ return generic_file_mmap(file, vma);
+}
+
const struct file_operations exfat_file_operations = {
.llseek = generic_file_llseek,
.read_iter = generic_file_read_iter,
- .write_iter = generic_file_write_iter,
+ .write_iter = exfat_file_write_iter,
.unlocked_ioctl = exfat_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = exfat_compat_ioctl,
#endif
- .mmap = generic_file_mmap,
+ .mmap = exfat_file_mmap,
.fsync = exfat_file_fsync,
.splice_read = filemap_splice_read,
.splice_write = iter_file_splice_write,
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index e7ff58b8e68c..0687f952956c 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -75,8 +75,17 @@ int __exfat_write_inode(struct inode *inode, int sync)
if (ei->start_clu == EXFAT_EOF_CLUSTER)
on_disk_size = 0;
- ep2->dentry.stream.valid_size = cpu_to_le64(on_disk_size);
- ep2->dentry.stream.size = ep2->dentry.stream.valid_size;
+ ep2->dentry.stream.size = cpu_to_le64(on_disk_size);
+ /*
+ * mmap write does not use exfat_write_end(), valid_size may be
+ * extended to the sector-aligned length in exfat_get_block().
+ * So we need to fixup valid_size to the writren length.
+ */
+ if (on_disk_size < ei->valid_size)
+ ep2->dentry.stream.valid_size = ep2->dentry.stream.size;
+ else
+ ep2->dentry.stream.valid_size = cpu_to_le64(ei->valid_size);
+
if (on_disk_size) {
ep2->dentry.stream.flags = ei->flags;
ep2->dentry.stream.start_clu = cpu_to_le32(ei->start_clu);
@@ -278,6 +287,7 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
unsigned int cluster, sec_offset;
sector_t last_block;
sector_t phys = 0;
+ sector_t valid_blks;
loff_t pos;
mutex_lock(&sbi->s_lock);
@@ -306,17 +316,32 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
mapped_blocks = sbi->sect_per_clus - sec_offset;
max_blocks = min(mapped_blocks, max_blocks);
- /* Treat newly added block / cluster */
- if (iblock < last_block)
- create = 0;
-
- if (create || buffer_delay(bh_result)) {
- pos = EXFAT_BLK_TO_B((iblock + 1), sb);
+ pos = EXFAT_BLK_TO_B((iblock + 1), sb);
+ if ((create && iblock >= last_block) || buffer_delay(bh_result)) {
if (ei->i_size_ondisk < pos)
ei->i_size_ondisk = pos;
}
+ map_bh(bh_result, sb, phys);
+ if (buffer_delay(bh_result))
+ clear_buffer_delay(bh_result);
+
if (create) {
+ valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb);
+
+ if (iblock + max_blocks < valid_blks) {
+ /* The range has been written, map it */
+ goto done;
+ } else if (iblock < valid_blks) {
+ /*
+ * The range has been partially written,
+ * map the written part.
+ */
+ max_blocks = valid_blks - iblock;
+ goto done;
+ }
+
+ /* The area has not been written, map and mark as new. */
err = exfat_map_new_buffer(ei, bh_result, pos);
if (err) {
exfat_fs_error(sb,
@@ -324,11 +349,58 @@ static int exfat_get_block(struct inode *inode, sector_t iblock,
pos, ei->i_size_aligned);
goto unlock_ret;
}
- }
- if (buffer_delay(bh_result))
- clear_buffer_delay(bh_result);
- map_bh(bh_result, sb, phys);
+ ei->valid_size = EXFAT_BLK_TO_B(iblock + max_blocks, sb);
+ mark_inode_dirty(inode);
+ } else {
+ valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb);
+
+ if (iblock + max_blocks < valid_blks) {
+ /* The range has been written, map it */
+ goto done;
+ } else if (iblock < valid_blks) {
+ /*
+ * The area has been partially written,
+ * map the written part.
+ */
+ max_blocks = valid_blks - iblock;
+ goto done;
+ } else if (iblock == valid_blks &&
+ (ei->valid_size & (sb->s_blocksize - 1))) {
+ /*
+ * The block has been partially written,
+ * zero the unwritten part and map the block.
+ */
+ loff_t size, off;
+
+ max_blocks = 1;
+
+ /*
+ * For direct read, the unwritten part will be zeroed in
+ * exfat_direct_IO()
+ */
+ if (!bh_result->b_folio)
+ goto done;
+
+ pos -= sb->s_blocksize;
+ size = ei->valid_size - pos;
+ off = pos & (PAGE_SIZE - 1);
+
+ folio_set_bh(bh_result, bh_result->b_folio, off);
+ err = bh_read(bh_result, 0);
+ if (err < 0)
+ goto unlock_ret;
+
+ folio_zero_segment(bh_result->b_folio, off + size,
+ off + sb->s_blocksize);
+ } else {
+ /*
+ * The range has not been written, clear the mapped flag
+ * to only zero the cache and do not read from disk.
+ */
+ clear_buffer_mapped(bh_result);
+ }
+ }
done:
bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb);
unlock_ret:
@@ -343,6 +415,17 @@ static int exfat_read_folio(struct file *file, struct folio *folio)
static void exfat_readahead(struct readahead_control *rac)
{
+ struct address_space *mapping = rac->mapping;
+ struct inode *inode = mapping->host;
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ loff_t pos = readahead_pos(rac);
+
+ /* Range cross valid_size, read it page by page. */
+ if (ei->valid_size < i_size_read(inode) &&
+ pos <= ei->valid_size &&
+ ei->valid_size < pos + readahead_length(rac))
+ return;
+
mpage_readahead(rac, exfat_get_block);
}
@@ -370,9 +453,7 @@ static int exfat_write_begin(struct file *file, struct address_space *mapping,
int ret;
*pagep = NULL;
- ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata,
- exfat_get_block,
- &EXFAT_I(mapping->host)->i_size_ondisk);
+ ret = block_write_begin(mapping, pos, len, pagep, exfat_get_block);
if (ret < 0)
exfat_write_failed(mapping, pos+len);
@@ -400,6 +481,11 @@ static int exfat_write_end(struct file *file, struct address_space *mapping,
if (err < len)
exfat_write_failed(mapping, pos+len);
+ if (!(err < 0) && pos + err > ei->valid_size) {
+ ei->valid_size = pos + err;
+ mark_inode_dirty(inode);
+ }
+
if (!(err < 0) && !(ei->attr & EXFAT_ATTR_ARCHIVE)) {
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
ei->attr |= EXFAT_ATTR_ARCHIVE;
@@ -413,7 +499,9 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = mapping->host;
- loff_t size = iocb->ki_pos + iov_iter_count(iter);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ loff_t pos = iocb->ki_pos;
+ loff_t size = pos + iov_iter_count(iter);
int rw = iov_iter_rw(iter);
ssize_t ret;
@@ -436,8 +524,20 @@ static ssize_t exfat_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
* condition of exfat_get_block() and ->truncate().
*/
ret = blockdev_direct_IO(iocb, inode, iter, exfat_get_block);
- if (ret < 0 && (rw & WRITE))
- exfat_write_failed(mapping, size);
+ if (ret < 0) {
+ if (rw == WRITE && ret != -EIOCBQUEUED)
+ exfat_write_failed(mapping, size);
+
+ return ret;
+ } else
+ size = pos + ret;
+
+ /* zero the unwritten part in the partially written block */
+ if (rw == READ && pos < ei->valid_size && ei->valid_size < size) {
+ iov_iter_revert(iter, size - ei->valid_size);
+ iov_iter_zero(size - ei->valid_size, iter);
+ }
+
return ret;
}
@@ -537,6 +637,7 @@ static int exfat_fill_inode(struct inode *inode, struct exfat_dir_entry *info)
ei->start_clu = info->start_clu;
ei->flags = info->flags;
ei->type = info->type;
+ ei->valid_size = info->valid_size;
ei->version = 0;
ei->hint_stat.eidx = 0;
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 5d737e0b639a..9c549fd11fc8 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -406,6 +406,7 @@ static int exfat_find_empty_entry(struct inode *inode,
i_size_write(inode, size);
ei->i_size_ondisk += sbi->cluster_size;
ei->i_size_aligned += sbi->cluster_size;
+ ei->valid_size += sbi->cluster_size;
ei->flags = p_dir->flags;
inode->i_blocks += sbi->cluster_size >> 9;
}
@@ -558,6 +559,8 @@ static int exfat_add_entry(struct inode *inode, const char *path,
info->size = clu_size;
info->num_subdirs = EXFAT_MIN_SUBDIR;
}
+ info->valid_size = info->size;
+
memset(&info->crtime, 0, sizeof(info->crtime));
memset(&info->mtime, 0, sizeof(info->mtime));
memset(&info->atime, 0, sizeof(info->atime));
@@ -660,6 +663,8 @@ static int exfat_find(struct inode *dir, struct qstr *qname,
info->type = exfat_get_entry_type(ep);
info->attr = le16_to_cpu(ep->dentry.file.attr);
info->size = le64_to_cpu(ep2->dentry.stream.valid_size);
+ info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size);
+ info->size = le64_to_cpu(ep2->dentry.stream.size);
if (info->size == 0) {
info->flags = ALLOC_NO_FAT_CHAIN;
info->start_clu = EXFAT_EOF_CLUSTER;
@@ -1288,6 +1293,7 @@ static int __exfat_rename(struct inode *old_parent_inode,
}
i_size_write(new_inode, 0);
+ new_ei->valid_size = 0;
new_ei->start_clu = EXFAT_EOF_CLUSTER;
new_ei->flags = ALLOC_NO_FAT_CHAIN;
}
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index 705710f93e2d..afdf13c34ff5 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -655,7 +655,6 @@ static int exfat_load_upcase_table(struct super_block *sb,
unsigned int sect_size = sb->s_blocksize;
unsigned int i, index = 0;
u32 chksum = 0;
- int ret;
unsigned char skip = false;
unsigned short *upcase_table;
@@ -673,8 +672,7 @@ static int exfat_load_upcase_table(struct super_block *sb,
if (!bh) {
exfat_err(sb, "failed to read sector(0x%llx)",
(unsigned long long)sector);
- ret = -EIO;
- goto free_table;
+ return -EIO;
}
sector++;
for (i = 0; i < sect_size && index <= 0xFFFF; i += 2) {
@@ -701,15 +699,12 @@ static int exfat_load_upcase_table(struct super_block *sb,
exfat_err(sb, "failed to load upcase table (idx : 0x%08x, chksum : 0x%08x, utbl_chksum : 0x%08x)",
index, chksum, utbl_checksum);
- ret = -EINVAL;
-free_table:
- exfat_free_upcase_table(sbi);
- return ret;
+ return -EINVAL;
}
static int exfat_load_default_upcase_table(struct super_block *sb)
{
- int i, ret = -EIO;
+ int i;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
unsigned char skip = false;
unsigned short uni = 0, *upcase_table;
@@ -740,8 +735,7 @@ static int exfat_load_default_upcase_table(struct super_block *sb)
return 0;
/* FATAL error: default upcase table has error */
- exfat_free_upcase_table(sbi);
- return ret;
+ return -EIO;
}
int exfat_create_upcase_table(struct super_block *sb)
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index d9d4fa91010b..fcb658267765 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -39,9 +39,6 @@ static void exfat_put_super(struct super_block *sb)
exfat_free_bitmap(sbi);
brelse(sbi->boot_bh);
mutex_unlock(&sbi->s_lock);
-
- unload_nls(sbi->nls_io);
- exfat_free_upcase_table(sbi);
}
static int exfat_sync_fs(struct super_block *sb, int wait)
@@ -600,7 +597,7 @@ static int __exfat_fill_super(struct super_block *sb)
ret = exfat_load_bitmap(sb);
if (ret) {
exfat_err(sb, "failed to load alloc-bitmap");
- goto free_upcase_table;
+ goto free_bh;
}
ret = exfat_count_used_clusters(sb, &sbi->used_clusters);
@@ -613,8 +610,6 @@ static int __exfat_fill_super(struct super_block *sb)
free_alloc_bitmap:
exfat_free_bitmap(sbi);
-free_upcase_table:
- exfat_free_upcase_table(sbi);
free_bh:
brelse(sbi->boot_bh);
return ret;
@@ -701,12 +696,10 @@ put_inode:
sb->s_root = NULL;
free_table:
- exfat_free_upcase_table(sbi);
exfat_free_bitmap(sbi);
brelse(sbi->boot_bh);
check_nls_io:
- unload_nls(sbi->nls_io);
return err;
}
@@ -771,13 +764,22 @@ static int exfat_init_fs_context(struct fs_context *fc)
return 0;
}
+static void delayed_free(struct rcu_head *p)
+{
+ struct exfat_sb_info *sbi = container_of(p, struct exfat_sb_info, rcu);
+
+ unload_nls(sbi->nls_io);
+ exfat_free_upcase_table(sbi);
+ exfat_free_sbi(sbi);
+}
+
static void exfat_kill_sb(struct super_block *sb)
{
struct exfat_sb_info *sbi = sb->s_fs_info;
kill_block_super(sb);
if (sbi)
- exfat_free_sbi(sbi);
+ call_rcu(&sbi->rcu, delayed_free);
}
static struct file_system_type exfat_fs_type = {
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 464faf6c217e..5a4272b2c6b0 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -969,7 +969,7 @@ const struct address_space_operations ext2_aops = {
.writepages = ext2_writepages,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
static const struct address_space_operations ext2_dax_aops = {
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 65f702b1da5b..8346ab9534c1 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -325,6 +325,7 @@ static int ext2_rename (struct mnt_idmap * idmap,
struct ext2_dir_entry_2 * dir_de = NULL;
struct folio * old_folio;
struct ext2_dir_entry_2 * old_de;
+ bool old_is_dir = S_ISDIR(old_inode->i_mode);
int err;
if (flags & ~RENAME_NOREPLACE)
@@ -342,7 +343,7 @@ static int ext2_rename (struct mnt_idmap * idmap,
if (IS_ERR(old_de))
return PTR_ERR(old_de);
- if (S_ISDIR(old_inode->i_mode)) {
+ if (old_is_dir && old_dir != new_dir) {
err = -EIO;
dir_de = ext2_dotdot(old_inode, &dir_folio);
if (!dir_de)
@@ -354,7 +355,7 @@ static int ext2_rename (struct mnt_idmap * idmap,
struct ext2_dir_entry_2 *new_de;
err = -ENOTEMPTY;
- if (dir_de && !ext2_empty_dir (new_inode))
+ if (old_is_dir && !ext2_empty_dir(new_inode))
goto out_dir;
new_de = ext2_find_entry(new_dir, &new_dentry->d_name,
@@ -368,14 +369,14 @@ static int ext2_rename (struct mnt_idmap * idmap,
if (err)
goto out_dir;
inode_set_ctime_current(new_inode);
- if (dir_de)
+ if (old_is_dir)
drop_nlink(new_inode);
inode_dec_link_count(new_inode);
} else {
err = ext2_add_link(new_dentry, old_inode);
if (err)
goto out_dir;
- if (dir_de)
+ if (old_is_dir)
inode_inc_link_count(new_dir);
}
@@ -387,7 +388,7 @@ static int ext2_rename (struct mnt_idmap * idmap,
mark_inode_dirty(old_inode);
err = ext2_delete_entry(old_de, old_folio);
- if (!err && dir_de) {
+ if (!err && old_is_dir) {
if (old_dir != new_dir)
err = ext2_set_link(old_inode, dir_de, dir_folio,
new_dir, false);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a5d784872303..023571f8dd1b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -252,8 +252,10 @@ struct ext4_allocation_request {
#define EXT4_MAP_MAPPED BIT(BH_Mapped)
#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY BIT(BH_Boundary)
+#define EXT4_MAP_DELAYED BIT(BH_Delay)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
- EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY)
+ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+ EXT4_MAP_DELAYED)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
@@ -2912,10 +2914,10 @@ extern const struct seq_operations ext4_mb_seq_groups_ops;
extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
-extern int ext4_mb_release(struct super_block *);
+extern void ext4_mb_release(struct super_block *);
extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *,
struct ext4_allocation_request *, int *);
-extern void ext4_discard_preallocations(struct inode *, unsigned int);
+extern void ext4_discard_preallocations(struct inode *);
extern int __init ext4_init_mballoc(void);
extern void ext4_exit_mballoc(void);
extern ext4_group_t ext4_mb_prefetch(struct super_block *sb,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index d1a2e6624401..5d8055161acd 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -235,8 +235,6 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
might_sleep();
- ext4_check_bdev_write_error(sb);
-
if (ext4_handle_valid(handle)) {
err = jbd2_journal_get_write_access(handle, bh);
if (err) {
@@ -244,7 +242,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line,
handle, err);
return err;
}
- }
+ } else
+ ext4_check_bdev_write_error(sb);
if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb))
return 0;
BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d5efe076d3d3..7669d154c05e 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -100,7 +100,7 @@ static int ext4_ext_trunc_restart_fn(struct inode *inode, int *dropped)
* i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
@@ -2229,7 +2229,7 @@ static int ext4_fill_es_cache_info(struct inode *inode,
/*
- * ext4_ext_determine_hole - determine hole around given block
+ * ext4_ext_find_hole - find hole around given block according to the given path
* @inode: inode we lookup in
* @path: path in extent tree to @lblk
* @lblk: pointer to logical block around which we want to determine hole
@@ -2241,9 +2241,9 @@ static int ext4_fill_es_cache_info(struct inode *inode,
* The function returns the length of a hole starting at @lblk. We update @lblk
* to the beginning of the hole if we managed to find it.
*/
-static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
- struct ext4_ext_path *path,
- ext4_lblk_t *lblk)
+static ext4_lblk_t ext4_ext_find_hole(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t *lblk)
{
int depth = ext_depth(inode);
struct ext4_extent *ex;
@@ -2271,30 +2271,6 @@ static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
}
/*
- * ext4_ext_put_gap_in_cache:
- * calculate boundaries of the gap that the requested block fits into
- * and cache this gap
- */
-static void
-ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
- ext4_lblk_t hole_len)
-{
- struct extent_status es;
-
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
- hole_start + hole_len - 1, &es);
- if (es.es_len) {
- /* There's delayed extent containing lblock? */
- if (es.es_lblk <= hole_start)
- return;
- hole_len = min(es.es_lblk - hole_start, hole_len);
- }
- ext_debug(inode, " -> %u:%u\n", hole_start, hole_len);
- ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
- EXTENT_STATUS_HOLE);
-}
-
-/*
* ext4_ext_rm_idx:
* removes index from the index block.
*/
@@ -4062,6 +4038,72 @@ static int get_implied_cluster_alloc(struct super_block *sb,
return 0;
}
+/*
+ * Determine hole length around the given logical block, first try to
+ * locate and expand the hole from the given @path, and then adjust it
+ * if it's partially or completely converted to delayed extents, insert
+ * it into the extent cache tree if it's indeed a hole, finally return
+ * the length of the determined extent.
+ */
+static ext4_lblk_t ext4_ext_determine_insert_hole(struct inode *inode,
+ struct ext4_ext_path *path,
+ ext4_lblk_t lblk)
+{
+ ext4_lblk_t hole_start, len;
+ struct extent_status es;
+
+ hole_start = lblk;
+ len = ext4_ext_find_hole(inode, path, &hole_start);
+again:
+ ext4_es_find_extent_range(inode, &ext4_es_is_delayed, hole_start,
+ hole_start + len - 1, &es);
+ if (!es.es_len)
+ goto insert_hole;
+
+ /*
+ * There's a delalloc extent in the hole, handle it if the delalloc
+ * extent is in front of, behind and straddle the queried range.
+ */
+ if (lblk >= es.es_lblk + es.es_len) {
+ /*
+ * The delalloc extent is in front of the queried range,
+ * find again from the queried start block.
+ */
+ len -= lblk - hole_start;
+ hole_start = lblk;
+ goto again;
+ } else if (in_range(lblk, es.es_lblk, es.es_len)) {
+ /*
+ * The delalloc extent containing lblk, it must have been
+ * added after ext4_map_blocks() checked the extent status
+ * tree so we are not holding i_rwsem and delalloc info is
+ * only stabilized by i_data_sem we are going to release
+ * soon. Don't modify the extent status tree and report
+ * extent as a hole, just adjust the length to the delalloc
+ * extent's after lblk.
+ */
+ len = es.es_lblk + es.es_len - lblk;
+ return len;
+ } else {
+ /*
+ * The delalloc extent is partially or completely behind
+ * the queried range, update hole length until the
+ * beginning of the delalloc extent.
+ */
+ len = min(es.es_lblk - hole_start, len);
+ }
+
+insert_hole:
+ /* Put just found gap into cache to speed up subsequent requests */
+ ext_debug(inode, " -> %u:%u\n", hole_start, len);
+ ext4_es_insert_extent(inode, hole_start, len, ~0, EXTENT_STATUS_HOLE);
+
+ /* Update hole_len to reflect hole size after lblk */
+ if (hole_start != lblk)
+ len -= lblk - hole_start;
+
+ return len;
+}
/*
* Block allocation/map/preallocation routine for extents based files
@@ -4179,22 +4221,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
* we couldn't try to create block if create flag is zero
*/
if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
- ext4_lblk_t hole_start, hole_len;
+ ext4_lblk_t len;
- hole_start = map->m_lblk;
- hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
- /*
- * put just found gap into cache to speed up
- * subsequent requests
- */
- ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
+ len = ext4_ext_determine_insert_hole(inode, path, map->m_lblk);
- /* Update hole_len to reflect hole size after map->m_lblk */
- if (hole_start != map->m_lblk)
- hole_len -= map->m_lblk - hole_start;
map->m_pblk = 0;
- map->m_len = min_t(unsigned int, map->m_len, hole_len);
-
+ map->m_len = min_t(unsigned int, map->m_len, len);
goto out;
}
@@ -4313,7 +4345,7 @@ got_allocated_blocks:
* not a good idea to call discard here directly,
* but otherwise we'd need to call it every free().
*/
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
fb_flags = EXT4_FREE_BLOCKS_NO_QUOT_UPDATE;
ext4_free_blocks(handle, inode, NULL, newblock,
@@ -4523,7 +4555,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
* Round up offset. This is not fallocate, we need to zero out
* blocks, so convert interior block aligned part of the range to
* unwritten and possibly manually zero out unaligned parts of the
- * range.
+ * range. Here, start and partial_begin are inclusive, end and
+ * partial_end are exclusive.
*/
start = round_up(offset, 1 << blkbits);
end = round_down((offset + len), 1 << blkbits);
@@ -4609,7 +4642,8 @@ static long ext4_zero_range(struct file *file, loff_t offset,
* disk in case of crash before zeroing trans is committed.
*/
if (ext4_should_journal_data(inode)) {
- ret = filemap_write_and_wait_range(mapping, start, end);
+ ret = filemap_write_and_wait_range(mapping, start,
+ end - 1);
if (ret) {
filemap_invalidate_unlock(mapping);
goto out_mutex;
@@ -5355,7 +5389,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle);
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);
ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
@@ -5363,7 +5397,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
up_write(&EXT4_I(inode)->i_data_sem);
goto out_stop;
}
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
ret = ext4_ext_shift_extents(inode, handle, punch_stop,
punch_stop - punch_start, SHIFT_LEFT);
@@ -5495,7 +5529,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
goto out_stop;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
path = ext4_find_extent(inode, offset_lblk, NULL, 0);
if (IS_ERR(path)) {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6aa15dafc677..54d6ff22585c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -174,7 +174,7 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
(atomic_read(&inode->i_writecount) == 1) &&
!EXT4_I(inode)->i_reserved_data_blocks) {
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
}
if (is_dx(inode) && filp->private_data)
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index a9f3716119d3..d8ca7f64f952 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -714,7 +714,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode,
* i_rwsem. So we can safely drop the i_data_sem here.
*/
BUG_ON(EXT4_JOURNAL(inode) == NULL);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
up_write(&EXT4_I(inode)->i_data_sem);
*dropped = 1;
return 0;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 9a84a5f9fef4..d5bd1e3a5d36 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -502,9 +502,8 @@ static int ext4_read_inline_folio(struct inode *inode, struct folio *folio)
BUG_ON(len > PAGE_SIZE);
kaddr = kmap_local_folio(folio, 0);
ret = ext4_read_inline_data(inode, kaddr, len, &iloc);
- flush_dcache_folio(folio);
+ kaddr = folio_zero_tail(folio, len, kaddr + len);
kunmap_local(kaddr);
- folio_zero_segment(folio, len, folio_size(folio));
folio_mark_uptodate(folio);
brelse(iloc.bh);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 61277f7f8722..2ccf3b5e3a7c 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -371,7 +371,7 @@ void ext4_da_update_reserve_space(struct inode *inode,
*/
if ((ei->i_reserved_data_blocks == 0) &&
!inode_is_open_for_write(inode))
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
}
static int __check_block_validity(struct inode *inode, const char *func,
@@ -515,6 +515,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
map->m_len = retval;
} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
map->m_pblk = 0;
+ map->m_flags |= ext4_es_is_delayed(&es) ?
+ EXT4_MAP_DELAYED : 0;
retval = es.es_len - (map->m_lblk - es.es_lblk);
if (retval > map->m_len)
retval = map->m_len;
@@ -1261,7 +1263,7 @@ static int write_end_fn(handle_t *handle, struct inode *inode,
* We need to pick up the new inode size which generic_commit_write gave us
* `file' can be NULL - eg, when called from page_symlink().
*
- * ext4 never places buffers on inode->i_mapping->private_list. metadata
+ * ext4 never places buffers on inode->i_mapping->i_private_list. metadata
* buffers are managed internally.
*/
static int ext4_write_end(struct file *file,
@@ -1703,11 +1705,8 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, iblock, NULL, &es)) {
- if (ext4_es_is_hole(&es)) {
- retval = 0;
- down_read(&EXT4_I(inode)->i_data_sem);
+ if (ext4_es_is_hole(&es))
goto add_delayed;
- }
/*
* Delayed extent could be allocated by fallocate.
@@ -1749,26 +1748,11 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
retval = ext4_ext_map_blocks(NULL, inode, map, 0);
else
retval = ext4_ind_map_blocks(NULL, inode, map, 0);
-
-add_delayed:
- if (retval == 0) {
- int ret;
-
- /*
- * XXX: __block_prepare_write() unmaps passed block,
- * is it OK?
- */
-
- ret = ext4_insert_delayed_block(inode, map->m_lblk);
- if (ret != 0) {
- retval = ret;
- goto out_unlock;
- }
-
- map_bh(bh, inode->i_sb, invalid_block);
- set_buffer_new(bh);
- set_buffer_delay(bh);
- } else if (retval > 0) {
+ if (retval < 0) {
+ up_read(&EXT4_I(inode)->i_data_sem);
+ return retval;
+ }
+ if (retval > 0) {
unsigned int status;
if (unlikely(retval != map->m_len)) {
@@ -1783,11 +1767,21 @@ add_delayed:
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
map->m_pblk, status);
+ up_read(&EXT4_I(inode)->i_data_sem);
+ return retval;
}
+ up_read(&EXT4_I(inode)->i_data_sem);
-out_unlock:
- up_read((&EXT4_I(inode)->i_data_sem));
+add_delayed:
+ down_write(&EXT4_I(inode)->i_data_sem);
+ retval = ext4_insert_delayed_block(inode, map->m_lblk);
+ up_write(&EXT4_I(inode)->i_data_sem);
+ if (retval)
+ return retval;
+ map_bh(bh, inode->i_sb, invalid_block);
+ set_buffer_new(bh);
+ set_buffer_delay(bh);
return retval;
}
@@ -2947,7 +2941,7 @@ static int ext4_da_should_update_i_disksize(struct folio *folio,
static int ext4_da_do_write_end(struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
- struct page *page)
+ struct folio *folio)
{
struct inode *inode = mapping->host;
loff_t old_size = inode->i_size;
@@ -2958,12 +2952,13 @@ static int ext4_da_do_write_end(struct address_space *mapping,
* block_write_end() will mark the inode as dirty with I_DIRTY_PAGES
* flag, which all that's needed to trigger page writeback.
*/
- copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL);
+ copied = block_write_end(NULL, mapping, pos, len, copied,
+ &folio->page, NULL);
new_i_size = pos + copied;
/*
- * It's important to update i_size while still holding page lock,
- * because page writeout could otherwise come in and zero beyond
+ * It's important to update i_size while still holding folio lock,
+ * because folio writeout could otherwise come in and zero beyond
* i_size.
*
* Since we are holding inode lock, we are sure i_disksize <=
@@ -2981,14 +2976,14 @@ static int ext4_da_do_write_end(struct address_space *mapping,
i_size_write(inode, new_i_size);
end = (new_i_size - 1) & (PAGE_SIZE - 1);
- if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) {
+ if (copied && ext4_da_should_update_i_disksize(folio, end)) {
ext4_update_i_disksize(inode, new_i_size);
disksize_changed = true;
}
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
if (old_size < pos)
pagecache_isize_extended(inode, old_size, pos);
@@ -3027,10 +3022,10 @@ static int ext4_da_write_end(struct file *file,
return ext4_write_inline_data_end(inode, pos, len, copied,
folio);
- if (unlikely(copied < len) && !PageUptodate(page))
+ if (unlikely(copied < len) && !folio_test_uptodate(folio))
copied = 0;
- return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page);
+ return ext4_da_do_write_end(mapping, pos, len, copied, folio);
}
/*
@@ -3213,7 +3208,7 @@ static bool ext4_inode_datasync_dirty(struct inode *inode)
}
/* Any metadata buffers to write? */
- if (!list_empty(&inode->i_mapping->private_list))
+ if (!list_empty(&inode->i_mapping->i_private_list))
return true;
return inode->i_state & I_DIRTY_DATASYNC;
}
@@ -3267,6 +3262,9 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
iomap->addr = (u64) map->m_pblk << blkbits;
if (flags & IOMAP_DAX)
iomap->addr += EXT4_SB(inode->i_sb)->s_dax_part_off;
+ } else if (map->m_flags & EXT4_MAP_DELAYED) {
+ iomap->type = IOMAP_DELALLOC;
+ iomap->addr = IOMAP_NULL_ADDR;
} else {
iomap->type = IOMAP_HOLE;
iomap->addr = IOMAP_NULL_ADDR;
@@ -3429,35 +3427,11 @@ const struct iomap_ops ext4_iomap_overwrite_ops = {
.iomap_end = ext4_iomap_end,
};
-static bool ext4_iomap_is_delalloc(struct inode *inode,
- struct ext4_map_blocks *map)
-{
- struct extent_status es;
- ext4_lblk_t offset = 0, end = map->m_lblk + map->m_len - 1;
-
- ext4_es_find_extent_range(inode, &ext4_es_is_delayed,
- map->m_lblk, end, &es);
-
- if (!es.es_len || es.es_lblk > end)
- return false;
-
- if (es.es_lblk > map->m_lblk) {
- map->m_len = es.es_lblk - map->m_lblk;
- return false;
- }
-
- offset = map->m_lblk - es.es_lblk;
- map->m_len = es.es_len - offset;
-
- return true;
-}
-
static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
loff_t length, unsigned int flags,
struct iomap *iomap, struct iomap *srcmap)
{
int ret;
- bool delalloc = false;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
@@ -3498,13 +3472,8 @@ static int ext4_iomap_begin_report(struct inode *inode, loff_t offset,
ret = ext4_map_blocks(NULL, inode, &map, 0);
if (ret < 0)
return ret;
- if (ret == 0)
- delalloc = ext4_iomap_is_delalloc(inode, &map);
-
set_iomap:
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
- if (delalloc && iomap->type == IOMAP_HOLE)
- iomap->type = IOMAP_DELALLOC;
return 0;
}
@@ -3564,7 +3533,7 @@ static const struct address_space_operations ext4_aops = {
.direct_IO = noop_direct_IO,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = ext4_iomap_swap_activate,
};
@@ -3581,7 +3550,7 @@ static const struct address_space_operations ext4_journalled_aops = {
.direct_IO = noop_direct_IO,
.migrate_folio = buffer_migrate_folio_norefs,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = ext4_iomap_swap_activate,
};
@@ -3598,7 +3567,7 @@ static const struct address_space_operations ext4_da_aops = {
.direct_IO = noop_direct_IO,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = ext4_iomap_swap_activate,
};
@@ -3630,6 +3599,12 @@ void ext4_set_aops(struct inode *inode)
inode->i_mapping->a_ops = &ext4_aops;
}
+/*
+ * Here we can't skip an unwritten buffer even though it usually reads zero
+ * because it might have data in pagecache (eg, if called from ext4_zero_range,
+ * ext4_punch_hole, etc) which needs to be properly zeroed out. Otherwise a
+ * racing writeback can come later and flush the stale pagecache to disk.
+ */
static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
@@ -4008,12 +3983,12 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
/* If there are blocks to remove, do it */
if (stop_block > first_block) {
+ ext4_lblk_t hole_len = stop_block - first_block;
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
- ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
+ ext4_es_remove_extent(inode, first_block, hole_len);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_remove_space(inode, first_block,
@@ -4022,6 +3997,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
ret = ext4_ind_remove_space(handle, inode, first_block,
stop_block);
+ ext4_es_insert_extent(inode, first_block, hole_len, ~0,
+ EXTENT_STATUS_HOLE);
up_write(&EXT4_I(inode)->i_data_sem);
}
ext4_fc_track_range(handle, inode, first_block, stop_block);
@@ -4163,7 +4140,7 @@ int ext4_truncate(struct inode *inode)
down_write(&EXT4_I(inode)->i_data_sem);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
err = ext4_ext_truncate(handle, inode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 4f931f80cb34..7160a71044c8 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -467,7 +467,7 @@ static long swap_inode_boot_loader(struct super_block *sb,
ext4_reset_inode_seed(inode);
ext4_reset_inode_seed(inode_bl);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
err = ext4_mark_inode_dirty(handle, inode);
if (err < 0) {
@@ -819,11 +819,11 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags)
switch (flags) {
case EXT4_GOING_FLAGS_DEFAULT:
- ret = freeze_bdev(sb->s_bdev);
+ ret = bdev_freeze(sb->s_bdev);
if (ret)
return ret;
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
- thaw_bdev(sb->s_bdev);
+ bdev_thaw(sb->s_bdev);
break;
case EXT4_GOING_FLAGS_LOGFLUSH:
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d72b5e3c92ec..e4f7cf9d89c4 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -564,14 +564,14 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(EXT4_SB(sb), first + i);
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(sb, e4b->bd_group,
inode ? inode->i_ino : 0,
blocknr,
"freeing block already freed "
"(bit %u)",
first + i);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
}
mb_clear_bit(first + i, e4b->bd_info->bb_bitmap);
}
@@ -677,7 +677,7 @@ do { \
} \
} while (0)
-static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
+static void __mb_check_buddy(struct ext4_buddy *e4b, char *file,
const char *function, int line)
{
struct super_block *sb = e4b->bd_sb;
@@ -696,7 +696,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
void *buddy2;
if (e4b->bd_info->bb_check_counter++ % 10)
- return 0;
+ return;
while (order > 1) {
buddy = mb_find_buddy(e4b, order, &max);
@@ -758,7 +758,7 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
grp = ext4_get_group_info(sb, e4b->bd_group);
if (!grp)
- return NULL;
+ return;
list_for_each(cur, &grp->bb_prealloc_list) {
ext4_group_t groupnr;
struct ext4_prealloc_space *pa;
@@ -768,7 +768,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file,
for (i = 0; i < pa->pa_len; i++)
MB_CHECK_ASSERT(mb_test_bit(k + i, buddy));
}
- return 0;
}
#undef MB_CHECK_ASSERT
#define mb_check_buddy(e4b) __mb_check_buddy(e4b, \
@@ -842,7 +841,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
struct ext4_sb_info *sbi = EXT4_SB(sb);
int new_order;
- if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0)
+ if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_fragments == 0)
return;
new_order = mb_avg_fragment_size_order(sb,
@@ -871,7 +870,7 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
* cr level needs an update.
*/
static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+ enum criteria *new_cr, ext4_group_t *group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_group_info *iter;
@@ -945,7 +944,7 @@ ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int o
* order. Updates *new_cr if cr level needs an update.
*/
static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+ enum criteria *new_cr, ext4_group_t *group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_group_info *grp = NULL;
@@ -990,7 +989,7 @@ static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *
* much and fall to CR_GOAL_LEN_SLOW in that case.
*/
static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
- enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+ enum criteria *new_cr, ext4_group_t *group)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_group_info *grp = NULL;
@@ -1125,11 +1124,11 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
}
if (*new_cr == CR_POWER2_ALIGNED) {
- ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups);
+ ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group);
} else if (*new_cr == CR_GOAL_LEN_FAST) {
- ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups);
+ ext4_mb_choose_next_group_goal_fast(ac, new_cr, group);
} else if (*new_cr == CR_BEST_AVAIL_LEN) {
- ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups);
+ ext4_mb_choose_next_group_best_avail(ac, new_cr, group);
} else {
/*
* TODO: For CR=2, we can arrange groups in an rb tree sorted by
@@ -1233,6 +1232,24 @@ void ext4_mb_generate_buddy(struct super_block *sb,
atomic64_add(period, &sbi->s_mb_generation_time);
}
+static void mb_regenerate_buddy(struct ext4_buddy *e4b)
+{
+ int count;
+ int order = 1;
+ void *buddy;
+
+ while ((buddy = mb_find_buddy(e4b, order++, &count)))
+ mb_set_bits(buddy, 0, count);
+
+ e4b->bd_info->bb_fragments = 0;
+ memset(e4b->bd_info->bb_counters, 0,
+ sizeof(*e4b->bd_info->bb_counters) *
+ (e4b->bd_sb->s_blocksize_bits + 2));
+
+ ext4_mb_generate_buddy(e4b->bd_sb, e4b->bd_buddy,
+ e4b->bd_bitmap, e4b->bd_group, e4b->bd_info);
+}
+
/* The buddy information is attached the buddy cache inode
* for convenience. The information regarding each group
* is loaded via ext4_mb_load_buddy. The information involve
@@ -1456,9 +1473,8 @@ static int ext4_mb_get_buddy_page_lock(struct super_block *sb,
return 0;
}
- block++;
- pnum = block / blocks_per_page;
- page = find_or_create_page(inode->i_mapping, pnum, gfp);
+ /* blocks_per_page == 1, hence we need another page for the buddy */
+ page = find_or_create_page(inode->i_mapping, block + 1, gfp);
if (!page)
return -ENOMEM;
BUG_ON(page->mapping != inode->i_mapping);
@@ -1892,11 +1908,6 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
mb_check_buddy(e4b);
mb_free_blocks_double(inode, e4b, first, count);
- this_cpu_inc(discard_pa_seq);
- e4b->bd_info->bb_free += count;
- if (first < e4b->bd_info->bb_first_free)
- e4b->bd_info->bb_first_free = first;
-
/* access memory sequentially: check left neighbour,
* clear range and then check right neighbour
*/
@@ -1910,21 +1921,31 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_fsblk_t blocknr;
+ /*
+ * Fastcommit replay can free already freed blocks which
+ * corrupts allocation info. Regenerate it.
+ */
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ mb_regenerate_buddy(e4b);
+ goto check;
+ }
+
blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
blocknr += EXT4_C2B(sbi, block);
- if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
- ext4_grp_locked_error(sb, e4b->bd_group,
- inode ? inode->i_ino : 0,
- blocknr,
- "freeing already freed block (bit %u); block bitmap corrupt.",
- block);
- ext4_mark_group_bitmap_corrupted(
- sb, e4b->bd_group,
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
EXT4_GROUP_INFO_BBITMAP_CORRUPT);
- }
- goto done;
+ ext4_grp_locked_error(sb, e4b->bd_group,
+ inode ? inode->i_ino : 0, blocknr,
+ "freeing already freed block (bit %u); block bitmap corrupt.",
+ block);
+ return;
}
+ this_cpu_inc(discard_pa_seq);
+ e4b->bd_info->bb_free += count;
+ if (first < e4b->bd_info->bb_first_free)
+ e4b->bd_info->bb_first_free = first;
+
/* let's maintain fragments counter */
if (left_is_free && right_is_free)
e4b->bd_info->bb_fragments--;
@@ -1949,17 +1970,16 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
if (first <= last)
mb_buddy_mark_free(e4b, first >> 1, last >> 1);
-done:
mb_set_largest_free_order(sb, e4b->bd_info);
mb_update_avg_fragment_size(sb, e4b->bd_info);
+check:
mb_check_buddy(e4b);
}
static int mb_find_extent(struct ext4_buddy *e4b, int block,
int needed, struct ext4_free_extent *ex)
{
- int next = block;
- int max, order;
+ int max, order, next;
void *buddy;
assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
@@ -1977,16 +1997,12 @@ static int mb_find_extent(struct ext4_buddy *e4b, int block,
/* find actual order */
order = mb_find_order_for_block(e4b, block);
- block = block >> order;
- ex->fe_len = 1 << order;
- ex->fe_start = block << order;
+ ex->fe_len = (1 << order) - (block & ((1 << order) - 1));
+ ex->fe_start = block;
ex->fe_group = e4b->bd_group;
- /* calc difference from given start */
- next = next - ex->fe_start;
- ex->fe_len -= next;
- ex->fe_start += next;
+ block = block >> order;
while (needed > ex->fe_len &&
mb_find_buddy(e4b, order, &max)) {
@@ -2282,6 +2298,9 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
return;
ext4_lock_group(ac->ac_sb, group);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ goto out;
+
max = mb_find_extent(e4b, ex.fe_start, ex.fe_len, &ex);
if (max > 0) {
@@ -2289,6 +2308,7 @@ void ext4_mb_try_best_found(struct ext4_allocation_context *ac,
ext4_mb_use_best_found(ac, e4b);
}
+out:
ext4_unlock_group(ac->ac_sb, group);
ext4_mb_unload_buddy(e4b);
}
@@ -2315,12 +2335,10 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
if (err)
return err;
- if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info))) {
- ext4_mb_unload_buddy(e4b);
- return 0;
- }
-
ext4_lock_group(ac->ac_sb, group);
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ goto out;
+
max = mb_find_extent(e4b, ac->ac_g_ex.fe_start,
ac->ac_g_ex.fe_len, &ex);
ex.fe_logical = 0xDEADFA11; /* debug value */
@@ -2353,6 +2371,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ac->ac_b_ex = ex;
ext4_mb_use_best_found(ac, e4b);
}
+out:
ext4_unlock_group(ac->ac_sb, group);
ext4_mb_unload_buddy(e4b);
@@ -2386,12 +2405,12 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
k = mb_find_next_zero_bit(buddy, max, 0);
if (k >= max) {
+ ext4_mark_group_bitmap_corrupted(ac->ac_sb,
+ e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(ac->ac_sb, e4b->bd_group, 0, 0,
"%d free clusters of order %d. But found 0",
grp->bb_counters[i], i);
- ext4_mark_group_bitmap_corrupted(ac->ac_sb,
- e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
break;
}
ac->ac_found++;
@@ -2442,12 +2461,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
* free blocks even though group info says we
* have free blocks
*/
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
"group info. But bitmap says 0",
free);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
break;
}
@@ -2473,12 +2492,12 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
if (WARN_ON(ex.fe_len <= 0))
break;
if (free < ex.fe_len) {
+ ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
+ EXT4_GROUP_INFO_BBITMAP_CORRUPT);
ext4_grp_locked_error(sb, e4b->bd_group, 0, 0,
"%d free clusters as per "
"group info. But got %d blocks",
free, ex.fe_len);
- ext4_mark_group_bitmap_corrupted(sb, e4b->bd_group,
- EXT4_GROUP_INFO_BBITMAP_CORRUPT);
/*
* The number of free blocks differs. This mostly
* indicate that the bitmap is corrupt. So exit
@@ -2895,14 +2914,19 @@ repeat:
ac->ac_groups_scanned++;
if (cr == CR_POWER2_ALIGNED)
ext4_mb_simple_scan_group(ac, &e4b);
- else if ((cr == CR_GOAL_LEN_FAST ||
- cr == CR_BEST_AVAIL_LEN) &&
- sbi->s_stripe &&
- !(ac->ac_g_ex.fe_len %
- EXT4_B2C(sbi, sbi->s_stripe)))
- ext4_mb_scan_aligned(ac, &e4b);
- else
- ext4_mb_complex_scan_group(ac, &e4b);
+ else {
+ bool is_stripe_aligned = sbi->s_stripe &&
+ !(ac->ac_g_ex.fe_len %
+ EXT4_B2C(sbi, sbi->s_stripe));
+
+ if ((cr == CR_GOAL_LEN_FAST ||
+ cr == CR_BEST_AVAIL_LEN) &&
+ is_stripe_aligned)
+ ext4_mb_scan_aligned(ac, &e4b);
+
+ if (ac->ac_status == AC_STATUS_CONTINUE)
+ ext4_mb_complex_scan_group(ac, &e4b);
+ }
ext4_unlock_group(sb, group);
ext4_mb_unload_buddy(&e4b);
@@ -3726,7 +3750,7 @@ static int ext4_mb_cleanup_pa(struct ext4_group_info *grp)
return count;
}
-int ext4_mb_release(struct super_block *sb)
+void ext4_mb_release(struct super_block *sb)
{
ext4_group_t ngroups = ext4_get_groups_count(sb);
ext4_group_t i;
@@ -3802,8 +3826,6 @@ int ext4_mb_release(struct super_block *sb)
}
free_percpu(sbi->s_locality_groups);
-
- return 0;
}
static inline int ext4_issue_discard(struct super_block *sb,
@@ -5285,7 +5307,7 @@ static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac)
* the caller MUST hold group/inode locks.
* TODO: optimize the case when there are no in-core structures yet
*/
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
struct ext4_prealloc_space *pa)
{
@@ -5335,11 +5357,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
*/
}
atomic_add(free, &sbi->s_mb_discarded);
-
- return 0;
}
-static noinline_for_stack int
+static noinline_for_stack void
ext4_mb_release_group_pa(struct ext4_buddy *e4b,
struct ext4_prealloc_space *pa)
{
@@ -5353,13 +5373,11 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b,
if (unlikely(group != e4b->bd_group && pa->pa_len != 0)) {
ext4_warning(sb, "bad group: expected %u, group %u, pa_start %llu",
e4b->bd_group, group, pa->pa_pstart);
- return 0;
+ return;
}
mb_free_blocks(pa->pa_inode, e4b, bit, pa->pa_len);
atomic_add(pa->pa_len, &EXT4_SB(sb)->s_mb_discarded);
trace_ext4_mballoc_discard(sb, NULL, group, bit, pa->pa_len);
-
- return 0;
}
/*
@@ -5480,7 +5498,7 @@ out_dbg:
*
* FIXME!! Make sure it is valid at all the call sites
*/
-void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
+void ext4_discard_preallocations(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct super_block *sb = inode->i_sb;
@@ -5492,9 +5510,8 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
struct rb_node *iter;
int err;
- if (!S_ISREG(inode->i_mode)) {
+ if (!S_ISREG(inode->i_mode))
return;
- }
if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY)
return;
@@ -5502,15 +5519,12 @@ void ext4_discard_preallocations(struct inode *inode, unsigned int needed)
mb_debug(sb, "discard preallocation for inode %lu\n",
inode->i_ino);
trace_ext4_discard_preallocations(inode,
- atomic_read(&ei->i_prealloc_active), needed);
-
- if (needed == 0)
- needed = UINT_MAX;
+ atomic_read(&ei->i_prealloc_active));
repeat:
/* first, collect all pa's in the inode */
write_lock(&ei->i_prealloc_lock);
- for (iter = rb_first(&ei->i_prealloc_node); iter && needed;
+ for (iter = rb_first(&ei->i_prealloc_node); iter;
iter = rb_next(iter)) {
pa = rb_entry(iter, struct ext4_prealloc_space,
pa_node.inode_node);
@@ -5534,7 +5548,6 @@ repeat:
spin_unlock(&pa->pa_lock);
rb_erase(&pa->pa_node.inode_node, &ei->i_prealloc_node);
list_add(&pa->u.pa_tmp_list, &list);
- needed--;
continue;
}
@@ -5944,7 +5957,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
/*
* release all resource we used in allocation
*/
-static int ext4_mb_release_context(struct ext4_allocation_context *ac)
+static void ext4_mb_release_context(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_prealloc_space *pa = ac->ac_pa;
@@ -5981,7 +5994,6 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)
mutex_unlock(&ac->ac_lg->lg_mutex);
ext4_mb_collect_stats(ac);
- return 0;
}
static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
@@ -6735,11 +6747,16 @@ __acquires(bitlock)
static ext4_grpblk_t ext4_last_grp_cluster(struct super_block *sb,
ext4_group_t grp)
{
- if (grp < ext4_get_groups_count(sb))
- return EXT4_CLUSTERS_PER_GROUP(sb) - 1;
- return (ext4_blocks_count(EXT4_SB(sb)->s_es) -
- ext4_group_first_block_no(sb, grp) - 1) >>
- EXT4_CLUSTER_BITS(sb);
+ unsigned long nr_clusters_in_group;
+
+ if (grp < (ext4_get_groups_count(sb) - 1))
+ nr_clusters_in_group = EXT4_CLUSTERS_PER_GROUP(sb);
+ else
+ nr_clusters_in_group = (ext4_blocks_count(EXT4_SB(sb)->s_es) -
+ ext4_group_first_block_no(sb, grp))
+ >> EXT4_CLUSTER_BITS(sb);
+
+ return nr_clusters_in_group - 1;
}
static bool ext4_trim_interrupted(void)
@@ -6753,13 +6770,18 @@ static int ext4_try_to_trim_range(struct super_block *sb,
__acquires(ext4_group_lock_ptr(sb, e4b->bd_group))
__releases(ext4_group_lock_ptr(sb, e4b->bd_group))
{
- ext4_grpblk_t next, count, free_count;
+ ext4_grpblk_t next, count, free_count, last, origin_start;
bool set_trimmed = false;
void *bitmap;
+ if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(e4b->bd_info)))
+ return 0;
+
+ last = ext4_last_grp_cluster(sb, e4b->bd_group);
bitmap = e4b->bd_bitmap;
- if (start == 0 && max >= ext4_last_grp_cluster(sb, e4b->bd_group))
+ if (start == 0 && max >= last)
set_trimmed = true;
+ origin_start = start;
start = max(e4b->bd_info->bb_first_free, start);
count = 0;
free_count = 0;
@@ -6768,7 +6790,10 @@ __releases(ext4_group_lock_ptr(sb, e4b->bd_group))
start = mb_find_next_zero_bit(bitmap, max + 1, start);
if (start > max)
break;
- next = mb_find_next_bit(bitmap, max + 1, start);
+
+ next = mb_find_next_bit(bitmap, last + 1, start);
+ if (origin_start == 0 && next >= last)
+ set_trimmed = true;
if ((next - start) >= minblocks) {
int ret = ext4_trim_extent(sb, start, next - start, e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index d7aeb5da7d86..56938532b4ce 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -192,7 +192,6 @@ struct ext4_allocation_context {
*/
ext4_grpblk_t ac_orig_goal_len;
- __u32 ac_groups_considered;
__u32 ac_flags; /* allocation hints */
__u16 ac_groups_scanned;
__u16 ac_groups_linear_remaining;
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3aa57376d9c2..7cd4afa4de1d 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -618,6 +618,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
goto out;
o_end = o_start + len;
+ *moved_len = 0;
while (o_start < o_end) {
struct ext4_extent *ex;
ext4_lblk_t cur_blk, next_blk;
@@ -672,7 +673,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
*/
ext4_double_up_write_data_sem(orig_inode, donor_inode);
/* Swap original branches with new branches */
- move_extent_per_page(o_filp, donor_inode,
+ *moved_len += move_extent_per_page(o_filp, donor_inode,
orig_page_index, donor_page_index,
offset_in_page, cur_len,
unwritten, &ret);
@@ -682,14 +683,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
o_start += cur_len;
d_start += cur_len;
}
- *moved_len = o_start - orig_blk;
- if (*moved_len > len)
- *moved_len = len;
out:
if (*moved_len) {
- ext4_discard_preallocations(orig_inode, 0);
- ext4_discard_preallocations(donor_inode, 0);
+ ext4_discard_preallocations(orig_inode);
+ ext4_discard_preallocations(donor_inode);
}
ext4_free_ext_path(path);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index d252935f9c8a..05b647e6bc19 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2388,8 +2388,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
sb = dir->i_sb;
blocksize = sb->s_blocksize;
- if (!dentry->d_name.len)
- return -EINVAL;
if (fscrypt_is_nokey_name(dentry))
return -ENOKEY;
@@ -3591,10 +3589,14 @@ struct ext4_renament {
int dir_inlined;
};
-static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent)
+static int ext4_rename_dir_prepare(handle_t *handle, struct ext4_renament *ent, bool is_cross)
{
int retval;
+ ent->is_dir = true;
+ if (!is_cross)
+ return 0;
+
ent->dir_bh = ext4_get_first_dir_block(handle, ent->inode,
&retval, &ent->parent_de,
&ent->dir_inlined);
@@ -3612,6 +3614,9 @@ static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent,
{
int retval;
+ if (!ent->dir_bh)
+ return 0;
+
ent->parent_de->inode = cpu_to_le32(dir_ino);
BUFFER_TRACE(ent->dir_bh, "call ext4_handle_dirty_metadata");
if (!ent->dir_inlined) {
@@ -3900,7 +3905,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (new.dir != old.dir && EXT4_DIR_LINK_MAX(new.dir))
goto end_rename;
}
- retval = ext4_rename_dir_prepare(handle, &old);
+ retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir);
if (retval)
goto end_rename;
}
@@ -3964,7 +3969,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
}
inode_set_mtime_to_ts(old.dir, inode_set_ctime_current(old.dir));
ext4_update_dx_flag(old.dir);
- if (old.dir_bh) {
+ if (old.is_dir) {
retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino);
if (retval)
goto end_rename;
@@ -3987,7 +3992,7 @@ static int ext4_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (unlikely(retval))
goto end_rename;
- if (S_ISDIR(old.inode->i_mode)) {
+ if (old.is_dir) {
/*
* We disable fast commits here that's because the
* replay code is not yet capable of changing dot dot
@@ -4114,14 +4119,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_handle_sync(handle);
if (S_ISDIR(old.inode->i_mode)) {
- old.is_dir = true;
- retval = ext4_rename_dir_prepare(handle, &old);
+ retval = ext4_rename_dir_prepare(handle, &old, new.dir != old.dir);
if (retval)
goto end_rename;
}
if (S_ISDIR(new.inode->i_mode)) {
- new.is_dir = true;
- retval = ext4_rename_dir_prepare(handle, &new);
+ retval = ext4_rename_dir_prepare(handle, &new, new.dir != old.dir);
if (retval)
goto end_rename;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index dfdd7e5cf038..312bc6813357 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -444,7 +444,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio,
folio_clear_error(folio);
/*
- * Comments copied from block_write_full_page:
+ * Comments copied from block_write_full_folio:
*
* The folio straddles i_size. It must be zeroed out on each and every
* writepage invocation because it may be mmapped. "A file is mapped
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 4fe061edefdd..4d4a5a32e310 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -218,35 +218,53 @@ struct ext4_new_flex_group_data {
in the flex group */
__u16 *bg_flags; /* block group flags of groups
in @groups */
+ ext4_group_t resize_bg; /* number of allocated
+ new_group_data */
ext4_group_t count; /* number of groups in @groups
*/
};
/*
+ * Avoiding memory allocation failures due to too many groups added each time.
+ */
+#define MAX_RESIZE_BG 16384
+
+/*
* alloc_flex_gd() allocates a ext4_new_flex_group_data with size of
* @flexbg_size.
*
* Returns NULL on failure otherwise address of the allocated structure.
*/
-static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned long flexbg_size)
+static struct ext4_new_flex_group_data *alloc_flex_gd(unsigned int flexbg_size,
+ ext4_group_t o_group, ext4_group_t n_group)
{
+ ext4_group_t last_group;
struct ext4_new_flex_group_data *flex_gd;
flex_gd = kmalloc(sizeof(*flex_gd), GFP_NOFS);
if (flex_gd == NULL)
goto out3;
- if (flexbg_size >= UINT_MAX / sizeof(struct ext4_new_group_data))
- goto out2;
- flex_gd->count = flexbg_size;
+ if (unlikely(flexbg_size > MAX_RESIZE_BG))
+ flex_gd->resize_bg = MAX_RESIZE_BG;
+ else
+ flex_gd->resize_bg = flexbg_size;
+
+ /* Avoid allocating large 'groups' array if not needed */
+ last_group = o_group | (flex_gd->resize_bg - 1);
+ if (n_group <= last_group)
+ flex_gd->resize_bg = 1 << fls(n_group - o_group + 1);
+ else if (n_group - last_group < flex_gd->resize_bg)
+ flex_gd->resize_bg = 1 << max(fls(last_group - o_group + 1),
+ fls(n_group - last_group));
- flex_gd->groups = kmalloc_array(flexbg_size,
+ flex_gd->groups = kmalloc_array(flex_gd->resize_bg,
sizeof(struct ext4_new_group_data),
GFP_NOFS);
if (flex_gd->groups == NULL)
goto out2;
- flex_gd->bg_flags = kmalloc_array(flexbg_size, sizeof(__u16),
+ flex_gd->bg_flags = kmalloc_array(flex_gd->resize_bg, sizeof(__u16),
GFP_NOFS);
if (flex_gd->bg_flags == NULL)
goto out1;
@@ -283,7 +301,7 @@ static void free_flex_gd(struct ext4_new_flex_group_data *flex_gd)
*/
static int ext4_alloc_group_tables(struct super_block *sb,
struct ext4_new_flex_group_data *flex_gd,
- int flexbg_size)
+ unsigned int flexbg_size)
{
struct ext4_new_group_data *group_data = flex_gd->groups;
ext4_fsblk_t start_blk;
@@ -384,12 +402,12 @@ next_group:
group = group_data[0].group;
printk(KERN_DEBUG "EXT4-fs: adding a flex group with "
- "%d groups, flexbg size is %d:\n", flex_gd->count,
+ "%u groups, flexbg size is %u:\n", flex_gd->count,
flexbg_size);
for (i = 0; i < flex_gd->count; i++) {
ext4_debug(
- "adding %s group %u: %u blocks (%d free, %d mdata blocks)\n",
+ "adding %s group %u: %u blocks (%u free, %u mdata blocks)\n",
ext4_bg_has_super(sb, group + i) ? "normal" :
"no-super", group + i,
group_data[i].blocks_count,
@@ -1605,8 +1623,7 @@ exit:
static int ext4_setup_next_flex_gd(struct super_block *sb,
struct ext4_new_flex_group_data *flex_gd,
- ext4_fsblk_t n_blocks_count,
- unsigned long flexbg_size)
+ ext4_fsblk_t n_blocks_count)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
@@ -1630,7 +1647,7 @@ static int ext4_setup_next_flex_gd(struct super_block *sb,
BUG_ON(last);
ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &last);
- last_group = group | (flexbg_size - 1);
+ last_group = group | (flex_gd->resize_bg - 1);
if (last_group > n_group)
last_group = n_group;
@@ -1990,8 +2007,9 @@ int ext4_resize_fs(struct super_block *sb, ext4_fsblk_t n_blocks_count)
ext4_fsblk_t o_blocks_count;
ext4_fsblk_t n_blocks_count_retry = 0;
unsigned long last_update_time = 0;
- int err = 0, flexbg_size = 1 << sbi->s_log_groups_per_flex;
+ int err = 0;
int meta_bg;
+ unsigned int flexbg_size = ext4_flex_bg_size(sbi);
/* See if the device is actually as big as what was requested */
bh = ext4_sb_bread(sb, n_blocks_count - 1, 0);
@@ -2123,7 +2141,7 @@ retry:
if (err)
goto out;
- flex_gd = alloc_flex_gd(flexbg_size);
+ flex_gd = alloc_flex_gd(flexbg_size, o_group, n_group);
if (flex_gd == NULL) {
err = -ENOMEM;
goto out;
@@ -2132,8 +2150,7 @@ retry:
/* Add flex groups. Note that a regular group is a
* flex group with 1 group.
*/
- while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count,
- flexbg_size)) {
+ while (ext4_setup_next_flex_gd(sb, flex_gd, n_blocks_count)) {
if (time_is_before_jiffies(last_update_time + HZ * 10)) {
if (last_update_time)
ext4_msg(sb, KERN_INFO,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c5fcf377ab1f..0f931d0c227d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1525,7 +1525,7 @@ void ext4_clear_inode(struct inode *inode)
ext4_fc_del(inode);
invalidate_inode_buffers(inode);
clear_inode(inode);
- ext4_discard_preallocations(inode, 0);
+ ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
dquot_drop(inode);
if (EXT4_I(inode)->jinode) {
@@ -2793,15 +2793,6 @@ static int ext4_check_opt_consistency(struct fs_context *fc,
return -EINVAL;
}
- if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DIOREAD_NOLOCK)) {
- int blocksize =
- BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
- if (blocksize < PAGE_SIZE)
- ext4_msg(NULL, KERN_WARNING, "Warning: mounting with an "
- "experimental mount option 'dioread_nolock' "
- "for blocksize < PAGE_SIZE");
- }
-
err = ext4_check_test_dummy_encryption(fc, sb);
if (err)
return err;
@@ -4410,7 +4401,7 @@ static void ext4_set_def_opts(struct super_block *sb,
((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0))
set_opt(sb, DELALLOC);
- if (sb->s_blocksize == PAGE_SIZE)
+ if (sb->s_blocksize <= PAGE_SIZE)
set_opt(sb, DIOREAD_NOLOCK);
}
@@ -5864,11 +5855,9 @@ static struct bdev_handle *ext4_get_journal_blkdev(struct super_block *sb,
struct ext4_super_block *es;
int errno;
- /* see get_tree_bdev why this is needed and safe */
- up_write(&sb->s_umount);
- bdev_handle = bdev_open_by_dev(j_dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
- sb, &fs_holder_ops);
- down_write(&sb->s_umount);
+ bdev_handle = bdev_open_by_dev(j_dev,
+ BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
+ sb, &fs_holder_ops);
if (IS_ERR(bdev_handle)) {
ext4_msg(sb, KERN_ERR,
"failed to open journal device unknown-block(%u,%u) %ld",
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 75bf1f88843c..645240cc0229 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -92,10 +92,12 @@ static const char *ext4_get_link(struct dentry *dentry, struct inode *inode,
if (!dentry) {
bh = ext4_getblk(NULL, inode, 0, EXT4_GET_BLOCKS_CACHED_NOWAIT);
- if (IS_ERR(bh))
- return ERR_CAST(bh);
- if (!bh || !ext4_buffer_uptodate(bh))
+ if (IS_ERR(bh) || !bh)
return ERR_PTR(-ECHILD);
+ if (!ext4_buffer_uptodate(bh)) {
+ brelse(bh);
+ return ERR_PTR(-ECHILD);
+ }
} else {
bh = ext4_bread(NULL, inode, 0, 0);
if (IS_ERR(bh))
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index 36e5dab6baae..531517dac079 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -1036,8 +1036,10 @@ static void set_cluster_dirty(struct compress_ctx *cc)
int i;
for (i = 0; i < cc->cluster_size; i++)
- if (cc->rpages[i])
+ if (cc->rpages[i]) {
set_page_dirty(cc->rpages[i]);
+ set_page_private_gcing(cc->rpages[i]);
+ }
}
static int prepare_compress_overwrite(struct compress_ctx *cc,
@@ -1369,8 +1371,6 @@ unlock_continue:
add_compr_block_stat(inode, cc->valid_nr_cpages);
set_inode_flag(cc->inode, FI_APPEND_WRITE);
- if (cc->cluster_idx == 0)
- set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
f2fs_put_dnode(&dn);
if (quota_inode)
@@ -1944,7 +1944,7 @@ void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino)
continue;
}
- generic_error_remove_page(mapping, &folio->page);
+ generic_error_remove_folio(mapping, folio);
folio_unlock(folio);
}
folio_batch_release(&fbatch);
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 4e42b5f24deb..26e317696b33 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -995,7 +995,7 @@ static bool is_end_zone_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr)
}
blkaddr -= FDEV(devi).start_blk;
}
- return bdev_zoned_model(FDEV(devi).bdev) == BLK_ZONED_HM &&
+ return bdev_is_zoned(FDEV(devi).bdev) &&
f2fs_blkz_is_seq(sbi, devi, blkaddr) &&
(blkaddr % sbi->blocks_per_blkz == sbi->blocks_per_blkz - 1);
}
@@ -1179,18 +1179,12 @@ static int f2fs_submit_page_read(struct inode *inode, struct page *page,
return 0;
}
-static void __set_data_blkaddr(struct dnode_of_data *dn)
+static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
- struct f2fs_node *rn = F2FS_NODE(dn->node_page);
- __le32 *addr_array;
- int base = 0;
+ __le32 *addr = get_dnode_addr(dn->inode, dn->node_page);
- if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
- base = get_extra_isize(dn->inode);
-
- /* Get physical address of data block */
- addr_array = blkaddr_in_node(rn);
- addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
+ dn->data_blkaddr = blkaddr;
+ addr[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
}
/*
@@ -1199,18 +1193,17 @@ static void __set_data_blkaddr(struct dnode_of_data *dn)
* ->node_page
* update block addresses in the node page
*/
-void f2fs_set_data_blkaddr(struct dnode_of_data *dn)
+void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
- __set_data_blkaddr(dn);
+ __set_data_blkaddr(dn, blkaddr);
if (set_page_dirty(dn->node_page))
dn->node_changed = true;
}
void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
- dn->data_blkaddr = blkaddr;
- f2fs_set_data_blkaddr(dn);
+ f2fs_set_data_blkaddr(dn, blkaddr);
f2fs_update_read_extent_cache(dn);
}
@@ -1237,8 +1230,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
block_t blkaddr = f2fs_data_blkaddr(dn);
if (blkaddr == NULL_ADDR) {
- dn->data_blkaddr = NEW_ADDR;
- __set_data_blkaddr(dn);
+ __set_data_blkaddr(dn, NEW_ADDR);
count--;
}
}
@@ -1492,11 +1484,9 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
old_blkaddr = dn->data_blkaddr;
f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr,
&sum, seg_type, NULL);
- if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) {
- invalidate_mapping_pages(META_MAPPING(sbi),
- old_blkaddr, old_blkaddr);
- f2fs_invalidate_compress_page(sbi, old_blkaddr);
- }
+ if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+ f2fs_invalidate_internal_cache(sbi, old_blkaddr);
+
f2fs_update_data_blkaddr(dn, dn->data_blkaddr);
return 0;
}
@@ -1992,7 +1982,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
if (ret)
return ret;
- inode_lock(inode);
+ inode_lock_shared(inode);
maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS;
if (start > maxbytes) {
@@ -2112,7 +2102,7 @@ out:
if (ret == 1)
ret = 0;
- inode_unlock(inode);
+ inode_unlock_shared(inode);
return ret;
}
@@ -2566,9 +2556,6 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio)
page = fio->compressed_page ? fio->compressed_page : fio->page;
- /* wait for GCed page writeback via META_MAPPING */
- f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
-
if (fscrypt_inode_uses_inline_crypto(inode))
return 0;
@@ -2755,6 +2742,10 @@ got_it:
goto out_writepage;
}
+ /* wait for GCed page writeback via META_MAPPING */
+ if (fio->post_read)
+ f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
+
/*
* If current allocation needs SSR,
* it had better in-place writes for updated data.
@@ -2810,8 +2801,6 @@ got_it:
f2fs_outplace_write_data(&dn, fio);
trace_f2fs_do_write_data_page(page, OPU);
set_inode_flag(inode, FI_APPEND_WRITE);
- if (page->index == 0)
- set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
out_writepage:
f2fs_put_dnode(&dn);
out:
@@ -2894,9 +2883,6 @@ int f2fs_write_single_data_page(struct page *page, int *submitted,
zero_user_segment(page, offset, PAGE_SIZE);
write:
- if (f2fs_is_drop_cache(inode))
- goto out;
-
/* Dentry/quota blocks are controlled by checkpoint */
if (S_ISDIR(inode->i_mode) || quota_inode) {
/*
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9043cedfa12b..65294e3b0bef 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -374,6 +374,12 @@ enum {
MAX_DPOLICY,
};
+enum {
+ DPOLICY_IO_AWARE_DISABLE, /* force to not be aware of IO */
+ DPOLICY_IO_AWARE_ENABLE, /* force to be aware of IO */
+ DPOLICY_IO_AWARE_MAX,
+};
+
struct discard_policy {
int type; /* type of discard */
unsigned int min_interval; /* used for candidates exist */
@@ -406,6 +412,7 @@ struct discard_cmd_control {
unsigned int discard_urgent_util; /* utilization which issue discard proactively */
unsigned int discard_granularity; /* discard granularity */
unsigned int max_ordered_discard; /* maximum discard granularity issued by lba order */
+ unsigned int discard_io_aware; /* io_aware policy */
unsigned int undiscard_blks; /* # of undiscard blocks */
unsigned int next_pos; /* next discard position */
atomic_t issued_discard; /* # of issued discard */
@@ -774,8 +781,6 @@ enum {
FI_UPDATE_WRITE, /* inode has in-place-update data */
FI_NEED_IPU, /* used for ipu per file */
FI_ATOMIC_FILE, /* indicate atomic file */
- FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
- FI_DROP_CACHE, /* drop dirty page cache */
FI_DATA_EXIST, /* indicate data exists */
FI_INLINE_DOTS, /* indicate inline dot dentries */
FI_SKIP_WRITES, /* should skip data page writeback */
@@ -3272,22 +3277,13 @@ static inline bool f2fs_is_cow_file(struct inode *inode)
return is_inode_flag_set(inode, FI_COW_FILE);
}
-static inline bool f2fs_is_first_block_written(struct inode *inode)
-{
- return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN);
-}
-
-static inline bool f2fs_is_drop_cache(struct inode *inode)
-{
- return is_inode_flag_set(inode, FI_DROP_CACHE);
-}
-
+static inline __le32 *get_dnode_addr(struct inode *inode,
+ struct page *node_page);
static inline void *inline_data_addr(struct inode *inode, struct page *page)
{
- struct f2fs_inode *ri = F2FS_INODE(page);
- int extra_size = get_extra_isize(inode);
+ __le32 *addr = get_dnode_addr(inode, page);
- return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]);
+ return (void *)(addr + DEF_INLINE_RESERVED_SIZE);
}
static inline int f2fs_has_inline_dentry(struct inode *inode)
@@ -3432,6 +3428,17 @@ static inline int get_inline_xattr_addrs(struct inode *inode)
return F2FS_I(inode)->i_inline_xattr_size;
}
+static inline __le32 *get_dnode_addr(struct inode *inode,
+ struct page *node_page)
+{
+ int base = 0;
+
+ if (IS_INODE(node_page) && f2fs_has_extra_attr(inode))
+ base = get_extra_isize(inode);
+
+ return blkaddr_in_node(F2FS_NODE(node_page)) + base;
+}
+
#define f2fs_get_inode_mode(i) \
((is_inode_flag_set(i, FI_ACL_MODE)) ? \
(F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -3815,7 +3822,7 @@ void f2fs_submit_page_write(struct f2fs_io_info *fio);
struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
block_t blk_addr, sector_t *sector);
int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr);
-void f2fs_set_data_blkaddr(struct dnode_of_data *dn);
+void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr);
void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr);
int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count);
int f2fs_reserve_new_block(struct dnode_of_data *dn);
@@ -4606,6 +4613,13 @@ static inline bool f2fs_is_readonly(struct f2fs_sb_info *sbi)
return f2fs_sb_has_readonly(sbi) || f2fs_readonly(sbi->sb);
}
+static inline void f2fs_invalidate_internal_cache(struct f2fs_sb_info *sbi,
+ block_t blkaddr)
+{
+ invalidate_mapping_pages(META_MAPPING(sbi), blkaddr, blkaddr);
+ f2fs_invalidate_compress_page(sbi, blkaddr);
+}
+
#define EFSBADCRC EBADMSG /* Bad CRC detected */
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index e50363583f01..b58ab1157b7e 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -42,11 +42,11 @@ static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf)
vm_fault_t ret;
ret = filemap_fault(vmf);
- if (!ret)
+ if (ret & VM_FAULT_LOCKED)
f2fs_update_iostat(F2FS_I_SB(inode), inode,
APP_MAPPED_READ_IO, F2FS_BLKSIZE);
- trace_f2fs_filemap_fault(inode, vmf->pgoff, (unsigned long)ret);
+ trace_f2fs_filemap_fault(inode, vmf->pgoff, vmf->vma->vm_flags, ret);
return ret;
}
@@ -59,26 +59,29 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
struct dnode_of_data dn;
bool need_alloc = true;
int err = 0;
+ vm_fault_t ret;
if (unlikely(IS_IMMUTABLE(inode)))
return VM_FAULT_SIGBUS;
- if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED))
- return VM_FAULT_SIGBUS;
+ if (is_inode_flag_set(inode, FI_COMPRESS_RELEASED)) {
+ err = -EIO;
+ goto out;
+ }
if (unlikely(f2fs_cp_error(sbi))) {
err = -EIO;
- goto err;
+ goto out;
}
if (!f2fs_is_checkpoint_ready(sbi)) {
err = -ENOSPC;
- goto err;
+ goto out;
}
err = f2fs_convert_inline_inode(inode);
if (err)
- goto err;
+ goto out;
#ifdef CONFIG_F2FS_FS_COMPRESSION
if (f2fs_compressed_file(inode)) {
@@ -86,7 +89,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
if (ret < 0) {
err = ret;
- goto err;
+ goto out;
} else if (ret) {
need_alloc = false;
}
@@ -153,13 +156,15 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
f2fs_update_iostat(sbi, inode, APP_MAPPED_IO, F2FS_BLKSIZE);
f2fs_update_time(sbi, REQ_TIME);
- trace_f2fs_vm_page_mkwrite(page, DATA);
out_sem:
filemap_invalidate_unlock_shared(inode->i_mapping);
sb_end_pagefault(inode->i_sb);
-err:
- return vmf_fs_error(err);
+out:
+ ret = vmf_fs_error(err);
+
+ trace_f2fs_vm_page_mkwrite(inode, page->index, vmf->vma->vm_flags, ret);
+ return ret;
}
static const struct vm_operations_struct f2fs_file_vm_ops = {
@@ -418,7 +423,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
loff_t isize;
int err = 0;
- inode_lock(inode);
+ inode_lock_shared(inode);
isize = i_size_read(inode);
if (offset >= isize)
@@ -483,10 +488,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
found:
if (whence == SEEK_HOLE && data_ofs > isize)
data_ofs = isize;
- inode_unlock(inode);
+ inode_unlock_shared(inode);
return vfs_setpos(file, data_ofs, maxbytes);
fail:
- inode_unlock(inode);
+ inode_unlock_shared(inode);
return -ENXIO;
}
@@ -557,20 +562,14 @@ static int f2fs_file_open(struct inode *inode, struct file *filp)
void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
- struct f2fs_node *raw_node;
int nr_free = 0, ofs = dn->ofs_in_node, len = count;
__le32 *addr;
- int base = 0;
bool compressed_cluster = false;
int cluster_index = 0, valid_blocks = 0;
int cluster_size = F2FS_I(dn->inode)->i_cluster_size;
bool released = !atomic_read(&F2FS_I(dn->inode)->i_compr_blocks);
- if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
- base = get_extra_isize(dn->inode);
-
- raw_node = F2FS_NODE(dn->node_page);
- addr = blkaddr_in_node(raw_node) + base + ofs;
+ addr = get_dnode_addr(dn->inode, dn->node_page) + ofs;
/* Assumption: truncation starts with cluster */
for (; count > 0; count--, addr++, dn->ofs_in_node++, cluster_index++) {
@@ -588,8 +587,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
if (blkaddr == NULL_ADDR)
continue;
- dn->data_blkaddr = NULL_ADDR;
- f2fs_set_data_blkaddr(dn);
+ f2fs_set_data_blkaddr(dn, NULL_ADDR);
if (__is_valid_data_blkaddr(blkaddr)) {
if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
@@ -599,9 +597,6 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count)
valid_blocks++;
}
- if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page))
- clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN);
-
f2fs_invalidate_blocks(sbi, blkaddr);
if (!released || blkaddr != COMPRESS_ADDR)
@@ -1317,6 +1312,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
}
memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE);
set_page_dirty(pdst);
+ set_page_private_gcing(pdst);
f2fs_put_page(pdst, 1);
f2fs_put_page(psrc, 1);
@@ -1487,8 +1483,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
}
f2fs_invalidate_blocks(sbi, dn->data_blkaddr);
- dn->data_blkaddr = NEW_ADDR;
- f2fs_set_data_blkaddr(dn);
+ f2fs_set_data_blkaddr(dn, NEW_ADDR);
}
f2fs_update_read_extent_cache_range(dn, start, 0, index - start);
@@ -2239,11 +2234,11 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
switch (in) {
case F2FS_GOING_DOWN_FULLSYNC:
- ret = freeze_bdev(sb->s_bdev);
+ ret = bdev_freeze(sb->s_bdev);
if (ret)
goto out;
f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_SHUTDOWN);
- thaw_bdev(sb->s_bdev);
+ bdev_thaw(sb->s_bdev);
break;
case F2FS_GOING_DOWN_METASYNC:
/* do checkpoint only */
@@ -2818,6 +2813,11 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
goto out;
}
+ if (f2fs_compressed_file(src) || f2fs_compressed_file(dst)) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
+
ret = -EINVAL;
if (pos_in + len > src->i_size || pos_in + len < pos_in)
goto out_unlock;
@@ -3463,8 +3463,7 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
if (blkaddr != NEW_ADDR)
continue;
- dn->data_blkaddr = NULL_ADDR;
- f2fs_set_data_blkaddr(dn);
+ f2fs_set_data_blkaddr(dn, NULL_ADDR);
}
f2fs_i_compr_blocks_update(dn->inode, compr_blocks, false);
@@ -3630,8 +3629,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count)
continue;
}
- dn->data_blkaddr = NEW_ADDR;
- f2fs_set_data_blkaddr(dn);
+ f2fs_set_data_blkaddr(dn, NEW_ADDR);
}
reserved = cluster_size - compr_blocks;
@@ -4054,6 +4052,7 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len)
f2fs_bug_on(F2FS_I_SB(inode), !page);
set_page_dirty(page);
+ set_page_private_gcing(page);
f2fs_put_page(page, 1);
f2fs_put_page(page, 0);
}
@@ -4568,7 +4567,8 @@ static int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *iter,
if (map.m_len > map.m_lblk)
map.m_len -= map.m_lblk;
else
- map.m_len = 0;
+ return 0;
+
map.m_may_create = true;
if (dio) {
map.m_seg_type = f2fs_rw_hint_to_seg_type(inode->i_write_hint);
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index f550cdeaa663..a079eebfb080 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -46,8 +46,8 @@ static int gc_thread_func(void *data)
do {
bool sync_mode, foreground = false;
- wait_event_interruptible_timeout(*wq,
- kthread_should_stop() || freezing(current) ||
+ wait_event_freezable_timeout(*wq,
+ kthread_should_stop() ||
waitqueue_active(fggc_wq) ||
gc_th->gc_wake,
msecs_to_jiffies(wait_ms));
@@ -59,7 +59,7 @@ static int gc_thread_func(void *data)
if (gc_th->gc_wake)
gc_th->gc_wake = false;
- if (try_to_freeze() || f2fs_readonly(sbi->sb)) {
+ if (f2fs_readonly(sbi->sb)) {
stat_other_skip_bggc_count(sbi);
continue;
}
@@ -1380,9 +1380,8 @@ static int move_data_block(struct inode *inode, block_t bidx,
memcpy(page_address(fio.encrypted_page),
page_address(mpage), PAGE_SIZE);
f2fs_put_page(mpage, 1);
- invalidate_mapping_pages(META_MAPPING(fio.sbi),
- fio.old_blkaddr, fio.old_blkaddr);
- f2fs_invalidate_compress_page(fio.sbi, fio.old_blkaddr);
+
+ f2fs_invalidate_internal_cache(fio.sbi, fio.old_blkaddr);
set_page_dirty(fio.encrypted_page);
if (clear_page_dirty_for_io(fio.encrypted_page))
@@ -1405,8 +1404,6 @@ static int move_data_block(struct inode *inode, block_t bidx,
f2fs_update_data_blkaddr(&dn, newaddr);
set_inode_flag(inode, FI_APPEND_WRITE);
- if (page->index == 0)
- set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
put_page_out:
f2fs_put_page(fio.encrypted_page, 1);
recover_block:
@@ -1868,6 +1865,9 @@ retry:
seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type,
gc_control->should_migrate_blocks);
+ if (seg_freed < 0)
+ goto stop;
+
total_freed += seg_freed;
if (seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) {
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 560bfcad1af2..c26effdce9aa 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -61,49 +61,31 @@ void f2fs_set_inode_flags(struct inode *inode)
S_ENCRYPTED|S_VERITY|S_CASEFOLD);
}
-static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
+static void __get_inode_rdev(struct inode *inode, struct page *node_page)
{
- int extra_size = get_extra_isize(inode);
+ __le32 *addr = get_dnode_addr(inode, node_page);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
- if (ri->i_addr[extra_size])
- inode->i_rdev = old_decode_dev(
- le32_to_cpu(ri->i_addr[extra_size]));
+ if (addr[0])
+ inode->i_rdev = old_decode_dev(le32_to_cpu(addr[0]));
else
- inode->i_rdev = new_decode_dev(
- le32_to_cpu(ri->i_addr[extra_size + 1]));
+ inode->i_rdev = new_decode_dev(le32_to_cpu(addr[1]));
}
}
-static int __written_first_block(struct f2fs_sb_info *sbi,
- struct f2fs_inode *ri)
+static void __set_inode_rdev(struct inode *inode, struct page *node_page)
{
- block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]);
-
- if (!__is_valid_data_blkaddr(addr))
- return 1;
- if (!f2fs_is_valid_blkaddr(sbi, addr, DATA_GENERIC_ENHANCE)) {
- f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR);
- return -EFSCORRUPTED;
- }
- return 0;
-}
-
-static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
-{
- int extra_size = get_extra_isize(inode);
+ __le32 *addr = get_dnode_addr(inode, node_page);
if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
if (old_valid_dev(inode->i_rdev)) {
- ri->i_addr[extra_size] =
- cpu_to_le32(old_encode_dev(inode->i_rdev));
- ri->i_addr[extra_size + 1] = 0;
+ addr[0] = cpu_to_le32(old_encode_dev(inode->i_rdev));
+ addr[1] = 0;
} else {
- ri->i_addr[extra_size] = 0;
- ri->i_addr[extra_size + 1] =
- cpu_to_le32(new_encode_dev(inode->i_rdev));
- ri->i_addr[extra_size + 2] = 0;
+ addr[0] = 0;
+ addr[1] = cpu_to_le32(new_encode_dev(inode->i_rdev));
+ addr[2] = 0;
}
}
}
@@ -398,7 +380,6 @@ static int do_read_inode(struct inode *inode)
struct page *node_page;
struct f2fs_inode *ri;
projid_t i_projid;
- int err;
/* Check if ino is within scope */
if (f2fs_check_nid_range(sbi, inode->i_ino))
@@ -478,17 +459,7 @@ static int do_read_inode(struct inode *inode)
}
/* get rdev by using inline_info */
- __get_inode_rdev(inode, ri);
-
- if (S_ISREG(inode->i_mode)) {
- err = __written_first_block(sbi, ri);
- if (err < 0) {
- f2fs_put_page(node_page, 1);
- return err;
- }
- if (!err)
- set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
- }
+ __get_inode_rdev(inode, node_page);
if (!f2fs_need_inode_block_update(sbi, inode->i_ino))
fi->last_disk_size = inode->i_size;
@@ -600,7 +571,7 @@ make_now:
#ifdef CONFIG_F2FS_FS_COMPRESSION
inode->i_mapping->a_ops = &f2fs_compress_aops;
/*
- * generic_error_remove_page only truncates pages of regular
+ * generic_error_remove_folio only truncates pages of regular
* inode
*/
inode->i_mode |= S_IFREG;
@@ -761,7 +732,7 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page)
}
}
- __set_inode_rdev(inode, ri);
+ __set_inode_rdev(inode, node_page);
/* deleted inode */
if (inode->i_nlink == 0)
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index d0053b0284d8..b3bb815fc6aa 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -459,7 +459,6 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
struct qstr dot = QSTR_INIT(".", 1);
- struct qstr dotdot = QSTR_INIT("..", 2);
struct f2fs_dir_entry *de;
struct page *page;
int err = 0;
@@ -497,13 +496,13 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
goto out;
}
- de = f2fs_find_entry(dir, &dotdot, &page);
+ de = f2fs_find_entry(dir, &dotdot_name, &page);
if (de)
f2fs_put_page(page, 0);
else if (IS_ERR(page))
err = PTR_ERR(page);
else
- err = f2fs_do_add_link(dir, &dotdot, NULL, pino, S_IFDIR);
+ err = f2fs_do_add_link(dir, &dotdot_name, NULL, pino, S_IFDIR);
out:
if (!err)
clear_inode_flag(dir, FI_INLINE_DOTS);
@@ -963,6 +962,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct f2fs_dir_entry *old_dir_entry = NULL;
struct f2fs_dir_entry *old_entry;
struct f2fs_dir_entry *new_entry;
+ bool old_is_dir = S_ISDIR(old_inode->i_mode);
int err;
if (unlikely(f2fs_cp_error(sbi)))
@@ -1017,7 +1017,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto out;
}
- if (S_ISDIR(old_inode->i_mode)) {
+ if (old_is_dir && old_dir != new_dir) {
old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page);
if (!old_dir_entry) {
if (IS_ERR(old_dir_page))
@@ -1029,7 +1029,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (new_inode) {
err = -ENOTEMPTY;
- if (old_dir_entry && !f2fs_empty_dir(new_inode))
+ if (old_is_dir && !f2fs_empty_dir(new_inode))
goto out_dir;
err = -ENOENT;
@@ -1054,7 +1054,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
inode_set_ctime_current(new_inode);
f2fs_down_write(&F2FS_I(new_inode)->i_sem);
- if (old_dir_entry)
+ if (old_is_dir)
f2fs_i_links_write(new_inode, false);
f2fs_i_links_write(new_inode, false);
f2fs_up_write(&F2FS_I(new_inode)->i_sem);
@@ -1074,12 +1074,12 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
goto out_dir;
}
- if (old_dir_entry)
+ if (old_is_dir)
f2fs_i_links_write(new_dir, true);
}
f2fs_down_write(&F2FS_I(old_inode)->i_sem);
- if (!old_dir_entry || whiteout)
+ if (!old_is_dir || whiteout)
file_lost_pino(old_inode);
else
/* adjust dir's i_pino to pass fsck check */
@@ -1105,8 +1105,8 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
iput(whiteout);
}
- if (old_dir_entry) {
- if (old_dir != new_dir && !whiteout)
+ if (old_is_dir) {
+ if (old_dir_entry)
f2fs_set_link(old_inode, old_dir_entry,
old_dir_page, new_dir);
else
@@ -1316,21 +1316,27 @@ static int f2fs_rename2(struct mnt_idmap *idmap,
if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
return -EINVAL;
+ trace_f2fs_rename_start(old_dir, old_dentry, new_dir, new_dentry,
+ flags);
+
err = fscrypt_prepare_rename(old_dir, old_dentry, new_dir, new_dentry,
flags);
if (err)
return err;
- if (flags & RENAME_EXCHANGE) {
- return f2fs_cross_rename(old_dir, old_dentry,
- new_dir, new_dentry);
- }
+ if (flags & RENAME_EXCHANGE)
+ err = f2fs_cross_rename(old_dir, old_dentry,
+ new_dir, new_dentry);
+ else
/*
* VFS has already handled the new dentry existence case,
* here, we just deal with "RENAME_NOREPLACE" as regular rename.
*/
- return f2fs_rename(idmap, old_dir, old_dentry,
+ err = f2fs_rename(idmap, old_dir, old_dentry,
new_dir, new_dentry, flags);
+
+ trace_f2fs_rename_end(old_dentry, new_dentry, flags, err);
+ return err;
}
static const char *f2fs_encrypted_get_link(struct dentry *dentry,
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 6c7f6a649d27..9b546fd21010 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -2751,11 +2751,11 @@ recover_xnid:
f2fs_update_inode_page(inode);
/* 3: update and set xattr node page dirty */
- if (page)
+ if (page) {
memcpy(F2FS_NODE(xpage), F2FS_NODE(page),
VALID_XATTR_BLOCK_SIZE);
-
- set_page_dirty(xpage);
+ set_page_dirty(xpage);
+ }
f2fs_put_page(xpage, 1);
return 0;
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index b56d0f1078a7..d0f24ccbd1ac 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -712,7 +712,16 @@ retry_dn:
*/
if (dest == NEW_ADDR) {
f2fs_truncate_data_blocks_range(&dn, 1);
- f2fs_reserve_new_block(&dn);
+ do {
+ err = f2fs_reserve_new_block(&dn);
+ if (err == -ENOSPC) {
+ f2fs_bug_on(sbi, 1);
+ break;
+ }
+ } while (err &&
+ IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
+ if (err)
+ goto err;
continue;
}
@@ -720,12 +729,14 @@ retry_dn:
if (f2fs_is_valid_blkaddr(sbi, dest, META_POR)) {
if (src == NULL_ADDR) {
- err = f2fs_reserve_new_block(&dn);
- while (err &&
- IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION))
+ do {
err = f2fs_reserve_new_block(&dn);
- /* We should not get -ENOSPC */
- f2fs_bug_on(sbi, err);
+ if (err == -ENOSPC) {
+ f2fs_bug_on(sbi, 1);
+ break;
+ }
+ } while (err &&
+ IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION));
if (err)
goto err;
}
@@ -906,6 +917,8 @@ skip:
if (!err && fix_curseg_write_pointer && !f2fs_readonly(sbi->sb) &&
f2fs_sb_has_blkzoned(sbi)) {
err = f2fs_fix_curseg_write_pointer(sbi);
+ if (!err)
+ err = f2fs_check_write_pointer(sbi);
ret = err;
}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 727d016318f9..4c8836ded90f 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1172,7 +1172,10 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->min_interval = dcc->min_discard_issue_time;
dpolicy->mid_interval = dcc->mid_discard_issue_time;
dpolicy->max_interval = dcc->max_discard_issue_time;
- dpolicy->io_aware = true;
+ if (dcc->discard_io_aware == DPOLICY_IO_AWARE_ENABLE)
+ dpolicy->io_aware = true;
+ else if (dcc->discard_io_aware == DPOLICY_IO_AWARE_DISABLE)
+ dpolicy->io_aware = false;
dpolicy->sync = false;
dpolicy->ordered = true;
if (utilization(sbi) > dcc->discard_urgent_util) {
@@ -1380,7 +1383,8 @@ static void __insert_discard_cmd(struct f2fs_sb_info *sbi,
p = &(*p)->rb_right;
leftmost = false;
} else {
- f2fs_bug_on(sbi, 1);
+ /* Let's skip to add, if exists */
+ return;
}
}
@@ -1883,9 +1887,8 @@ static int issue_discard_thread(void *data)
set_freezable();
do {
- wait_event_interruptible_timeout(*q,
- kthread_should_stop() || freezing(current) ||
- dcc->discard_wake,
+ wait_event_freezable_timeout(*q,
+ kthread_should_stop() || dcc->discard_wake,
msecs_to_jiffies(wait_ms));
if (sbi->gc_mode == GC_URGENT_HIGH ||
@@ -1903,8 +1906,6 @@ static int issue_discard_thread(void *data)
if (atomic_read(&dcc->queued_discard))
__wait_all_discard_cmd(sbi, NULL);
- if (try_to_freeze())
- continue;
if (f2fs_readonly(sbi->sb))
continue;
if (kthread_should_stop())
@@ -2274,6 +2275,7 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
dcc->discard_io_aware_gran = MAX_PLIST_NUM;
dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY;
+ dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE;
if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT)
dcc->discard_granularity = sbi->blocks_per_seg;
else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION)
@@ -2495,8 +2497,7 @@ void f2fs_invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
if (addr == NEW_ADDR || addr == COMPRESS_ADDR)
return;
- invalidate_mapping_pages(META_MAPPING(sbi), addr, addr);
- f2fs_invalidate_compress_page(sbi, addr);
+ f2fs_invalidate_internal_cache(sbi, addr);
/* add it into sit main buffer */
down_write(&sit_i->sentry_lock);
@@ -3557,11 +3558,8 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
reallocate:
f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr,
&fio->new_blkaddr, sum, type, fio);
- if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) {
- invalidate_mapping_pages(META_MAPPING(fio->sbi),
- fio->old_blkaddr, fio->old_blkaddr);
- f2fs_invalidate_compress_page(fio->sbi, fio->old_blkaddr);
- }
+ if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO)
+ f2fs_invalidate_internal_cache(fio->sbi, fio->old_blkaddr);
/* writeout dirty page into bdev */
f2fs_submit_page_write(fio);
@@ -3757,9 +3755,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
update_sit_entry(sbi, new_blkaddr, 1);
}
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) {
- invalidate_mapping_pages(META_MAPPING(sbi),
- old_blkaddr, old_blkaddr);
- f2fs_invalidate_compress_page(sbi, old_blkaddr);
+ f2fs_invalidate_internal_cache(sbi, old_blkaddr);
if (!from_gc)
update_segment_mtime(sbi, old_blkaddr, 0);
update_sit_entry(sbi, old_blkaddr, -1);
@@ -4865,99 +4861,56 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
struct f2fs_dev_info *fdev,
struct blk_zone *zone)
{
- unsigned int wp_segno, wp_blkoff, zone_secno, zone_segno, segno;
- block_t zone_block, wp_block, last_valid_block;
+ unsigned int zone_segno;
+ block_t zone_block, valid_block_cnt;
unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
- int i, s, b, ret;
- struct seg_entry *se;
+ int ret;
if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
return 0;
- wp_block = fdev->start_blk + (zone->wp >> log_sectors_per_block);
- wp_segno = GET_SEGNO(sbi, wp_block);
- wp_blkoff = wp_block - START_BLOCK(sbi, wp_segno);
zone_block = fdev->start_blk + (zone->start >> log_sectors_per_block);
zone_segno = GET_SEGNO(sbi, zone_block);
- zone_secno = GET_SEC_FROM_SEG(sbi, zone_segno);
-
- if (zone_segno >= MAIN_SEGS(sbi))
- return 0;
/*
* Skip check of zones cursegs point to, since
* fix_curseg_write_pointer() checks them.
*/
- for (i = 0; i < NO_CHECK_TYPE; i++)
- if (zone_secno == GET_SEC_FROM_SEG(sbi,
- CURSEG_I(sbi, i)->segno))
- return 0;
+ if (zone_segno >= MAIN_SEGS(sbi) ||
+ IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, zone_segno)))
+ return 0;
/*
- * Get last valid block of the zone.
+ * Get # of valid block of the zone.
*/
- last_valid_block = zone_block - 1;
- for (s = sbi->segs_per_sec - 1; s >= 0; s--) {
- segno = zone_segno + s;
- se = get_seg_entry(sbi, segno);
- for (b = sbi->blocks_per_seg - 1; b >= 0; b--)
- if (f2fs_test_bit(b, se->cur_valid_map)) {
- last_valid_block = START_BLOCK(sbi, segno) + b;
- break;
- }
- if (last_valid_block >= zone_block)
- break;
- }
+ valid_block_cnt = get_valid_blocks(sbi, zone_segno, true);
- /*
- * When safely unmounted in the previous mount, we can trust write
- * pointers. Otherwise, finish zones.
- */
- if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
- /*
- * The write pointer matches with the valid blocks or
- * already points to the end of the zone.
- */
- if ((last_valid_block + 1 == wp_block) ||
- (zone->wp == zone->start + zone->len))
- return 0;
- }
+ if ((!valid_block_cnt && zone->cond == BLK_ZONE_COND_EMPTY) ||
+ (valid_block_cnt && zone->cond == BLK_ZONE_COND_FULL))
+ return 0;
- if (last_valid_block + 1 == zone_block) {
- if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
- /*
- * If there is no valid block in the zone and if write
- * pointer is not at zone start, reset the write
- * pointer.
- */
- f2fs_notice(sbi,
- "Zone without valid block has non-zero write "
- "pointer. Reset the write pointer: wp[0x%x,0x%x]",
- wp_segno, wp_blkoff);
- }
+ if (!valid_block_cnt) {
+ f2fs_notice(sbi, "Zone without valid block has non-zero write "
+ "pointer. Reset the write pointer: cond[0x%x]",
+ zone->cond);
ret = __f2fs_issue_discard_zone(sbi, fdev->bdev, zone_block,
zone->len >> log_sectors_per_block);
if (ret)
f2fs_err(sbi, "Discard zone failed: %s (errno=%d)",
fdev->path, ret);
-
return ret;
}
- if (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) {
- /*
- * If there are valid blocks and the write pointer doesn't match
- * with them, we need to report the inconsistency and fill
- * the zone till the end to close the zone. This inconsistency
- * does not cause write error because the zone will not be
- * selected for write operation until it get discarded.
- */
- f2fs_notice(sbi, "Valid blocks are not aligned with write "
- "pointer: valid block[0x%x,0x%x] wp[0x%x,0x%x]",
- GET_SEGNO(sbi, last_valid_block),
- GET_BLKOFF_FROM_SEG0(sbi, last_valid_block),
- wp_segno, wp_blkoff);
- }
+ /*
+ * If there are valid blocks and the write pointer doesn't match
+ * with them, we need to report the inconsistency and fill
+ * the zone till the end to close the zone. This inconsistency
+ * does not cause write error because the zone will not be
+ * selected for write operation until it get discarded.
+ */
+ f2fs_notice(sbi, "Valid blocks are not aligned with write "
+ "pointer: valid block[0x%x,0x%x] cond[0x%x]",
+ zone_segno, valid_block_cnt, zone->cond);
ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
zone->start, zone->len, GFP_NOFS);
@@ -5048,15 +5001,18 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type)
f2fs_notice(sbi, "Unaligned curseg[%d] with write pointer: "
"curseg[0x%x,0x%x] wp[0x%x,0x%x]", type, cs->segno,
cs->next_blkoff, wp_segno, wp_blkoff);
- } else {
- f2fs_notice(sbi, "Not successfully unmounted in the previous "
- "mount");
}
- f2fs_notice(sbi, "Assign new section to curseg[%d]: "
- "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff);
+ /* Allocate a new section if it's not new. */
+ if (cs->next_blkoff) {
+ unsigned int old_segno = cs->segno, old_blkoff = cs->next_blkoff;
- f2fs_allocate_new_section(sbi, type, true);
+ f2fs_allocate_new_section(sbi, type, true);
+ f2fs_notice(sbi, "Assign new section to curseg[%d]: "
+ "[0x%x,0x%x] -> [0x%x,0x%x]",
+ type, old_segno, old_blkoff,
+ cs->segno, cs->next_blkoff);
+ }
/* check consistency of the zone curseg pointed to */
if (check_zone_write_pointer(sbi, zbd, &zone))
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 033af907c3b1..d45ab0992ae5 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1422,11 +1422,6 @@ default_check:
}
}
- if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) {
- f2fs_err(sbi, "LFS is not compatible with checkpoint=disable");
- return -EINVAL;
- }
-
if (test_opt(sbi, ATGC) && f2fs_lfs_mode(sbi)) {
f2fs_err(sbi, "LFS is not compatible with ATGC");
return -EINVAL;
@@ -1717,12 +1712,10 @@ static void f2fs_put_super(struct super_block *sb)
kvfree(sbi->ckpt);
- sb->s_fs_info = NULL;
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi->raw_super);
- destroy_device_list(sbi);
f2fs_destroy_page_array_cache(sbi);
f2fs_destroy_xattr_caches(sbi);
mempool_destroy(sbi->write_io_dummy);
@@ -1738,7 +1731,6 @@ static void f2fs_put_super(struct super_block *sb)
#if IS_ENABLED(CONFIG_UNICODE)
utf8_unload(sb->s_encoding);
#endif
- kfree(sbi);
}
int f2fs_sync_fs(struct super_block *sb, int sync)
@@ -3364,6 +3356,14 @@ loff_t max_file_blocks(struct inode *inode)
leaf_count *= NIDS_PER_BLOCK;
result += leaf_count;
+ /*
+ * For compatibility with FSCRYPT_POLICY_FLAG_IV_INO_LBLK_{64,32} with
+ * a 4K crypto data unit, we must restrict the max filesize to what can
+ * fit within U32_MAX + 1 data units.
+ */
+
+ result = min(result, (((loff_t)U32_MAX + 1) * 4096) >> F2FS_BLKSIZE_BITS);
+
return result;
}
@@ -4282,24 +4282,21 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi)
sbi->aligned_blksize = false;
#ifdef CONFIG_BLK_DEV_ZONED
- if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM &&
- !f2fs_sb_has_blkzoned(sbi)) {
- f2fs_err(sbi, "Zoned block device feature not enabled");
- return -EINVAL;
- }
- if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) {
+ if (bdev_is_zoned(FDEV(i).bdev)) {
+ if (!f2fs_sb_has_blkzoned(sbi)) {
+ f2fs_err(sbi, "Zoned block device feature not enabled");
+ return -EINVAL;
+ }
if (init_blkz_info(sbi, i)) {
f2fs_err(sbi, "Failed to initialize F2FS blkzone information");
return -EINVAL;
}
if (max_devices == 1)
break;
- f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)",
+ f2fs_info(sbi, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: Host-managed)",
i, FDEV(i).path,
FDEV(i).total_segments,
- FDEV(i).start_blk, FDEV(i).end_blk,
- bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ?
- "Host-aware" : "Host-managed");
+ FDEV(i).start_blk, FDEV(i).end_blk);
continue;
}
#endif
@@ -4746,7 +4743,7 @@ try_onemore:
#ifdef CONFIG_QUOTA
f2fs_recover_quota_end(sbi, quota_enabled);
#endif
-
+reset_checkpoint:
/*
* If the f2fs is not readonly and fsync data recovery succeeds,
* check zoned block devices' write pointer consistency.
@@ -4757,7 +4754,6 @@ try_onemore:
goto free_meta;
}
-reset_checkpoint:
f2fs_init_inmem_curseg(sbi);
/* f2fs_recover_fsync_data() cleared this already */
@@ -4884,6 +4880,7 @@ free_sbi:
if (sbi->s_chksum_driver)
crypto_free_shash(sbi->s_chksum_driver);
kfree(sbi);
+ sb->s_fs_info = NULL;
/* give only one another chance */
if (retry_cnt > 0 && skip_recovery) {
@@ -4902,9 +4899,9 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
static void kill_f2fs_super(struct super_block *sb)
{
- if (sb->s_root) {
- struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ struct f2fs_sb_info *sbi = F2FS_SB(sb);
+ if (sb->s_root) {
set_sbi_flag(sbi, SBI_IS_CLOSE);
f2fs_stop_gc_thread(sbi);
f2fs_stop_discard_thread(sbi);
@@ -4931,6 +4928,12 @@ static void kill_f2fs_super(struct super_block *sb)
sb->s_flags &= ~SB_RDONLY;
}
kill_block_super(sb);
+ /* Release block devices last, after fscrypt_destroy_keyring(). */
+ if (sbi) {
+ destroy_device_list(sbi);
+ kfree(sbi);
+ sb->s_fs_info = NULL;
+ }
}
static struct file_system_type f2fs_fs_type = {
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 417fae96890f..a7ec55c7bb20 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -143,6 +143,33 @@ static ssize_t pending_discard_show(struct f2fs_attr *a,
&SM_I(sbi)->dcc_info->discard_cmd_cnt));
}
+static ssize_t issued_discard_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ if (!SM_I(sbi)->dcc_info)
+ return -EINVAL;
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read(
+ &SM_I(sbi)->dcc_info->issued_discard));
+}
+
+static ssize_t queued_discard_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ if (!SM_I(sbi)->dcc_info)
+ return -EINVAL;
+ return sysfs_emit(buf, "%llu\n", (unsigned long long)atomic_read(
+ &SM_I(sbi)->dcc_info->queued_discard));
+}
+
+static ssize_t undiscard_blks_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ if (!SM_I(sbi)->dcc_info)
+ return -EINVAL;
+ return sysfs_emit(buf, "%u\n",
+ SM_I(sbi)->dcc_info->undiscard_blks);
+}
+
static ssize_t gc_mode_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
{
@@ -516,6 +543,13 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "discard_io_aware")) {
+ if (t >= DPOLICY_IO_AWARE_MAX)
+ return -EINVAL;
+ *ui = t;
+ return count;
+ }
+
if (!strcmp(a->attr.name, "migration_granularity")) {
if (t == 0 || t > sbi->segs_per_sec)
return -EINVAL;
@@ -734,6 +768,13 @@ out:
return count;
}
+ if (!strcmp(a->attr.name, "dir_level")) {
+ if (t > MAX_DIR_HASH_DEPTH)
+ return -EINVAL;
+ sbi->dir_level = t;
+ return count;
+ }
+
*ui = (unsigned int)t;
return count;
@@ -926,6 +967,7 @@ DCC_INFO_GENERAL_RW_ATTR(discard_io_aware_gran);
DCC_INFO_GENERAL_RW_ATTR(discard_urgent_util);
DCC_INFO_GENERAL_RW_ATTR(discard_granularity);
DCC_INFO_GENERAL_RW_ATTR(max_ordered_discard);
+DCC_INFO_GENERAL_RW_ATTR(discard_io_aware);
/* NM_INFO ATTR */
NM_INFO_RW_ATTR(max_roll_forward_node_blocks, max_rf_node_blocks);
@@ -1074,6 +1116,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(discard_urgent_util),
ATTR_LIST(discard_granularity),
ATTR_LIST(max_ordered_discard),
+ ATTR_LIST(discard_io_aware),
ATTR_LIST(pending_discard),
ATTR_LIST(gc_mode),
ATTR_LIST(ipu_policy),
@@ -1197,9 +1240,16 @@ ATTRIBUTE_GROUPS(f2fs_feat);
F2FS_GENERAL_RO_ATTR(sb_status);
F2FS_GENERAL_RO_ATTR(cp_status);
+F2FS_GENERAL_RO_ATTR(issued_discard);
+F2FS_GENERAL_RO_ATTR(queued_discard);
+F2FS_GENERAL_RO_ATTR(undiscard_blks);
+
static struct attribute *f2fs_stat_attrs[] = {
ATTR_LIST(sb_status),
ATTR_LIST(cp_status),
+ ATTR_LIST(issued_discard),
+ ATTR_LIST(queued_discard),
+ ATTR_LIST(undiscard_blks),
NULL,
};
ATTRIBUTE_GROUPS(f2fs_stat);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 47e88b4d4e7d..f290fe9327c4 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -660,11 +660,14 @@ retry:
here = __find_xattr(base_addr, last_base_addr, NULL, index, len, name);
if (!here) {
if (!F2FS_I(inode)->i_xattr_nid) {
+ error = f2fs_recover_xattr_data(inode, NULL);
f2fs_notice(F2FS_I_SB(inode),
- "recover xattr in inode (%lu)", inode->i_ino);
- f2fs_recover_xattr_data(inode, NULL);
- kfree(base_addr);
- goto retry;
+ "recover xattr in inode (%lu), error(%d)",
+ inode->i_ino, error);
+ if (!error) {
+ kfree(base_addr);
+ goto retry;
+ }
}
f2fs_err(F2FS_I_SB(inode), "set inode (%lu) has corrupted xattr",
inode->i_ino);
@@ -754,6 +757,12 @@ retry:
memcpy(pval, value, size);
last->e_value_size = cpu_to_le16(size);
new_hsize += newsize;
+ /*
+ * Explicitly add the null terminator. The unused xattr space
+ * is supposed to always be zeroed, which would make this
+ * unnecessary, but don't depend on that.
+ */
+ *(u32 *)((u8 *)last + newsize) = 0;
}
error = write_all_xattrs(inode, new_hsize, base_addr, ipage);
diff --git a/fs/file.c b/fs/file.c
index 5fb0b146e79e..3b683b9101d8 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -629,19 +629,23 @@ void fd_install(unsigned int fd, struct file *file)
EXPORT_SYMBOL(fd_install);
/**
- * pick_file - return file associatd with fd
+ * file_close_fd_locked - return file associated with fd
* @files: file struct to retrieve file from
* @fd: file descriptor to retrieve file for
*
+ * Doesn't take a separate reference count.
+ *
* Context: files_lock must be held.
*
* Returns: The file associated with @fd (NULL if @fd is not open)
*/
-static struct file *pick_file(struct files_struct *files, unsigned fd)
+struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
{
struct fdtable *fdt = files_fdtable(files);
struct file *file;
+ lockdep_assert_held(&files->file_lock);
+
if (fd >= fdt->max_fds)
return NULL;
@@ -660,7 +664,7 @@ int close_fd(unsigned fd)
struct file *file;
spin_lock(&files->file_lock);
- file = pick_file(files, fd);
+ file = file_close_fd_locked(files, fd);
spin_unlock(&files->file_lock);
if (!file)
return -EBADF;
@@ -707,7 +711,7 @@ static inline void __range_close(struct files_struct *files, unsigned int fd,
max_fd = min(max_fd, n);
for (; fd <= max_fd; fd++) {
- file = pick_file(files, fd);
+ file = file_close_fd_locked(files, fd);
if (file) {
spin_unlock(&files->file_lock);
filp_close(file, files);
@@ -795,26 +799,21 @@ int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
return 0;
}
-/*
- * See close_fd_get_file() below, this variant assumes current->files->file_lock
- * is held.
- */
-struct file *__close_fd_get_file(unsigned int fd)
-{
- return pick_file(current->files, fd);
-}
-
-/*
- * variant of close_fd that gets a ref on the file for later fput.
- * The caller must ensure that filp_close() called on the file.
+/**
+ * file_close_fd - return file associated with fd
+ * @fd: file descriptor to retrieve file for
+ *
+ * Doesn't take a separate reference count.
+ *
+ * Returns: The file associated with @fd (NULL if @fd is not open)
*/
-struct file *close_fd_get_file(unsigned int fd)
+struct file *file_close_fd(unsigned int fd)
{
struct files_struct *files = current->files;
struct file *file;
spin_lock(&files->file_lock);
- file = pick_file(files, fd);
+ file = file_close_fd_locked(files, fd);
spin_unlock(&files->file_lock);
return file;
@@ -959,31 +958,45 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
struct file *file;
struct fdtable *fdt = rcu_dereference_raw(files->fdt);
struct file __rcu **fdentry;
+ unsigned long nospec_mask;
- if (unlikely(fd >= fdt->max_fds))
- return NULL;
-
- fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
+ /* Mask is a 0 for invalid fd's, ~0 for valid ones */
+ nospec_mask = array_index_mask_nospec(fd, fdt->max_fds);
/*
- * Ok, we have a file pointer. However, because we do
- * this all locklessly under RCU, we may be racing with
- * that file being closed.
- *
- * Such a race can take two forms:
- *
- * (a) the file ref already went down to zero and the
- * file hasn't been reused yet or the file count
- * isn't zero but the file has already been reused.
+ * fdentry points to the 'fd' offset, or fdt->fd[0].
+ * Loading from fdt->fd[0] is always safe, because the
+ * array always exists.
*/
- file = __get_file_rcu(fdentry);
+ fdentry = fdt->fd + (fd & nospec_mask);
+
+ /* Do the load, then mask any invalid result */
+ file = rcu_dereference_raw(*fdentry);
+ file = (void *)(nospec_mask & (unsigned long)file);
if (unlikely(!file))
return NULL;
- if (unlikely(IS_ERR(file)))
+ /*
+ * Ok, we have a file pointer that was valid at
+ * some point, but it might have become stale since.
+ *
+ * We need to confirm it by incrementing the refcount
+ * and then check the lookup again.
+ *
+ * atomic_long_inc_not_zero() gives us a full memory
+ * barrier. We only really need an 'acquire' one to
+ * protect the loads below, but we don't have that.
+ */
+ if (unlikely(!atomic_long_inc_not_zero(&file->f_count)))
continue;
/*
+ * Such a race can take two forms:
+ *
+ * (a) the file ref already went down to zero and the
+ * file hasn't been reused yet or the file count
+ * isn't zero but the file has already been reused.
+ *
* (b) the file table entry has changed under us.
* Note that we don't need to re-check the 'fdt->fd'
* pointer having changed, because it always goes
@@ -991,7 +1004,8 @@ static inline struct file *__fget_files_rcu(struct files_struct *files,
*
* If so, we need to put our ref and try again.
*/
- if (unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
+ if (unlikely(file != rcu_dereference_raw(*fdentry)) ||
+ unlikely(rcu_dereference_raw(files->fdt) != fdt)) {
fput(file);
continue;
}
@@ -1128,13 +1142,13 @@ static unsigned long __fget_light(unsigned int fd, fmode_t mask)
* atomic_read_acquire() pairs with atomic_dec_and_test() in
* put_files_struct().
*/
- if (atomic_read_acquire(&files->count) == 1) {
+ if (likely(atomic_read_acquire(&files->count) == 1)) {
file = files_lookup_fd_raw(files, fd);
if (!file || unlikely(file->f_mode & mask))
return 0;
return (unsigned long)file;
} else {
- file = __fget(fd, mask);
+ file = __fget_files(files, fd, mask);
if (!file)
return 0;
return FDPUT_FPUT | (unsigned long)file;
@@ -1282,7 +1296,7 @@ out_unlock:
}
/**
- * __receive_fd() - Install received file into file descriptor table
+ * receive_fd() - Install received file into file descriptor table
* @file: struct file that was received from another process
* @ufd: __user pointer to write new fd number to
* @o_flags: the O_* flags to apply to the new fd entry
@@ -1296,7 +1310,7 @@ out_unlock:
*
* Returns newly install fd or -ve on error.
*/
-int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
+int receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
{
int new_fd;
int error;
@@ -1321,6 +1335,7 @@ int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
__receive_sock(file);
return new_fd;
}
+EXPORT_SYMBOL_GPL(receive_fd);
int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
@@ -1336,12 +1351,6 @@ int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
return new_fd;
}
-int receive_fd(struct file *file, unsigned int o_flags)
-{
- return __receive_fd(file, NULL, o_flags);
-}
-EXPORT_SYMBOL_GPL(receive_fd);
-
static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
int err = -EBADF;
diff --git a/fs/file_table.c b/fs/file_table.c
index de4a2915bfd4..b991f90571b4 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -75,18 +75,6 @@ static inline void file_free(struct file *f)
}
}
-void release_empty_file(struct file *f)
-{
- WARN_ON_ONCE(f->f_mode & (FMODE_BACKING | FMODE_OPENED));
- if (atomic_long_dec_and_test(&f->f_count)) {
- security_file_free(f);
- put_cred(f->f_cred);
- if (likely(!(f->f_mode & FMODE_NOACCOUNT)))
- percpu_counter_dec(&nr_files);
- kmem_cache_free(filp_cachep, f);
- }
-}
-
/*
* Return the total number of open files in the system
*/
@@ -142,7 +130,6 @@ static struct ctl_table fs_stat_sysctls[] = {
.extra1 = &sysctl_nr_open_min,
.extra2 = &sysctl_nr_open_max,
},
- { }
};
static int __init init_fs_stat_sysctls(void)
@@ -329,9 +316,6 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
const char *name, int flags,
const struct file_operations *fops)
{
- static const struct dentry_operations anon_ops = {
- .d_dname = simple_dname
- };
struct qstr this = QSTR_INIT(name, strlen(name));
struct path path;
struct file *file;
@@ -339,8 +323,6 @@ struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
if (!path.dentry)
return ERR_PTR(-ENOMEM);
- if (!mnt->mnt_sb->s_d_op)
- d_set_d_op(path.dentry, &anon_ops);
path.mnt = mntget(mnt);
d_instantiate(path.dentry, inode);
file = alloc_file(&path, flags, fops);
@@ -419,7 +401,7 @@ static void delayed_fput(struct work_struct *unused)
static void ____fput(struct callback_head *work)
{
- __fput(container_of(work, struct file, f_rcuhead));
+ __fput(container_of(work, struct file, f_task_work));
}
/*
@@ -445,9 +427,13 @@ void fput(struct file *file)
if (atomic_long_dec_and_test(&file->f_count)) {
struct task_struct *task = current;
+ if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) {
+ file_free(file);
+ return;
+ }
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
- init_task_work(&file->f_rcuhead, ____fput);
- if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME))
+ init_task_work(&file->f_task_work, ____fput);
+ if (!task_work_add(task, &file->f_task_work, TWA_RESUME))
return;
/*
* After this task has run exit_task_work(),
diff --git a/fs/freevxfs/vxfs_bmap.c b/fs/freevxfs/vxfs_bmap.c
index de2a5bccb930..26d367e3668d 100644
--- a/fs/freevxfs/vxfs_bmap.c
+++ b/fs/freevxfs/vxfs_bmap.c
@@ -29,7 +29,7 @@ vxfs_typdump(struct vxfs_typed *typ)
/**
* vxfs_bmap_ext4 - do bmap for ext4 extents
* @ip: pointer to the inode we do bmap for
- * @iblock: logical block.
+ * @bn: logical block.
*
* Description:
* vxfs_bmap_ext4 performs the bmap operation for inodes with
@@ -97,7 +97,7 @@ fail_buf:
* vxfs_bmap_indir reads a &struct vxfs_typed at @indir
* and performs the type-defined action.
*
- * Return Value:
+ * Returns:
* The physical block number on success, else Zero.
*
* Note:
@@ -179,7 +179,7 @@ out:
* Description:
* Performs the bmap operation for typed extents.
*
- * Return Value:
+ * Returns:
* The physical block number on success, else Zero.
*/
static daddr_t
@@ -243,7 +243,7 @@ vxfs_bmap_typed(struct inode *ip, long iblock)
* vxfs_bmap1 perfoms a logical to physical block mapping
* for vxfs-internal purposes.
*
- * Return Value:
+ * Returns:
* The physical block number on success, else Zero.
*/
daddr_t
diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c
index 9b49ec36e667..ed51fcd34757 100644
--- a/fs/freevxfs/vxfs_immed.c
+++ b/fs/freevxfs/vxfs_immed.c
@@ -15,7 +15,7 @@
/**
* vxfs_immed_read_folio - read part of an immed inode into pagecache
- * @file: file context (unused)
+ * @fp: file context (unused)
* @folio: folio to fill in.
*
* Description:
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index f04ba2ed1e1a..1b0bca8b4cc6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -177,8 +177,7 @@ vxfs_lookup(struct inode *dip, struct dentry *dp, unsigned int flags)
/**
* vxfs_readdir - read a directory
* @fp: the directory to read
- * @retp: return buffer
- * @filler: filldir callback
+ * @ctx: dir_context for filldir/readdir
*
* Description:
* vxfs_readdir fills @retp with directory entries from @fp
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1767493dffda..3d84fcc471c6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1675,11 +1675,11 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
inode->i_state |= I_DIRTY_PAGES;
- else if (unlikely(inode->i_state & I_PINNING_FSCACHE_WB)) {
+ else if (unlikely(inode->i_state & I_PINNING_NETFS_WB)) {
if (!(inode->i_state & I_DIRTY_PAGES)) {
- inode->i_state &= ~I_PINNING_FSCACHE_WB;
- wbc->unpinned_fscache_wb = true;
- dirty |= I_PINNING_FSCACHE_WB; /* Cause write_inode */
+ inode->i_state &= ~I_PINNING_NETFS_WB;
+ wbc->unpinned_netfs_wb = true;
+ dirty |= I_PINNING_NETFS_WB; /* Cause write_inode */
}
}
@@ -1691,7 +1691,7 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
if (ret == 0)
ret = err;
}
- wbc->unpinned_fscache_wb = false;
+ wbc->unpinned_netfs_wb = false;
trace_writeback_single_inode(inode, wbc, nr_to_write);
return ret;
}
diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig
deleted file mode 100644
index b313a978ae0a..000000000000
--- a/fs/fscache/Kconfig
+++ /dev/null
@@ -1,40 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0-only
-
-config FSCACHE
- tristate "General filesystem local caching manager"
- select NETFS_SUPPORT
- help
- This option enables a generic filesystem caching manager that can be
- used by various network and other filesystems to cache data locally.
- Different sorts of caches can be plugged in, depending on the
- resources available.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
-
-config FSCACHE_STATS
- bool "Gather statistical information on local caching"
- depends on FSCACHE && PROC_FS
- select NETFS_STATS
- help
- This option causes statistical information to be gathered on local
- caching and exported through file:
-
- /proc/fs/fscache/stats
-
- The gathering of statistics adds a certain amount of overhead to
- execution as there are a quite a few stats gathered, and on a
- multi-CPU system these may be on cachelines that keep bouncing
- between CPUs. On the other hand, the stats are very useful for
- debugging purposes. Saying 'Y' here is recommended.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
-
-config FSCACHE_DEBUG
- bool "Debug FS-Cache"
- depends on FSCACHE
- help
- This permits debugging to be dynamically enabled in the local caching
- management module. If this is set, the debugging output may be
- enabled by setting bits in /sys/modules/fscache/parameter/debug.
-
- See Documentation/filesystems/caching/fscache.rst for more information.
diff --git a/fs/fscache/Makefile b/fs/fscache/Makefile
deleted file mode 100644
index afb090ea16c4..000000000000
--- a/fs/fscache/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-# SPDX-License-Identifier: GPL-2.0
-#
-# Makefile for general filesystem caching code
-#
-
-fscache-y := \
- cache.o \
- cookie.o \
- io.o \
- main.o \
- volume.o
-
-fscache-$(CONFIG_PROC_FS) += proc.o
-fscache-$(CONFIG_FSCACHE_STATS) += stats.o
-
-obj-$(CONFIG_FSCACHE) := fscache.o
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
deleted file mode 100644
index 1336f517e9b1..000000000000
--- a/fs/fscache/internal.h
+++ /dev/null
@@ -1,277 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
-/* Internal definitions for FS-Cache
- *
- * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- */
-
-#ifdef pr_fmt
-#undef pr_fmt
-#endif
-
-#define pr_fmt(fmt) "FS-Cache: " fmt
-
-#include <linux/slab.h>
-#include <linux/fscache-cache.h>
-#include <trace/events/fscache.h>
-#include <linux/sched.h>
-#include <linux/seq_file.h>
-
-/*
- * cache.c
- */
-#ifdef CONFIG_PROC_FS
-extern const struct seq_operations fscache_caches_seq_ops;
-#endif
-bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
-void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
-struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache);
-void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where);
-
-static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache)
-{
- return smp_load_acquire(&cache->state);
-}
-
-static inline bool fscache_cache_is_live(const struct fscache_cache *cache)
-{
- return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE;
-}
-
-static inline void fscache_set_cache_state(struct fscache_cache *cache,
- enum fscache_cache_state new_state)
-{
- smp_store_release(&cache->state, new_state);
-
-}
-
-static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache,
- enum fscache_cache_state old_state,
- enum fscache_cache_state new_state)
-{
- return try_cmpxchg_release(&cache->state, &old_state, new_state);
-}
-
-/*
- * cookie.c
- */
-extern struct kmem_cache *fscache_cookie_jar;
-#ifdef CONFIG_PROC_FS
-extern const struct seq_operations fscache_cookies_seq_ops;
-#endif
-extern struct timer_list fscache_cookie_lru_timer;
-
-extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
-extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
- enum fscache_access_trace why);
-
-static inline void fscache_see_cookie(struct fscache_cookie *cookie,
- enum fscache_cookie_trace where)
-{
- trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
- where);
-}
-
-/*
- * main.c
- */
-extern unsigned fscache_debug;
-
-extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len);
-
-/*
- * proc.c
- */
-#ifdef CONFIG_PROC_FS
-extern int __init fscache_proc_init(void);
-extern void fscache_proc_cleanup(void);
-#else
-#define fscache_proc_init() (0)
-#define fscache_proc_cleanup() do {} while (0)
-#endif
-
-/*
- * stats.c
- */
-#ifdef CONFIG_FSCACHE_STATS
-extern atomic_t fscache_n_volumes;
-extern atomic_t fscache_n_volumes_collision;
-extern atomic_t fscache_n_volumes_nomem;
-extern atomic_t fscache_n_cookies;
-extern atomic_t fscache_n_cookies_lru;
-extern atomic_t fscache_n_cookies_lru_expired;
-extern atomic_t fscache_n_cookies_lru_removed;
-extern atomic_t fscache_n_cookies_lru_dropped;
-
-extern atomic_t fscache_n_acquires;
-extern atomic_t fscache_n_acquires_ok;
-extern atomic_t fscache_n_acquires_oom;
-
-extern atomic_t fscache_n_invalidates;
-
-extern atomic_t fscache_n_relinquishes;
-extern atomic_t fscache_n_relinquishes_retire;
-extern atomic_t fscache_n_relinquishes_dropped;
-
-extern atomic_t fscache_n_resizes;
-extern atomic_t fscache_n_resizes_null;
-
-static inline void fscache_stat(atomic_t *stat)
-{
- atomic_inc(stat);
-}
-
-static inline void fscache_stat_d(atomic_t *stat)
-{
- atomic_dec(stat);
-}
-
-#define __fscache_stat(stat) (stat)
-
-int fscache_stats_show(struct seq_file *m, void *v);
-#else
-
-#define __fscache_stat(stat) (NULL)
-#define fscache_stat(stat) do {} while (0)
-#define fscache_stat_d(stat) do {} while (0)
-#endif
-
-/*
- * volume.c
- */
-#ifdef CONFIG_PROC_FS
-extern const struct seq_operations fscache_volumes_seq_ops;
-#endif
-
-struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
- enum fscache_volume_trace where);
-void fscache_put_volume(struct fscache_volume *volume,
- enum fscache_volume_trace where);
-bool fscache_begin_volume_access(struct fscache_volume *volume,
- struct fscache_cookie *cookie,
- enum fscache_access_trace why);
-void fscache_create_volume(struct fscache_volume *volume, bool wait);
-
-
-/*****************************************************************************/
-/*
- * debug tracing
- */
-#define dbgprintk(FMT, ...) \
- printk("[%-6.6s] "FMT"\n", current->comm, ##__VA_ARGS__)
-
-#define kenter(FMT, ...) dbgprintk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define kleave(FMT, ...) dbgprintk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define kdebug(FMT, ...) dbgprintk(FMT, ##__VA_ARGS__)
-
-#define kjournal(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
-
-#ifdef __KDEBUG
-#define _enter(FMT, ...) kenter(FMT, ##__VA_ARGS__)
-#define _leave(FMT, ...) kleave(FMT, ##__VA_ARGS__)
-#define _debug(FMT, ...) kdebug(FMT, ##__VA_ARGS__)
-
-#elif defined(CONFIG_FSCACHE_DEBUG)
-#define _enter(FMT, ...) \
-do { \
- if (__do_kdebug(ENTER)) \
- kenter(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#define _leave(FMT, ...) \
-do { \
- if (__do_kdebug(LEAVE)) \
- kleave(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#define _debug(FMT, ...) \
-do { \
- if (__do_kdebug(DEBUG)) \
- kdebug(FMT, ##__VA_ARGS__); \
-} while (0)
-
-#else
-#define _enter(FMT, ...) no_printk("==> %s("FMT")", __func__, ##__VA_ARGS__)
-#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
-#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
-#endif
-
-/*
- * determine whether a particular optional debugging point should be logged
- * - we need to go through three steps to persuade cpp to correctly join the
- * shorthand in FSCACHE_DEBUG_LEVEL with its prefix
- */
-#define ____do_kdebug(LEVEL, POINT) \
- unlikely((fscache_debug & \
- (FSCACHE_POINT_##POINT << (FSCACHE_DEBUG_ ## LEVEL * 3))))
-#define ___do_kdebug(LEVEL, POINT) \
- ____do_kdebug(LEVEL, POINT)
-#define __do_kdebug(POINT) \
- ___do_kdebug(FSCACHE_DEBUG_LEVEL, POINT)
-
-#define FSCACHE_DEBUG_CACHE 0
-#define FSCACHE_DEBUG_COOKIE 1
-#define FSCACHE_DEBUG_OBJECT 2
-#define FSCACHE_DEBUG_OPERATION 3
-
-#define FSCACHE_POINT_ENTER 1
-#define FSCACHE_POINT_LEAVE 2
-#define FSCACHE_POINT_DEBUG 4
-
-#ifndef FSCACHE_DEBUG_LEVEL
-#define FSCACHE_DEBUG_LEVEL CACHE
-#endif
-
-/*
- * assertions
- */
-#if 1 /* defined(__KDEBUGALL) */
-
-#define ASSERT(X) \
-do { \
- if (unlikely(!(X))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- BUG(); \
- } \
-} while (0)
-
-#define ASSERTCMP(X, OP, Y) \
-do { \
- if (unlikely(!((X) OP (Y)))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- pr_err("%lx " #OP " %lx is false\n", \
- (unsigned long)(X), (unsigned long)(Y)); \
- BUG(); \
- } \
-} while (0)
-
-#define ASSERTIF(C, X) \
-do { \
- if (unlikely((C) && !(X))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- BUG(); \
- } \
-} while (0)
-
-#define ASSERTIFCMP(C, X, OP, Y) \
-do { \
- if (unlikely((C) && !((X) OP (Y)))) { \
- pr_err("\n"); \
- pr_err("Assertion failed\n"); \
- pr_err("%lx " #OP " %lx is false\n", \
- (unsigned long)(X), (unsigned long)(Y)); \
- BUG(); \
- } \
-} while (0)
-
-#else
-
-#define ASSERT(X) do {} while (0)
-#define ASSERTCMP(X, OP, Y) do {} while (0)
-#define ASSERTIF(C, X) do {} while (0)
-#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
-
-#endif /* assert or not */
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
index 91e89e68177e..b6cad106c37e 100644
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -474,8 +474,7 @@ err:
static void cuse_fc_release(struct fuse_conn *fc)
{
- struct cuse_conn *cc = fc_to_cc(fc);
- kfree_rcu(cc, fc.rcu);
+ kfree(fc_to_cc(fc));
}
/**
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a660f1f21540..148a71b8b4d0 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -19,6 +19,7 @@
#include <linux/uio.h>
#include <linux/fs.h>
#include <linux/filelock.h>
+#include <linux/splice.h>
static int fuse_send_open(struct fuse_mount *fm, u64 nodeid,
unsigned int open_flags, int opcode,
@@ -3195,8 +3196,8 @@ static ssize_t fuse_copy_file_range(struct file *src_file, loff_t src_off,
len, flags);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
- ret = generic_copy_file_range(src_file, src_off, dst_file,
- dst_off, len, flags);
+ ret = splice_copy_file_range(src_file, src_off, dst_file,
+ dst_off, len);
return ret;
}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 1df83eebda92..bcbe34488862 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -888,6 +888,7 @@ struct fuse_mount {
/* Entry on fc->mounts */
struct list_head fc_entry;
+ struct rcu_head rcu;
};
static inline struct fuse_mount *get_fuse_mount_super(struct super_block *sb)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2a6d44f91729..516ea2979a90 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -930,6 +930,14 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm,
}
EXPORT_SYMBOL_GPL(fuse_conn_init);
+static void delayed_release(struct rcu_head *p)
+{
+ struct fuse_conn *fc = container_of(p, struct fuse_conn, rcu);
+
+ put_user_ns(fc->user_ns);
+ fc->release(fc);
+}
+
void fuse_conn_put(struct fuse_conn *fc)
{
if (refcount_dec_and_test(&fc->count)) {
@@ -941,13 +949,12 @@ void fuse_conn_put(struct fuse_conn *fc)
if (fiq->ops->release)
fiq->ops->release(fiq);
put_pid_ns(fc->pid_ns);
- put_user_ns(fc->user_ns);
bucket = rcu_dereference_protected(fc->curr_bucket, 1);
if (bucket) {
WARN_ON(atomic_read(&bucket->count) != 1);
kfree(bucket);
}
- fc->release(fc);
+ call_rcu(&fc->rcu, delayed_release);
}
}
EXPORT_SYMBOL_GPL(fuse_conn_put);
@@ -1366,7 +1373,7 @@ EXPORT_SYMBOL_GPL(fuse_send_init);
void fuse_free_conn(struct fuse_conn *fc)
{
WARN_ON(!list_empty(&fc->devices));
- kfree_rcu(fc, rcu);
+ kfree(fc);
}
EXPORT_SYMBOL_GPL(fuse_free_conn);
@@ -1902,7 +1909,7 @@ static void fuse_sb_destroy(struct super_block *sb)
void fuse_mount_destroy(struct fuse_mount *fm)
{
fuse_conn_put(fm->fc);
- kfree(fm);
+ kfree_rcu(fm, rcu);
}
EXPORT_SYMBOL(fuse_mount_destroy);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 9611bfceda4b..974aca9c8ea8 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -82,11 +82,11 @@ static int gfs2_get_block_noalloc(struct inode *inode, sector_t lblock,
}
/**
- * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_page
+ * gfs2_write_jdata_folio - gfs2 jdata-specific version of block_write_full_folio
* @folio: The folio to write
* @wbc: The writeback control
*
- * This is the same as calling block_write_full_page, but it also
+ * This is the same as calling block_write_full_folio, but it also
* writes pages outside of i_size
*/
static int gfs2_write_jdata_folio(struct folio *folio,
@@ -108,7 +108,7 @@ static int gfs2_write_jdata_folio(struct folio *folio,
folio_size(folio));
return __block_write_full_folio(inode, folio, gfs2_get_block_noalloc,
- wbc, end_buffer_async_write);
+ wbc);
}
/**
@@ -403,18 +403,18 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
}
/**
- * stuffed_readpage - Fill in a Linux folio with stuffed file data
+ * stuffed_read_folio - Fill in a Linux folio with stuffed file data
* @ip: the inode
* @folio: the folio
*
* Returns: errno
*/
-static int stuffed_readpage(struct gfs2_inode *ip, struct folio *folio)
+static int stuffed_read_folio(struct gfs2_inode *ip, struct folio *folio)
{
- struct buffer_head *dibh;
- size_t i_size = i_size_read(&ip->i_inode);
- void *data;
- int error;
+ struct buffer_head *dibh = NULL;
+ size_t dsize = i_size_read(&ip->i_inode);
+ void *from = NULL;
+ int error = 0;
/*
* Due to the order of unstuffing files and ->fault(), we can be
@@ -422,22 +422,20 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct folio *folio)
* so we need to supply one here. It doesn't happen often.
*/
if (unlikely(folio->index)) {
- folio_zero_range(folio, 0, folio_size(folio));
- folio_mark_uptodate(folio);
- return 0;
+ dsize = 0;
+ } else {
+ error = gfs2_meta_inode_buffer(ip, &dibh);
+ if (error)
+ goto out;
+ from = dibh->b_data + sizeof(struct gfs2_dinode);
}
- error = gfs2_meta_inode_buffer(ip, &dibh);
- if (error)
- return error;
-
- data = dibh->b_data + sizeof(struct gfs2_dinode);
- memcpy_to_folio(folio, 0, data, i_size);
- folio_zero_range(folio, i_size, folio_size(folio) - i_size);
+ folio_fill_tail(folio, 0, from, dsize);
brelse(dibh);
- folio_mark_uptodate(folio);
+out:
+ folio_end_read(folio, error == 0);
- return 0;
+ return error;
}
/**
@@ -456,13 +454,12 @@ static int gfs2_read_folio(struct file *file, struct folio *folio)
(i_blocksize(inode) == PAGE_SIZE && !folio_buffers(folio))) {
error = iomap_read_folio(folio, &gfs2_iomap_ops);
} else if (gfs2_is_stuffed(ip)) {
- error = stuffed_readpage(ip, folio);
- folio_unlock(folio);
+ error = stuffed_read_folio(ip, folio);
} else {
error = mpage_read_folio(folio, gfs2_block_map);
}
- if (unlikely(gfs2_withdrawn(sdp)))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
return -EIO;
return error;
@@ -748,7 +745,7 @@ static const struct address_space_operations gfs2_aops = {
.bmap = gfs2_bmap,
.migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
static const struct address_space_operations gfs2_jdata_aops = {
@@ -761,7 +758,7 @@ static const struct address_space_operations gfs2_jdata_aops = {
.invalidate_folio = gfs2_invalidate_folio,
.release_folio = gfs2_release_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
void gfs2_set_aops(struct inode *inode)
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c
index cf40895233f5..d418d8b5367f 100644
--- a/fs/gfs2/export.c
+++ b/fs/gfs2/export.c
@@ -138,8 +138,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
return ERR_PTR(-ESTALE);
inode = gfs2_lookup_by_inum(sdp, inum->no_addr, inum->no_formal_ino,
GFS2_BLKST_DINODE);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
return d_obtain_alias(inode);
}
@@ -192,5 +190,6 @@ const struct export_operations gfs2_export_ops = {
.fh_to_parent = gfs2_fh_to_parent,
.get_name = gfs2_get_name,
.get_parent = gfs2_get_parent,
+ .flags = EXPORT_OP_ASYNC_LOCK,
};
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 4b66efc1a82a..992ca4effb50 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1442,7 +1442,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
if (!(fl->fl_flags & FL_POSIX))
return -ENOLCK;
- if (unlikely(gfs2_withdrawn(sdp))) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
if (fl->fl_type == F_UNLCK)
locks_lock_file_wait(file, fl);
return -EIO;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index d6bf1f8c25dc..34540f9d011c 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -156,7 +156,7 @@ static bool glock_blocked_by_withdraw(struct gfs2_glock *gl)
{
struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
- if (likely(!gfs2_withdrawn(sdp)))
+ if (!gfs2_withdrawing_or_withdrawn(sdp))
return false;
if (gl->gl_ops->go_flags & GLOF_NONDISK)
return false;
@@ -278,7 +278,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl)
GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders));
if (mapping) {
truncate_inode_pages_final(mapping);
- if (!gfs2_withdrawn(sdp))
+ if (!gfs2_withdrawing_or_withdrawn(sdp))
GLOCK_BUG_ON(gl, !mapping_empty(mapping));
}
trace_gfs2_glock_put(gl);
@@ -517,6 +517,23 @@ static inline struct gfs2_holder *find_first_waiter(const struct gfs2_glock *gl)
}
/**
+ * find_last_waiter - find the last gh that's waiting for the glock
+ * @gl: the glock
+ *
+ * This also is a fast way of finding out if there are any waiters.
+ */
+
+static inline struct gfs2_holder *find_last_waiter(const struct gfs2_glock *gl)
+{
+ struct gfs2_holder *gh;
+
+ if (list_empty(&gl->gl_holders))
+ return NULL;
+ gh = list_last_entry(&gl->gl_holders, struct gfs2_holder, gh_list);
+ return test_bit(HIF_HOLDER, &gh->gh_iflags) ? NULL : gh;
+}
+
+/**
* state_change - record that the glock is now in a different state
* @gl: the glock
* @new_state: the new state
@@ -757,7 +774,7 @@ skip_inval:
* gfs2_gl_hash_clear calls clear_glock) and recovery is complete
* then it's okay to tell dlm to unlock it.
*/
- if (unlikely(sdp->sd_log_error && !gfs2_withdrawn(sdp)))
+ if (unlikely(sdp->sd_log_error) && !gfs2_withdrawing_or_withdrawn(sdp))
gfs2_withdraw_delayed(sdp);
if (glock_blocked_by_withdraw(gl) &&
(target != LM_ST_UNLOCKED ||
@@ -794,7 +811,7 @@ skip_inval:
gfs2_glock_queue_work(gl, 0);
} else if (ret) {
fs_err(sdp, "lm_lock ret %d\n", ret);
- GLOCK_BUG_ON(gl, !gfs2_withdrawn(sdp));
+ GLOCK_BUG_ON(gl, !gfs2_withdrawing_or_withdrawn(sdp));
}
} else { /* lock_nolock */
finish_xmote(gl, target);
@@ -1213,7 +1230,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
mapping->host = s->s_bdev->bd_inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
- mapping->private_data = NULL;
+ mapping->i_private_data = NULL;
mapping->writeback_index = 0;
}
@@ -1555,11 +1572,30 @@ trap_recursive:
int gfs2_glock_nq(struct gfs2_holder *gh)
{
struct gfs2_glock *gl = gh->gh_gl;
- int error = 0;
+ int error;
if (glock_blocked_by_withdraw(gl) && !(gh->gh_flags & LM_FLAG_NOEXP))
return -EIO;
+ if (gh->gh_flags & GL_NOBLOCK) {
+ struct gfs2_holder *current_gh;
+
+ error = -ECHILD;
+ spin_lock(&gl->gl_lockref.lock);
+ if (find_last_waiter(gl))
+ goto unlock;
+ current_gh = find_first_holder(gl);
+ if (!may_grant(gl, current_gh, gh))
+ goto unlock;
+ set_bit(HIF_HOLDER, &gh->gh_iflags);
+ list_add_tail(&gh->gh_list, &gl->gl_holders);
+ trace_gfs2_promote(gh);
+ error = 0;
+unlock:
+ spin_unlock(&gl->gl_lockref.lock);
+ return error;
+ }
+
if (test_bit(GLF_LRU, &gl->gl_flags))
gfs2_glock_remove_from_lru(gl);
@@ -1575,6 +1611,7 @@ int gfs2_glock_nq(struct gfs2_holder *gh)
run_queue(gl, 1);
spin_unlock(&gl->gl_lockref.lock);
+ error = 0;
if (!(gh->gh_flags & GL_ASYNC))
error = gfs2_glock_wait(gh);
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 61197598abfd..0114f3e0ebe0 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -84,6 +84,7 @@ enum {
#define GL_SKIP 0x0100
#define GL_NOPID 0x0200
#define GL_NOCACHE 0x0400
+#define GL_NOBLOCK 0x0800
/*
* lm_async_cb return flags
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index b41c78bd2cc0..45653cbc8a87 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -174,7 +174,7 @@ static int gfs2_rgrp_metasync(struct gfs2_glock *gl)
filemap_fdatawrite_range(metamapping, start, end);
error = filemap_fdatawait_range(metamapping, start, end);
- WARN_ON_ONCE(error && !gfs2_withdrawn(sdp));
+ WARN_ON_ONCE(error && !gfs2_withdrawing_or_withdrawn(sdp));
mapping_set_error(metamapping, error);
if (error)
gfs2_io_error(sdp);
@@ -494,7 +494,7 @@ int gfs2_inode_refresh(struct gfs2_inode *ip)
/**
* inode_go_instantiate - read in an inode if necessary
- * @gh: The glock holder
+ * @gl: The glock
*
* Returns: errno
*/
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 59ab18c79889..d1ac5d0679ea 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1122,7 +1122,7 @@ static void gdlm_recover_prep(void *arg)
struct gfs2_sbd *sdp = arg;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- if (gfs2_withdrawn(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
fs_err(sdp, "recover_prep ignored due to withdraw.\n");
return;
}
@@ -1148,7 +1148,7 @@ static void gdlm_recover_slot(void *arg, struct dlm_slot *slot)
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
int jid = slot->slot - 1;
- if (gfs2_withdrawn(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
fs_err(sdp, "recover_slot jid %d ignored due to withdraw.\n",
jid);
return;
@@ -1177,7 +1177,7 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
struct gfs2_sbd *sdp = arg;
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- if (gfs2_withdrawn(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
fs_err(sdp, "recover_done ignored due to withdraw.\n");
return;
}
@@ -1208,7 +1208,7 @@ static void gdlm_recovery_result(struct gfs2_sbd *sdp, unsigned int jid,
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
- if (gfs2_withdrawn(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
fs_err(sdp, "recovery_result jid %d ignored due to withdraw.\n",
jid);
return;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index e5271ae87d1c..8cddf955ebc0 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -126,7 +126,7 @@ __acquires(&sdp->sd_ail_lock)
}
}
- if (gfs2_withdrawn(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
gfs2_remove_from_ail(bd);
continue;
}
@@ -352,14 +352,15 @@ static int gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
* @sdp: The superblock
* @max_revokes: If non-zero, add revokes where appropriate
*
- * Tries to empty the ail1 lists, starting with the oldest first
+ * Tries to empty the ail1 lists, starting with the oldest first.
+ * Returns %true if the ail1 list is now empty.
*/
-static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes)
+static bool gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes)
{
struct gfs2_trans *tr, *s;
int oldest_tr = 1;
- int ret;
+ bool empty;
spin_lock(&sdp->sd_ail_lock);
list_for_each_entry_safe_reverse(tr, s, &sdp->sd_ail1_list, tr_list) {
@@ -369,15 +370,10 @@ static int gfs2_ail1_empty(struct gfs2_sbd *sdp, int max_revokes)
oldest_tr = 0;
}
gfs2_log_update_flush_tail(sdp);
- ret = list_empty(&sdp->sd_ail1_list);
+ empty = list_empty(&sdp->sd_ail1_list);
spin_unlock(&sdp->sd_ail_lock);
- if (test_bit(SDF_WITHDRAWING, &sdp->sd_flags)) {
- gfs2_lm(sdp, "fatal: I/O error(s)\n");
- gfs2_withdraw(sdp);
- }
-
- return ret;
+ return empty;
}
static void gfs2_ail1_wait(struct gfs2_sbd *sdp)
@@ -814,6 +810,9 @@ void gfs2_flush_revokes(struct gfs2_sbd *sdp)
gfs2_log_lock(sdp);
gfs2_ail1_empty(sdp, max_revokes);
gfs2_log_unlock(sdp);
+
+ if (gfs2_withdrawing(sdp))
+ gfs2_withdraw(sdp);
}
/**
@@ -841,7 +840,7 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
struct super_block *sb = sdp->sd_vfs;
u64 dblock;
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
return;
page = mempool_alloc(gfs2_page_pool, GFP_NOIO);
@@ -914,8 +913,9 @@ void gfs2_write_log_header(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd,
static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
{
blk_opf_t op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC;
+ struct super_block *sb = sdp->sd_vfs;
- gfs2_assert_withdraw(sdp, !test_bit(SDF_FROZEN, &sdp->sd_flags));
+ gfs2_assert_withdraw(sdp, sb->s_writers.frozen != SB_FREEZE_COMPLETE);
if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) {
gfs2_ordered_wait(sdp);
@@ -974,8 +974,9 @@ void gfs2_ail_drain(struct gfs2_sbd *sdp)
static void empty_ail1_list(struct gfs2_sbd *sdp)
{
unsigned long start = jiffies;
+ bool empty = false;
- for (;;) {
+ while (!empty) {
if (time_after(jiffies, start + (HZ * 600))) {
fs_err(sdp, "Error: In %s for 10 minutes! t=%d\n",
__func__, current->journal_info ? 1 : 0);
@@ -984,9 +985,14 @@ static void empty_ail1_list(struct gfs2_sbd *sdp)
}
gfs2_ail1_start(sdp);
gfs2_ail1_wait(sdp);
- if (gfs2_ail1_empty(sdp, 0))
- return;
+ empty = gfs2_ail1_empty(sdp, 0);
+
+ if (gfs2_withdrawing_or_withdrawn(sdp))
+ break;
}
+
+ if (gfs2_withdrawing(sdp))
+ gfs2_withdraw(sdp);
}
/**
@@ -1047,7 +1053,8 @@ repeat:
* Do this check while holding the log_flush_lock to prevent new
* buffers from being added to the ail via gfs2_pin()
*/
- if (gfs2_withdrawn(sdp) || !test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+ if (gfs2_withdrawing_or_withdrawn(sdp) ||
+ !test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
goto out;
/* Log might have been flushed while we waited for the flush lock */
@@ -1096,13 +1103,13 @@ repeat:
goto out_withdraw;
gfs2_ordered_write(sdp);
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
goto out_withdraw;
lops_before_commit(sdp, tr);
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
goto out_withdraw;
gfs2_log_submit_bio(&sdp->sd_jdesc->jd_log_bio, REQ_OP_WRITE);
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
goto out_withdraw;
if (sdp->sd_log_head != sdp->sd_log_flush_head) {
@@ -1110,7 +1117,7 @@ repeat:
} else if (sdp->sd_log_tail != sdp->sd_log_flush_tail && !sdp->sd_log_idle) {
log_write_header(sdp, flags);
}
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
goto out_withdraw;
lops_after_commit(sdp, tr);
@@ -1128,7 +1135,7 @@ repeat:
if (!(flags & GFS2_LOG_HEAD_FLUSH_NORMAL)) {
if (!sdp->sd_log_idle) {
empty_ail1_list(sdp);
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
goto out_withdraw;
log_write_header(sdp, flags);
}
@@ -1297,8 +1304,9 @@ int gfs2_logd(void *data)
struct gfs2_sbd *sdp = data;
unsigned long t = 1;
+ set_freezable();
while (!kthread_should_stop()) {
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
break;
/* Check for errors writing to the journal */
@@ -1330,18 +1338,19 @@ int gfs2_logd(void *data)
t = gfs2_tune_get(sdp, gt_logd_secs) * HZ;
- try_to_freeze();
-
- t = wait_event_interruptible_timeout(sdp->sd_logd_waitq,
+ t = wait_event_freezable_timeout(sdp->sd_logd_waitq,
test_bit(SDF_FORCE_AIL_FLUSH, &sdp->sd_flags) ||
gfs2_ail_flush_reqd(sdp) ||
gfs2_jrnl_flush_reqd(sdp) ||
sdp->sd_log_error ||
- gfs2_withdrawn(sdp) ||
+ gfs2_withdrawing_or_withdrawn(sdp) ||
kthread_should_stop(),
t);
}
+ if (gfs2_withdrawing(sdp))
+ gfs2_withdraw(sdp);
+
return 0;
}
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 483f69807062..314ec2a70167 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -391,22 +391,15 @@ static void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page)
* Simply unlock the pages in the bio. The main thread will wait on them and
* process them in order as necessary.
*/
-
static void gfs2_end_log_read(struct bio *bio)
{
- struct page *page;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ int error = blk_status_to_errno(bio->bi_status);
+ struct folio_iter fi;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- page = bvec->bv_page;
- if (bio->bi_status) {
- int err = blk_status_to_errno(bio->bi_status);
-
- SetPageError(page);
- mapping_set_error(page->mapping, err);
- }
- unlock_page(page);
+ bio_for_each_folio_all(fi, bio) {
+ /* We're abusing wb_err to get the error to gfs2_find_jhead */
+ filemap_set_wb_err(fi.folio->mapping, error);
+ folio_end_read(fi.folio, !error);
}
bio_put(bio);
@@ -475,7 +468,7 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index,
folio = filemap_get_folio(jd->jd_inode->i_mapping, index);
folio_wait_locked(folio);
- if (folio_test_error(folio))
+ if (!folio_test_uptodate(folio))
*done = true;
if (!*done)
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 25ceb0805df2..f814054c8cd0 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -252,7 +252,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
struct buffer_head *bh, *bhs[2];
int num = 0;
- if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp) &&
+ !gfs2_withdraw_in_prog(sdp)) {
*bhp = NULL;
return -EIO;
}
@@ -310,7 +311,8 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
{
- if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp) &&
+ !gfs2_withdraw_in_prog(sdp))
return -EIO;
wait_on_buffer(bh);
@@ -321,7 +323,8 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh)
gfs2_io_error_bh_wd(sdp, bh);
return -EIO;
}
- if (unlikely(gfs2_withdrawn(sdp)) && !gfs2_withdraw_in_prog(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp) &&
+ !gfs2_withdraw_in_prog(sdp))
return -EIO;
return 0;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b108c5d26839..1281e60be639 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -117,7 +117,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
mapping->host = sb->s_bdev->bd_inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
- mapping->private_data = NULL;
+ mapping->i_private_data = NULL;
mapping->writeback_index = 0;
spin_lock_init(&sdp->sd_log_lock);
@@ -1073,7 +1073,7 @@ hostdata_error:
void gfs2_lm_unmount(struct gfs2_sbd *sdp)
{
const struct lm_lockops *lm = sdp->sd_lockstruct.ls_ops;
- if (likely(!gfs2_withdrawn(sdp)) && lm->lm_unmount)
+ if (!gfs2_withdrawing_or_withdrawn(sdp) && lm->lm_unmount)
lm->lm_unmount(sdp);
}
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 95dae7838b4e..aa9cf0102848 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -128,7 +128,7 @@ static void gfs2_qd_dispose(struct gfs2_quota_data *qd)
hlist_bl_del_rcu(&qd->qd_hlist);
spin_unlock_bucket(qd->qd_hash);
- if (!gfs2_withdrawn(sdp)) {
+ if (!gfs2_withdrawing_or_withdrawn(sdp)) {
gfs2_assert_warn(sdp, !qd->qd_change);
gfs2_assert_warn(sdp, !qd->qd_slot_ref);
gfs2_assert_warn(sdp, !qd->qd_bh_count);
@@ -271,7 +271,7 @@ static struct gfs2_quota_data *gfs2_qd_search_bucket(unsigned int hash,
if (qd->qd_sbd != sdp)
continue;
if (lockref_get_not_dead(&qd->qd_lockref)) {
- list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
+ list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru);
return qd;
}
}
@@ -344,7 +344,7 @@ static void qd_put(struct gfs2_quota_data *qd)
}
qd->qd_lockref.count = 0;
- list_lru_add(&gfs2_qd_lru, &qd->qd_lru);
+ list_lru_add_obj(&gfs2_qd_lru, &qd->qd_lru);
spin_unlock(&qd->qd_lockref.lock);
}
@@ -1505,7 +1505,8 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
LIST_HEAD(dispose);
int count;
- BUG_ON(test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
+ BUG_ON(!test_bit(SDF_NORECOVERY, &sdp->sd_flags) &&
+ test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags));
spin_lock(&qd_lock);
list_for_each_entry(qd, &sdp->sd_quota_list, qd_list) {
@@ -1517,7 +1518,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp)
lockref_mark_dead(&qd->qd_lockref);
spin_unlock(&qd->qd_lockref.lock);
- list_lru_del(&gfs2_qd_lru, &qd->qd_lru);
+ list_lru_del_obj(&gfs2_qd_lru, &qd->qd_lru);
list_add(&qd->qd_lru, &dispose);
}
spin_unlock(&qd_lock);
@@ -1539,7 +1540,7 @@ static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error)
{
if (error == 0 || error == -EROFS)
return;
- if (!gfs2_withdrawn(sdp)) {
+ if (!gfs2_withdrawing_or_withdrawn(sdp)) {
if (!cmpxchg(&sdp->sd_log_error, 0, error))
fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error);
wake_up(&sdp->sd_logd_waitq);
@@ -1582,8 +1583,9 @@ int gfs2_quotad(void *data)
unsigned long quotad_timeo = 0;
unsigned long t = 0;
+ set_freezable();
while (!kthread_should_stop()) {
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
break;
/* Update the master statfs file */
@@ -1601,13 +1603,11 @@ int gfs2_quotad(void *data)
quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t,
&quotad_timeo, &tune->gt_quota_quantum);
- try_to_freeze();
-
t = min(quotad_timeo, statfs_timeo);
- t = wait_event_interruptible_timeout(sdp->sd_quota_wait,
+ t = wait_event_freezable_timeout(sdp->sd_quota_wait,
sdp->sd_statfs_force_sync ||
- gfs2_withdrawn(sdp) ||
+ gfs2_withdrawing_or_withdrawn(sdp) ||
kthread_should_stop(),
t);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 5aae02669a40..f4fe7039f725 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -411,7 +411,7 @@ void gfs2_recover_func(struct work_struct *work)
int error = 0;
int jlocked = 0;
- if (gfs2_withdrawn(sdp)) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
fs_err(sdp, "jid=%u: Recovery not attempted due to withdraw.\n",
jd->jd_jid);
goto fail;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c2060203b98a..26d6c1eea559 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -159,13 +159,13 @@ static inline u8 gfs2_testbit(const struct gfs2_rbm *rbm, bool use_clone)
}
/**
- * gfs2_bit_search
+ * gfs2_bit_search - search bitmap for a state
* @ptr: Pointer to bitmap data
* @mask: Mask to use (normally 0x55555.... but adjusted for search start)
* @state: The state we are searching for
*
- * We xor the bitmap data with a patter which is the bitwise opposite
- * of what we are looking for, this gives rise to a pattern of ones
+ * We xor the bitmap data with a pattern which is the bitwise opposite
+ * of what we are looking for. This gives rise to a pattern of ones
* wherever there is a match. Since we have two bits per entry, we
* take this pattern, shift it down by one place and then and it with
* the original. All the even bit positions (0,2,4, etc) then represent
@@ -1188,7 +1188,7 @@ static void rgrp_set_bitmap_flags(struct gfs2_rgrpd *rgd)
/**
* gfs2_rgrp_go_instantiate - Read in a RG's header and bitmaps
- * @gh: the glock holder representing the rgrpd to read in
+ * @gl: the glock representing the rgrpd to read in
*
* Read in all of a Resource Group's header and bitmap blocks.
* Caller must eventually call gfs2_rgrp_brelse() to free the bitmaps.
@@ -1967,7 +1967,7 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops)
}
/**
- * gfs2_rgrp_used_recently
+ * gfs2_rgrp_used_recently - test if an rgrp has been used recently
* @rs: The block reservation with the rgrp to test
* @msecs: The time limit in milliseconds
*
@@ -2306,7 +2306,7 @@ void gfs2_rgrp_dump(struct seq_file *seq, struct gfs2_rgrpd *rgd,
(unsigned long long)rgd->rd_addr, rgd->rd_flags,
rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
rgd->rd_requested, rgd->rd_reserved, rgd->rd_extfail_pt);
- if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
+ if (rgd->rd_sbd->sd_args.ar_rgrplvb && rgd->rd_rgl) {
struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
gfs2_print_dbg(seq, "%s L: f:%02x b:%u i:%u\n", fs_id_buf,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d21c04a22d73..e5f79466340d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -134,7 +134,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
int error;
j_gl->gl_ops->go_inval(j_gl, DIO_METADATA);
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
return -EIO;
error = gfs2_find_jhead(sdp->sd_jdesc, &head, false);
@@ -153,7 +153,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
gfs2_log_pointers_init(sdp, head.lh_blkno);
error = gfs2_quota_init(sdp);
- if (!error && gfs2_withdrawn(sdp))
+ if (!error && gfs2_withdrawing_or_withdrawn(sdp))
error = -EIO;
if (!error)
set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
@@ -499,7 +499,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
return;
}
- if (unlikely(gfs2_withdrawn(sdp)))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
return;
if (!gfs2_glock_is_locked_by_me(ip->i_gl)) {
ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
@@ -605,7 +605,7 @@ restart:
if (!sb_rdonly(sb))
gfs2_make_fs_ro(sdp);
else {
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
gfs2_destroy_threads(sdp);
gfs2_quota_cleanup(sdp);
@@ -673,28 +673,6 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
return sdp->sd_log_error;
}
-static int gfs2_freeze_locally(struct gfs2_sbd *sdp)
-{
- struct super_block *sb = sdp->sd_vfs;
- int error;
-
- error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
- if (error)
- return error;
-
- if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
- gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
- GFS2_LFC_FREEZE_GO_SYNC);
- if (gfs2_withdrawn(sdp)) {
- error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
- if (error)
- return error;
- return -EIO;
- }
- }
- return 0;
-}
-
static int gfs2_do_thaw(struct gfs2_sbd *sdp)
{
struct super_block *sb = sdp->sd_vfs;
@@ -724,7 +702,7 @@ void gfs2_freeze_func(struct work_struct *work)
if (test_bit(SDF_FROZEN, &sdp->sd_flags))
goto freeze_failed;
- error = gfs2_freeze_locally(sdp);
+ error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
if (error)
goto freeze_failed;
@@ -759,12 +737,13 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
if (!mutex_trylock(&sdp->sd_freeze_mutex))
return -EBUSY;
- error = -EBUSY;
- if (test_bit(SDF_FROZEN, &sdp->sd_flags))
- goto out;
+ if (test_bit(SDF_FROZEN, &sdp->sd_flags)) {
+ mutex_unlock(&sdp->sd_freeze_mutex);
+ return -EBUSY;
+ }
for (;;) {
- error = gfs2_freeze_locally(sdp);
+ error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
if (error) {
fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n",
error);
@@ -772,8 +751,11 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
}
error = gfs2_lock_fs_check_clean(sdp);
- if (!error)
- break; /* success */
+ if (!error) {
+ set_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
+ set_bit(SDF_FROZEN, &sdp->sd_flags);
+ break;
+ }
error = gfs2_do_thaw(sdp);
if (error)
@@ -793,14 +775,23 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who)
}
out:
- if (!error) {
- set_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
- set_bit(SDF_FROZEN, &sdp->sd_flags);
- }
mutex_unlock(&sdp->sd_freeze_mutex);
return error;
}
+static int gfs2_freeze_fs(struct super_block *sb)
+{
+ struct gfs2_sbd *sdp = sb->s_fs_info;
+
+ if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
+ gfs2_log_flush(sdp, NULL, GFS2_LOG_HEAD_FLUSH_FREEZE |
+ GFS2_LFC_FREEZE_GO_SYNC);
+ if (gfs2_withdrawing_or_withdrawn(sdp))
+ return -EIO;
+ }
+ return 0;
+}
+
/**
* gfs2_thaw_super - reallow writes to the filesystem
* @sb: the VFS structure for the filesystem
@@ -814,10 +805,12 @@ static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who)
if (!mutex_trylock(&sdp->sd_freeze_mutex))
return -EBUSY;
- error = -EINVAL;
- if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags))
- goto out;
+ if (!test_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags)) {
+ mutex_unlock(&sdp->sd_freeze_mutex);
+ return -EINVAL;
+ }
+ atomic_inc(&sb->s_active);
gfs2_freeze_unlock(&sdp->sd_freeze_gh);
error = gfs2_do_thaw(sdp);
@@ -826,8 +819,8 @@ static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who)
clear_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags);
clear_bit(SDF_FROZEN, &sdp->sd_flags);
}
-out:
mutex_unlock(&sdp->sd_freeze_mutex);
+ deactivate_super(sb);
return error;
}
@@ -1065,16 +1058,6 @@ static int gfs2_drop_inode(struct inode *inode)
return generic_drop_inode(inode);
}
-static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
-{
- do {
- if (d1 == d2)
- return 1;
- d1 = d1->d_parent;
- } while (!IS_ROOT(d1));
- return 0;
-}
-
/**
* gfs2_show_options - Show mount options for /proc/mounts
* @s: seq_file structure
@@ -1096,7 +1079,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
statfs_slow = sdp->sd_tune.gt_statfs_slow;
spin_unlock(&sdp->sd_tune.gt_spin);
- if (is_ancestor(root, sdp->sd_master_dir))
+ if (is_subdir(root, sdp->sd_master_dir))
seq_puts(s, ",meta");
if (args->ar_lockproto[0])
seq_show_option(s, "lockproto", args->ar_lockproto);
@@ -1607,6 +1590,7 @@ const struct super_operations gfs2_super_ops = {
.put_super = gfs2_put_super,
.sync_fs = gfs2_sync_fs,
.freeze_super = gfs2_freeze_super,
+ .freeze_fs = gfs2_freeze_fs,
.thaw_super = gfs2_thaw_super,
.statfs = gfs2_statfs,
.drop_inode = gfs2_drop_inode,
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 60a0206890c5..250f340cb44d 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -193,7 +193,7 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf)
{
- unsigned int b = gfs2_withdrawn(sdp);
+ unsigned int b = gfs2_withdrawing_or_withdrawn(sdp);
return snprintf(buf, PAGE_SIZE, "%u\n", b);
}
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 7e835be7032d..192213c7359a 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -268,7 +268,7 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
(unsigned long long)bd->bd_bh->b_blocknr);
BUG();
}
- if (unlikely(gfs2_withdrawn(sdp))) {
+ if (gfs2_withdrawing_or_withdrawn(sdp)) {
fs_info(sdp, "GFS2:adding buf while withdrawn! 0x%llx\n",
(unsigned long long)bd->bd_bh->b_blocknr);
goto out_unlock;
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index da29fafb6272..f52141ce9485 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -372,7 +372,7 @@ void gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion,
const char *function, char *file, unsigned int line,
bool delayed)
{
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
return;
fs_err(sdp,
@@ -548,7 +548,7 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
const char *function, char *file, unsigned int line,
bool withdraw)
{
- if (gfs2_withdrawn(sdp))
+ if (gfs2_withdrawing_or_withdrawn(sdp))
return;
fs_err(sdp, "fatal: I/O error\n"
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 11c9d59b6889..ba071998461f 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -198,13 +198,14 @@ static inline void gfs2_withdraw_delayed(struct gfs2_sbd *sdp)
}
/**
- * gfs2_withdrawn - test whether the file system is withdrawing or withdrawn
+ * gfs2_withdrawing_or_withdrawn - test whether the file system is withdrawing
+ * or withdrawn
* @sdp: the superblock
*/
-static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp)
+static inline bool gfs2_withdrawing_or_withdrawn(struct gfs2_sbd *sdp)
{
- return test_bit(SDF_WITHDRAWN, &sdp->sd_flags) ||
- test_bit(SDF_WITHDRAWING, &sdp->sd_flags);
+ return unlikely(test_bit(SDF_WITHDRAWN, &sdp->sd_flags) ||
+ test_bit(SDF_WITHDRAWING, &sdp->sd_flags));
}
/**
@@ -213,13 +214,13 @@ static inline bool gfs2_withdrawn(struct gfs2_sbd *sdp)
*/
static inline bool gfs2_withdrawing(struct gfs2_sbd *sdp)
{
- return test_bit(SDF_WITHDRAWING, &sdp->sd_flags) &&
- !test_bit(SDF_WITHDRAWN, &sdp->sd_flags);
+ return unlikely(test_bit(SDF_WITHDRAWING, &sdp->sd_flags) &&
+ !test_bit(SDF_WITHDRAWN, &sdp->sd_flags));
}
static inline bool gfs2_withdraw_in_prog(struct gfs2_sbd *sdp)
{
- return test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags);
+ return unlikely(test_bit(SDF_WITHDRAW_IN_PROG, &sdp->sd_flags));
}
#define gfs2_tune_get(sdp, field) \
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index a7bc4690a780..8c34798a0715 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -29,11 +29,6 @@ static const struct inode_operations hfs_file_inode_operations;
#define HFS_VALID_MODE_BITS (S_IFREG | S_IFDIR | S_IRWXUGO)
-static int hfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, hfs_get_block, wbc);
-}
-
static int hfs_read_folio(struct file *file, struct folio *folio)
{
return block_read_full_folio(folio, hfs_get_block);
@@ -162,9 +157,10 @@ const struct address_space_operations hfs_btree_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = hfs_read_folio,
- .writepage = hfs_writepage,
+ .writepages = hfs_writepages,
.write_begin = hfs_write_begin,
.write_end = generic_write_end,
+ .migrate_folio = buffer_migrate_folio,
.bmap = hfs_bmap,
.release_folio = hfs_release_folio,
};
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 7ededcb720c1..012a3d003fbe 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -190,6 +190,7 @@ struct hfsplus_sb_info {
int work_queued; /* non-zero delayed work is queued */
struct delayed_work sync_work; /* FS sync delayed work */
spinlock_t work_lock; /* protects sync_work and work_queued */
+ struct rcu_head rcu;
};
#define HFSPLUS_SB_WRITEBACKUP 0
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 702a0663b1d8..3d326926c195 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -28,11 +28,6 @@ static int hfsplus_read_folio(struct file *file, struct folio *folio)
return block_read_full_folio(folio, hfsplus_get_block);
}
-static int hfsplus_writepage(struct page *page, struct writeback_control *wbc)
-{
- return block_write_full_page(page, hfsplus_get_block, wbc);
-}
-
static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
{
struct inode *inode = mapping->host;
@@ -159,9 +154,10 @@ const struct address_space_operations hfsplus_btree_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = hfsplus_read_folio,
- .writepage = hfsplus_writepage,
+ .writepages = hfsplus_writepages,
.write_begin = hfsplus_write_begin,
.write_end = generic_write_end,
+ .migrate_folio = buffer_migrate_folio,
.bmap = hfsplus_bmap,
.release_folio = hfsplus_release_folio,
};
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 1986b4f18a90..97920202790f 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -277,6 +277,14 @@ void hfsplus_mark_mdb_dirty(struct super_block *sb)
spin_unlock(&sbi->work_lock);
}
+static void delayed_free(struct rcu_head *p)
+{
+ struct hfsplus_sb_info *sbi = container_of(p, struct hfsplus_sb_info, rcu);
+
+ unload_nls(sbi->nls);
+ kfree(sbi);
+}
+
static void hfsplus_put_super(struct super_block *sb)
{
struct hfsplus_sb_info *sbi = HFSPLUS_SB(sb);
@@ -302,9 +310,7 @@ static void hfsplus_put_super(struct super_block *sb)
hfs_btree_close(sbi->ext_tree);
kfree(sbi->s_vhdr_buf);
kfree(sbi->s_backup_vhdr_buf);
- unload_nls(sbi->nls);
- kfree(sb->s_fs_info);
- sb->s_fs_info = NULL;
+ call_rcu(&sbi->rcu, delayed_free);
}
static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 0b791adf02e5..b0cb70400996 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -30,8 +30,7 @@ struct hfsplus_wd {
* @sector: block to read or write, for blocks of HFSPLUS_SECTOR_SIZE bytes
* @buf: buffer for I/O
* @data: output pointer for location of requested data
- * @op: direction of I/O
- * @op_flags: request op flags
+ * @opf: request op flags
*
* The unit of I/O is hfsplus_min_io_size(sb), which may be bigger than
* HFSPLUS_SECTOR_SIZE, and @buf must be sized accordingly. On reads
@@ -43,6 +42,8 @@ struct hfsplus_wd {
* that starts at the rounded-down address. As long as the data was
* read using hfsplus_submit_bio() and the same buffer is used things
* will work correctly.
+ *
+ * Returns: %0 on success else -errno code
*/
int hfsplus_submit_bio(struct super_block *sb, sector_t sector,
void *buf, void **data, blk_opf_t opf)
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index ea87f24c6c3f..a73d27c4dd58 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -637,12 +637,8 @@ static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
inode = hostfs_iget(ino->i_sb, name);
__putname(name);
- if (IS_ERR(inode)) {
- if (PTR_ERR(inode) == -ENOENT)
- inode = NULL;
- else
- return ERR_CAST(inode);
- }
+ if (inode == ERR_PTR(-ENOENT))
+ inode = NULL;
return d_splice_alias(inode, dentry);
}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index f757d4f7ad98..d746866ae3b6 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -100,6 +100,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
loff_t len, vma_len;
int ret;
struct hstate *h = hstate_file(file);
+ vm_flags_t vm_flags;
/*
* vma address alignment (but not the pgoff alignment) has
@@ -141,10 +142,20 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
ret = -ENOMEM;
+
+ vm_flags = vma->vm_flags;
+ /*
+ * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip
+ * reserving here. Note: only for SHM hugetlbfs file, the inode
+ * flag S_PRIVATE is set.
+ */
+ if (inode->i_flags & S_PRIVATE)
+ vm_flags |= VM_NORESERVE;
+
if (!hugetlb_reserve_pages(inode,
vma->vm_pgoff >> huge_page_order(h),
len >> huge_page_shift(h), vma,
- vma->vm_flags))
+ vm_flags))
goto out;
ret = 0;
@@ -340,7 +351,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
folio_unlock(folio);
- if (!folio_test_has_hwpoisoned(folio))
+ if (!folio_test_hwpoison(folio))
want = nr;
else {
/*
@@ -686,7 +697,7 @@ static void hugetlbfs_evict_inode(struct inode *inode)
* at inode creation time. If this is a device special inode,
* i_mapping may not point to the original address space.
*/
- resv_map = (struct resv_map *)(&inode->i_data)->private_data;
+ resv_map = (struct resv_map *)(&inode->i_data)->i_private_data;
/* Only regular and link inodes have associated reserve maps */
if (resv_map)
resv_map_release(&resv_map->refs);
@@ -1000,7 +1011,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
simple_inode_init_ts(inode);
- inode->i_mapping->private_data = resv_map;
+ inode->i_mapping->i_private_data = resv_map;
info->seals = F_SEAL_SEAL;
switch (mode & S_IFMT) {
default:
@@ -1129,8 +1140,8 @@ static int hugetlbfs_migrate_folio(struct address_space *mapping,
#define hugetlbfs_migrate_folio NULL
#endif
-static int hugetlbfs_error_remove_page(struct address_space *mapping,
- struct page *page)
+static int hugetlbfs_error_remove_folio(struct address_space *mapping,
+ struct folio *folio)
{
return 0;
}
@@ -1277,7 +1288,7 @@ static const struct address_space_operations hugetlbfs_aops = {
.write_end = hugetlbfs_write_end,
.dirty_folio = noop_dirty_folio,
.migrate_folio = hugetlbfs_migrate_folio,
- .error_remove_page = hugetlbfs_error_remove_page,
+ .error_remove_folio = hugetlbfs_error_remove_folio,
};
@@ -1354,6 +1365,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
{
struct hugetlbfs_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
+ struct hstate *h;
char *rest;
unsigned long ps;
int opt;
@@ -1398,11 +1410,12 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
case Opt_pagesize:
ps = memparse(param->string, &rest);
- ctx->hstate = size_to_hstate(ps);
- if (!ctx->hstate) {
+ h = size_to_hstate(ps);
+ if (!h) {
pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
return -EINVAL;
}
+ ctx->hstate = h;
return 0;
case Opt_min_size:
diff --git a/fs/inode.c b/fs/inode.c
index f238d987dec9..91048c4c9c9e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -129,7 +129,6 @@ static struct ctl_table inodes_sysctls[] = {
.mode = 0444,
.proc_handler = proc_nr_inodes,
},
- { }
};
static int __init init_fs_inode_sysctls(void)
@@ -209,7 +208,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
atomic_set(&mapping->nr_thps, 0);
#endif
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
- mapping->private_data = NULL;
+ mapping->i_private_data = NULL;
mapping->writeback_index = 0;
init_rwsem(&mapping->invalidate_lock);
lockdep_set_class_and_name(&mapping->invalidate_lock,
@@ -398,8 +397,8 @@ static void __address_space_init_once(struct address_space *mapping)
{
xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT);
init_rwsem(&mapping->i_mmap_rwsem);
- INIT_LIST_HEAD(&mapping->private_list);
- spin_lock_init(&mapping->private_lock);
+ INIT_LIST_HEAD(&mapping->i_private_list);
+ spin_lock_init(&mapping->i_private_lock);
mapping->i_mmap = RB_ROOT_CACHED;
}
@@ -464,7 +463,7 @@ static void __inode_add_lru(struct inode *inode, bool rotate)
if (!mapping_shrinkable(&inode->i_data))
return;
- if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
+ if (list_lru_add_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_inc(nr_unused);
else if (rotate)
inode->i_state |= I_REFERENCED;
@@ -482,7 +481,7 @@ void inode_add_lru(struct inode *inode)
static void inode_lru_list_del(struct inode *inode)
{
- if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
+ if (list_lru_del_obj(&inode->i_sb->s_inode_lru, &inode->i_lru))
this_cpu_dec(nr_unused);
}
@@ -620,7 +619,7 @@ void clear_inode(struct inode *inode)
* nor even WARN_ON(!mapping_empty).
*/
xa_unlock_irq(&inode->i_data.i_pages);
- BUG_ON(!list_empty(&inode->i_data.private_list));
+ BUG_ON(!list_empty(&inode->i_data.i_private_list));
BUG_ON(!(inode->i_state & I_FREEING));
BUG_ON(inode->i_state & I_CLEAR);
BUG_ON(!list_empty(&inode->i_wb_list));
@@ -1090,48 +1089,6 @@ void discard_new_inode(struct inode *inode)
EXPORT_SYMBOL(discard_new_inode);
/**
- * lock_two_inodes - lock two inodes (may be regular files but also dirs)
- *
- * Lock any non-NULL argument. The caller must make sure that if he is passing
- * in two directories, one is not ancestor of the other. Zero, one or two
- * objects may be locked by this function.
- *
- * @inode1: first inode to lock
- * @inode2: second inode to lock
- * @subclass1: inode lock subclass for the first lock obtained
- * @subclass2: inode lock subclass for the second lock obtained
- */
-void lock_two_inodes(struct inode *inode1, struct inode *inode2,
- unsigned subclass1, unsigned subclass2)
-{
- if (!inode1 || !inode2) {
- /*
- * Make sure @subclass1 will be used for the acquired lock.
- * This is not strictly necessary (no current caller cares) but
- * let's keep things consistent.
- */
- if (!inode1)
- swap(inode1, inode2);
- goto lock;
- }
-
- /*
- * If one object is directory and the other is not, we must make sure
- * to lock directory first as the other object may be its child.
- */
- if (S_ISDIR(inode2->i_mode) == S_ISDIR(inode1->i_mode)) {
- if (inode1 > inode2)
- swap(inode1, inode2);
- } else if (!S_ISDIR(inode1->i_mode))
- swap(inode1, inode2);
-lock:
- if (inode1)
- inode_lock_nested(inode1, subclass1);
- if (inode2 && inode2 != inode1)
- inode_lock_nested(inode2, subclass2);
-}
-
-/**
* lock_two_nondirectories - take two i_mutexes on non-directory objects
*
* Lock any non-NULL argument. Passed objects must not be directories.
@@ -1146,7 +1103,12 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
WARN_ON_ONCE(S_ISDIR(inode1->i_mode));
if (inode2)
WARN_ON_ONCE(S_ISDIR(inode2->i_mode));
- lock_two_inodes(inode1, inode2, I_MUTEX_NORMAL, I_MUTEX_NONDIR2);
+ if (inode1 > inode2)
+ swap(inode1, inode2);
+ if (inode1)
+ inode_lock(inode1);
+ if (inode2 && inode2 != inode1)
+ inode_lock_nested(inode2, I_MUTEX_NONDIR2);
}
EXPORT_SYMBOL(lock_two_nondirectories);
@@ -1836,37 +1798,37 @@ EXPORT_SYMBOL(bmap);
* earlier than or equal to either the ctime or mtime,
* or if at least a day has passed since the last atime update.
*/
-static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
+static bool relatime_need_update(struct vfsmount *mnt, struct inode *inode,
struct timespec64 now)
{
struct timespec64 atime, mtime, ctime;
if (!(mnt->mnt_flags & MNT_RELATIME))
- return 1;
+ return true;
/*
* Is mtime younger than or equal to atime? If yes, update atime:
*/
atime = inode_get_atime(inode);
mtime = inode_get_mtime(inode);
if (timespec64_compare(&mtime, &atime) >= 0)
- return 1;
+ return true;
/*
* Is ctime younger than or equal to atime? If yes, update atime:
*/
ctime = inode_get_ctime(inode);
if (timespec64_compare(&ctime, &atime) >= 0)
- return 1;
+ return true;
/*
* Is the previous atime value older than a day? If yes,
* update atime:
*/
if ((long)(now.tv_sec - atime.tv_sec) >= 24*60*60)
- return 1;
+ return true;
/*
* Good, we can skip the atime update:
*/
- return 0;
+ return false;
}
/**
@@ -2404,7 +2366,7 @@ EXPORT_SYMBOL(inode_init_owner);
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
bool inode_owner_or_capable(struct mnt_idmap *idmap,
const struct inode *inode)
diff --git a/fs/internal.h b/fs/internal.h
index 58e43341aebf..b67406435fc0 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -83,6 +83,8 @@ int path_mount(const char *dev_name, struct path *path,
const char *type_page, unsigned long flags, void *data_page);
int path_umount(struct path *path, int flags);
+int show_path(struct seq_file *m, struct dentry *root);
+
/*
* fs_struct.c
*/
@@ -94,7 +96,6 @@ extern void chroot_fs_refs(const struct path *, const struct path *);
struct file *alloc_empty_file(int flags, const struct cred *cred);
struct file *alloc_empty_file_noaccount(int flags, const struct cred *cred);
struct file *alloc_empty_backing_file(int flags, const struct cred *cred);
-void release_empty_file(struct file *f);
static inline void file_put_write_access(struct file *file)
{
@@ -180,7 +181,7 @@ extern struct file *do_file_open_root(const struct path *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
-extern struct file *__close_fd_get_file(unsigned int fd);
+struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
@@ -196,8 +197,6 @@ extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
int dentry_needs_remove_privs(struct mnt_idmap *, struct dentry *dentry);
bool in_group_or_capable(struct mnt_idmap *idmap,
const struct inode *inode, vfsgid_t vfsgid);
-void lock_two_inodes(struct inode *inode1, struct inode *inode2,
- unsigned subclass1, unsigned subclass2);
/*
* fs-writeback.c
@@ -215,6 +214,11 @@ extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *)
extern char *simple_dname(struct dentry *, char *, int);
extern void dput_to_list(struct dentry *, struct list_head *);
extern void shrink_dentry_list(struct list_head *);
+extern void shrink_dcache_for_umount(struct super_block *);
+extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *);
+extern struct dentry *__d_lookup_rcu(const struct dentry *parent,
+ const struct qstr *name, unsigned *seq);
+extern void d_genocide(struct dentry *);
/*
* pipe.c
@@ -243,10 +247,10 @@ int do_statx(int dfd, struct filename *filename, unsigned int flags,
/*
* fs/splice.c:
*/
-long splice_file_to_pipe(struct file *in,
- struct pipe_inode_info *opipe,
- loff_t *offset,
- size_t len, unsigned int flags);
+ssize_t splice_file_to_pipe(struct file *in,
+ struct pipe_inode_info *opipe,
+ loff_t *offset,
+ size_t len, unsigned int flags);
/*
* fs/xattr.c:
diff --git a/fs/ioctl.c b/fs/ioctl.c
index f5fd99d6b0d4..76cf22ac97d7 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -920,8 +920,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
if (!f.file)
return -EBADF;
- /* RED-PEN how should LSM module know it's handling 32bit? */
- error = security_file_ioctl(f.file, cmd, arg);
+ error = security_file_ioctl_compat(f.file, cmd, arg);
if (error)
goto out;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index f72df2babe56..093c4515b22a 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -305,28 +305,18 @@ static int iomap_read_inline_data(const struct iomap_iter *iter,
{
const struct iomap *iomap = iomap_iter_srcmap(iter);
size_t size = i_size_read(iter->inode) - iomap->offset;
- size_t poff = offset_in_page(iomap->offset);
size_t offset = offset_in_folio(folio, iomap->offset);
- void *addr;
if (folio_test_uptodate(folio))
return 0;
- if (WARN_ON_ONCE(size > PAGE_SIZE - poff))
- return -EIO;
- if (WARN_ON_ONCE(size > PAGE_SIZE -
- offset_in_page(iomap->inline_data)))
- return -EIO;
if (WARN_ON_ONCE(size > iomap->length))
return -EIO;
if (offset > 0)
ifs_alloc(iter->inode, folio, iter->flags);
- addr = kmap_local_folio(folio, offset);
- memcpy(addr, iomap->inline_data, size);
- memset(addr + size, 0, PAGE_SIZE - poff - size);
- kunmap_local(addr);
- iomap_set_range_uptodate(folio, offset, PAGE_SIZE - poff);
+ folio_fill_tail(folio, offset, iomap->inline_data, size);
+ iomap_set_range_uptodate(folio, offset, folio_size(folio) - offset);
return 0;
}
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index 118699fff2f9..1c97e64c4784 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -556,7 +556,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
struct transaction_chp_stats_s *stats;
transaction_t *transaction;
journal_t *journal;
- struct buffer_head *bh = jh2bh(jh);
JBUFFER_TRACE(jh, "entry");
@@ -569,16 +568,6 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
JBUFFER_TRACE(jh, "removing from transaction");
- /*
- * If we have failed to write the buffer out to disk, the filesystem
- * may become inconsistent. We cannot abort the journal here since
- * we hold j_list_lock and we have to be careful about races with
- * jbd2_journal_destroy(). So mark the writeback IO error in the
- * journal here and we abort the journal later from a better context.
- */
- if (buffer_write_io_error(bh))
- set_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags);
-
__buffer_unlink(jh);
jh->b_cp_transaction = NULL;
percpu_counter_dec(&journal->j_checkpoint_jh_count);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 206cb53ef2b0..b6c114c11b97 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1534,6 +1534,7 @@ static journal_t *journal_init_common(struct block_device *bdev,
journal->j_fs_dev = fs_dev;
journal->j_blk_offset = start;
journal->j_total_len = len;
+ jbd2_init_fs_dev_write_error(journal);
err = journal_load_superblock(journal);
if (err)
@@ -1861,7 +1862,7 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
if (is_journal_aborted(journal))
return -EIO;
- if (test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags)) {
+ if (jbd2_check_fs_dev_write_error(journal)) {
jbd2_journal_abort(journal, -EIO);
return -EIO;
}
@@ -2159,12 +2160,12 @@ int jbd2_journal_destroy(journal_t *journal)
/*
* OK, all checkpoint transactions have been checked, now check the
- * write out io error flag and abort the journal if some buffer failed
- * to write back to the original location, otherwise the filesystem
- * may become inconsistent.
+ * writeback errseq of fs dev and abort the journal if some buffer
+ * failed to write back to the original location, otherwise the
+ * filesystem may become inconsistent.
*/
if (!is_journal_aborted(journal) &&
- test_bit(JBD2_CHECKPOINT_IO_ERROR, &journal->j_atomic_flags))
+ jbd2_check_fs_dev_write_error(journal))
jbd2_journal_abort(journal, -EIO);
if (journal->j_sb_buffer) {
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 01f744cb97a4..1f7664984d6e 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -289,8 +289,6 @@ int jbd2_journal_recover(journal_t *journal)
journal_superblock_t * sb;
struct recovery_info info;
- errseq_t wb_err;
- struct address_space *mapping;
memset(&info, 0, sizeof(info));
sb = journal->j_superblock;
@@ -308,9 +306,6 @@ int jbd2_journal_recover(journal_t *journal)
return 0;
}
- wb_err = 0;
- mapping = journal->j_fs_dev->bd_inode->i_mapping;
- errseq_check_and_advance(&mapping->wb_err, &wb_err);
err = do_one_pass(journal, &info, PASS_SCAN);
if (!err)
err = do_one_pass(journal, &info, PASS_REVOKE);
@@ -334,7 +329,7 @@ int jbd2_journal_recover(journal_t *journal)
err2 = sync_blockdev(journal->j_fs_dev);
if (!err)
err = err2;
- err2 = errseq_check_and_advance(&mapping->wb_err, &wb_err);
+ err2 = jbd2_check_fs_dev_write_error(journal);
if (!err)
err = err2;
/* Make sure all replayed data is on permanent storage */
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 5f08b5fd105a..cb0b8d6fc0c6 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1231,11 +1231,25 @@ out:
int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh)
{
struct journal_head *jh;
+ journal_t *journal;
int rc;
if (is_handle_aborted(handle))
return -EROFS;
+ journal = handle->h_transaction->t_journal;
+ if (jbd2_check_fs_dev_write_error(journal)) {
+ /*
+ * If the fs dev has writeback errors, it may have failed
+ * to async write out metadata buffers in the background.
+ * In this case, we could read old data from disk and write
+ * it out again, which may lead to on-disk filesystem
+ * inconsistency. Aborting journal can avoid it happen.
+ */
+ jbd2_journal_abort(journal, -EIO);
+ return -EIO;
+ }
+
if (jbd2_write_access_granted(handle, bh, false))
return 0;
diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c
index 9d26b1b9fc01..0925caab23c4 100644
--- a/fs/jffs2/debug.c
+++ b/fs/jffs2/debug.c
@@ -157,7 +157,7 @@ __jffs2_dbg_prewrite_paranoia_check(struct jffs2_sb_info *c,
kfree(buf);
}
-void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c)
+static void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c)
{
struct jffs2_eraseblock *jeb;
uint32_t free = 0, dirty = 0, used = 0, wasted = 0,
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 11c77757ead9..cb3cda1390ad 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -63,10 +63,10 @@
*/
static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
int nblocks);
-static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
-static int dbBackSplit(dmtree_t * tp, int leafno);
-static int dbJoin(dmtree_t * tp, int leafno, int newval);
-static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
+static void dbSplit(dmtree_t *tp, int leafno, int splitsz, int newval, bool is_ctl);
+static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl);
+static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl);
+static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl);
static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
int level);
static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results);
@@ -2103,7 +2103,7 @@ static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
* system.
*/
if (dp->tree.stree[word] == NOFREE)
- dbBackSplit((dmtree_t *) & dp->tree, word);
+ dbBackSplit((dmtree_t *)&dp->tree, word, false);
dbAllocBits(bmp, dp, blkno, nblocks);
}
@@ -2189,7 +2189,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
* the binary system of the leaves if need be.
*/
dbSplit(tp, word, BUDMIN,
- dbMaxBud((u8 *) & dp->wmap[word]));
+ dbMaxBud((u8 *)&dp->wmap[word]), false);
word += 1;
} else {
@@ -2229,7 +2229,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
* system of the leaves to reflect the current
* allocation (size).
*/
- dbSplit(tp, word, size, NOFREE);
+ dbSplit(tp, word, size, NOFREE, false);
/* get the number of dmap words handled */
nw = BUDSIZE(size, BUDMIN);
@@ -2336,7 +2336,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
/* update the leaf for this dmap word.
*/
rc = dbJoin(tp, word,
- dbMaxBud((u8 *) & dp->wmap[word]));
+ dbMaxBud((u8 *)&dp->wmap[word]), false);
if (rc)
return rc;
@@ -2369,7 +2369,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
/* update the leaf.
*/
- rc = dbJoin(tp, word, size);
+ rc = dbJoin(tp, word, size, false);
if (rc)
return rc;
@@ -2521,16 +2521,16 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
* that it is at the front of a binary buddy system.
*/
if (oldval == NOFREE) {
- rc = dbBackSplit((dmtree_t *) dcp, leafno);
+ rc = dbBackSplit((dmtree_t *)dcp, leafno, true);
if (rc) {
release_metapage(mp);
return rc;
}
oldval = dcp->stree[ti];
}
- dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
+ dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval, true);
} else {
- rc = dbJoin((dmtree_t *) dcp, leafno, newval);
+ rc = dbJoin((dmtree_t *) dcp, leafno, newval, true);
if (rc) {
release_metapage(mp);
return rc;
@@ -2561,7 +2561,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
*/
if (alloc) {
dbJoin((dmtree_t *) dcp, leafno,
- oldval);
+ oldval, true);
} else {
/* the dbJoin() above might have
* caused a larger binary buddy system
@@ -2571,9 +2571,9 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
*/
if (dcp->stree[ti] == NOFREE)
dbBackSplit((dmtree_t *)
- dcp, leafno);
+ dcp, leafno, true);
dbSplit((dmtree_t *) dcp, leafno,
- dcp->budmin, oldval);
+ dcp->budmin, oldval, true);
}
/* release the buffer and return the error.
@@ -2621,7 +2621,7 @@ dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
*
* serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
*/
-static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
+static void dbSplit(dmtree_t *tp, int leafno, int splitsz, int newval, bool is_ctl)
{
int budsz;
int cursz;
@@ -2643,7 +2643,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
while (cursz >= splitsz) {
/* update the buddy's leaf with its new value.
*/
- dbAdjTree(tp, leafno ^ budsz, cursz);
+ dbAdjTree(tp, leafno ^ budsz, cursz, is_ctl);
/* on to the next size and buddy.
*/
@@ -2655,7 +2655,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
/* adjust the dmap tree to reflect the specified leaf's new
* value.
*/
- dbAdjTree(tp, leafno, newval);
+ dbAdjTree(tp, leafno, newval, is_ctl);
}
@@ -2686,7 +2686,7 @@ static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
*
* serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
*/
-static int dbBackSplit(dmtree_t * tp, int leafno)
+static int dbBackSplit(dmtree_t *tp, int leafno, bool is_ctl)
{
int budsz, bud, w, bsz, size;
int cursz;
@@ -2737,7 +2737,7 @@ static int dbBackSplit(dmtree_t * tp, int leafno)
* system in two.
*/
cursz = leaf[bud] - 1;
- dbSplit(tp, bud, cursz, cursz);
+ dbSplit(tp, bud, cursz, cursz, is_ctl);
break;
}
}
@@ -2765,7 +2765,7 @@ static int dbBackSplit(dmtree_t * tp, int leafno)
*
* RETURN VALUES: none
*/
-static int dbJoin(dmtree_t * tp, int leafno, int newval)
+static int dbJoin(dmtree_t *tp, int leafno, int newval, bool is_ctl)
{
int budsz, buddy;
s8 *leaf;
@@ -2820,12 +2820,12 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
if (leafno < buddy) {
/* leafno is the left buddy.
*/
- dbAdjTree(tp, buddy, NOFREE);
+ dbAdjTree(tp, buddy, NOFREE, is_ctl);
} else {
/* buddy is the left buddy and becomes
* leafno.
*/
- dbAdjTree(tp, leafno, NOFREE);
+ dbAdjTree(tp, leafno, NOFREE, is_ctl);
leafno = buddy;
}
@@ -2838,7 +2838,7 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
/* update the leaf value.
*/
- dbAdjTree(tp, leafno, newval);
+ dbAdjTree(tp, leafno, newval, is_ctl);
return 0;
}
@@ -2859,15 +2859,20 @@ static int dbJoin(dmtree_t * tp, int leafno, int newval)
*
* RETURN VALUES: none
*/
-static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
+static void dbAdjTree(dmtree_t *tp, int leafno, int newval, bool is_ctl)
{
int lp, pp, k;
- int max;
+ int max, size;
+
+ size = is_ctl ? CTLTREESIZE : TREESIZE;
/* pick up the index of the leaf for this leafno.
*/
lp = leafno + le32_to_cpu(tp->dmt_leafidx);
+ if (WARN_ON_ONCE(lp >= size || lp < 0))
+ return;
+
/* is the current value the same as the old value ? if so,
* there is nothing to do.
*/
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 92b7c533407c..031d8f570f58 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -633,6 +633,11 @@ int dtSearch(struct inode *ip, struct component_name * key, ino_t * data,
for (base = 0, lim = p->header.nextindex; lim; lim >>= 1) {
index = base + (lim >> 1);
+ if (stbl[index] < 0) {
+ rc = -EIO;
+ goto out;
+ }
+
if (p->header.flag & BT_LEAF) {
/* uppercase leaf name to compare */
cmp =
@@ -1970,7 +1975,7 @@ static int dtSplitRoot(tid_t tid,
do {
f = &rp->slot[fsi];
fsi = f->next;
- } while (fsi != -1);
+ } while (fsi >= 0);
f->next = n;
}
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index a037ee59e398..2ec35889ad24 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -2179,6 +2179,9 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
/* get the ag and iag numbers for this iag.
*/
agno = BLKTOAG(le64_to_cpu(iagp->agstart), sbi);
+ if (agno >= MAXAG || agno < 0)
+ return -EIO;
+
iagno = le32_to_cpu(iagp->iagnum);
/* check if this is the last free extent within the
diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c
index 415eb65a36ff..9b5c6a20b30c 100644
--- a/fs/jfs/jfs_mount.c
+++ b/fs/jfs/jfs_mount.c
@@ -172,15 +172,15 @@ int jfs_mount(struct super_block *sb)
}
jfs_info("jfs_mount: ipimap:0x%p", ipimap);
- /* map further access of per fileset inodes by the fileset inode */
- sbi->ipimap = ipimap;
-
/* initialize fileset inode allocation map */
if ((rc = diMount(ipimap))) {
jfs_err("jfs_mount: diMount failed w/rc = %d", rc);
goto err_ipimap;
}
+ /* map further access of per fileset inodes by the fileset inode */
+ sbi->ipimap = ipimap;
+
return rc;
/*
diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c
index dccc8b3f1045..be17e3c43582 100644
--- a/fs/jfs/jfs_txnmgr.c
+++ b/fs/jfs/jfs_txnmgr.c
@@ -2702,6 +2702,7 @@ int jfs_lazycommit(void *arg)
unsigned long flags;
struct jfs_sb_info *sbi;
+ set_freezable();
do {
LAZY_LOCK(flags);
jfs_commit_thread_waking = 0; /* OK to wake another thread */
@@ -2884,6 +2885,7 @@ int jfs_sync(void *arg)
struct jfs_inode_info *jfs_ip;
tid_t tid;
+ set_freezable();
do {
/*
* write each inode on the anonymous inode list
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 8b2bd65d70e7..bce1d7ac95ca 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -54,9 +54,9 @@ static bool kernfs_lockdep(struct kernfs_node *kn)
static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen)
{
if (!kn)
- return strlcpy(buf, "(null)", buflen);
+ return strscpy(buf, "(null)", buflen);
- return strlcpy(buf, kn->parent ? kn->name : "/", buflen);
+ return strscpy(buf, kn->parent ? kn->name : "/", buflen);
}
/* kernfs_node_depth - compute depth from @from to @to */
@@ -127,7 +127,7 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a,
*
* [3] when @kn_to is %NULL result will be "(null)"
*
- * Return: the length of the full path. If the full length is equal to or
+ * Return: the length of the constructed path. If the path would have been
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
@@ -138,16 +138,17 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
struct kernfs_node *kn, *common;
const char parent_str[] = "/..";
size_t depth_from, depth_to, len = 0;
+ ssize_t copied;
int i, j;
if (!kn_to)
- return strlcpy(buf, "(null)", buflen);
+ return strscpy(buf, "(null)", buflen);
if (!kn_from)
kn_from = kernfs_root(kn_to)->kn;
if (kn_from == kn_to)
- return strlcpy(buf, "/", buflen);
+ return strscpy(buf, "/", buflen);
common = kernfs_common_ancestor(kn_from, kn_to);
if (WARN_ON(!common))
@@ -158,18 +159,19 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
buf[0] = '\0';
- for (i = 0; i < depth_from; i++)
- len += strlcpy(buf + len, parent_str,
- len < buflen ? buflen - len : 0);
+ for (i = 0; i < depth_from; i++) {
+ copied = strscpy(buf + len, parent_str, buflen - len);
+ if (copied < 0)
+ return copied;
+ len += copied;
+ }
/* Calculate how many bytes we need for the rest */
for (i = depth_to - 1; i >= 0; i--) {
for (kn = kn_to, j = 0; j < i; j++)
kn = kn->parent;
- len += strlcpy(buf + len, "/",
- len < buflen ? buflen - len : 0);
- len += strlcpy(buf + len, kn->name,
- len < buflen ? buflen - len : 0);
+
+ len += scnprintf(buf + len, buflen - len, "/%s", kn->name);
}
return len;
@@ -182,12 +184,12 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to,
* @buflen: size of @buf
*
* Copies the name of @kn into @buf of @buflen bytes. The behavior is
- * similar to strlcpy().
+ * similar to strscpy().
*
* Fills buffer with "(null)" if @kn is %NULL.
*
- * Return: the length of @kn's name and if @buf isn't long enough,
- * it's filled up to @buflen-1 and nul terminated.
+ * Return: the resulting length of @buf. If @buf isn't long enough,
+ * it's filled up to @buflen-1 and nul terminated, and returns -E2BIG.
*
* This function can be called from any context.
*/
@@ -214,7 +216,7 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
* path (which includes '..'s) as needed to reach from @from to @to is
* returned.
*
- * Return: the length of the full path. If the full length is equal to or
+ * Return: the length of the constructed path. If the path would have been
* greater than @buflen, @buf contains the truncated path with the trailing
* '\0'. On error, -errno is returned.
*/
@@ -265,12 +267,10 @@ void pr_cont_kernfs_path(struct kernfs_node *kn)
sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf,
sizeof(kernfs_pr_cont_buf));
if (sz < 0) {
- pr_cont("(error)");
- goto out;
- }
-
- if (sz >= sizeof(kernfs_pr_cont_buf)) {
- pr_cont("(name too long)");
+ if (sz == -E2BIG)
+ pr_cont("(name too long)");
+ else
+ pr_cont("(error)");
goto out;
}
@@ -676,6 +676,18 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent,
{
struct kernfs_node *kn;
+ if (parent->mode & S_ISGID) {
+ /* this code block imitates inode_init_owner() for
+ * kernfs
+ */
+
+ if (parent->iattr)
+ gid = parent->iattr->ia_gid;
+
+ if (flags & KERNFS_DIR)
+ mode |= S_ISGID;
+ }
+
kn = __kernfs_new_node(kernfs_root(parent), parent,
name, mode, uid, gid, flags);
if (kn) {
@@ -850,16 +862,16 @@ static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
const unsigned char *path,
const void *ns)
{
- size_t len;
+ ssize_t len;
char *p, *name;
lockdep_assert_held_read(&kernfs_root(parent)->kernfs_rwsem);
spin_lock_irq(&kernfs_pr_cont_lock);
- len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
+ len = strscpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
- if (len >= sizeof(kernfs_pr_cont_buf)) {
+ if (len < 0) {
spin_unlock_irq(&kernfs_pr_cont_lock);
return NULL;
}
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index f0cb729e9a97..ffa4565c275a 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -447,7 +447,7 @@ static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma)
* warnings and we don't want to add spurious locking dependency
* between the two. Check whether mmap is actually implemented
* without grabbing @of->mutex by testing HAS_MMAP flag. See the
- * comment in kernfs_file_open() for more details.
+ * comment in kernfs_fop_open() for more details.
*/
if (!(of->kn->flags & KERNFS_HAS_MMAP))
return -ENODEV;
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 4628edde2e7e..0c93cad0f0ac 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -125,9 +125,6 @@ static struct dentry *__kernfs_fh_to_dentry(struct super_block *sb,
inode = kernfs_get_inode(sb, kn);
kernfs_put(kn);
- if (!inode)
- return ERR_PTR(-ESTALE);
-
return d_obtain_alias(inode);
}
diff --git a/fs/libfs.c b/fs/libfs.c
index c2aa6fd4795c..eec6031b0155 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -104,15 +104,16 @@ EXPORT_SYMBOL(dcache_dir_close);
* If no such element exists, NULL is returned.
*/
static struct dentry *scan_positives(struct dentry *cursor,
- struct list_head *p,
+ struct hlist_node **p,
loff_t count,
struct dentry *last)
{
struct dentry *dentry = cursor->d_parent, *found = NULL;
spin_lock(&dentry->d_lock);
- while ((p = p->next) != &dentry->d_subdirs) {
- struct dentry *d = list_entry(p, struct dentry, d_child);
+ while (*p) {
+ struct dentry *d = hlist_entry(*p, struct dentry, d_sib);
+ p = &d->d_sib.next;
// we must at least skip cursors, to avoid livelocks
if (d->d_flags & DCACHE_DENTRY_CURSOR)
continue;
@@ -126,8 +127,10 @@ static struct dentry *scan_positives(struct dentry *cursor,
count = 1;
}
if (need_resched()) {
- list_move(&cursor->d_child, p);
- p = &cursor->d_child;
+ if (!hlist_unhashed(&cursor->d_sib))
+ __hlist_del(&cursor->d_sib);
+ hlist_add_behind(&cursor->d_sib, &d->d_sib);
+ p = &cursor->d_sib.next;
spin_unlock(&dentry->d_lock);
cond_resched();
spin_lock(&dentry->d_lock);
@@ -159,13 +162,12 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
inode_lock_shared(dentry->d_inode);
if (offset > 2)
- to = scan_positives(cursor, &dentry->d_subdirs,
+ to = scan_positives(cursor, &dentry->d_children.first,
offset - 2, NULL);
spin_lock(&dentry->d_lock);
+ hlist_del_init(&cursor->d_sib);
if (to)
- list_move(&cursor->d_child, &to->d_child);
- else
- list_del_init(&cursor->d_child);
+ hlist_add_behind(&cursor->d_sib, &to->d_sib);
spin_unlock(&dentry->d_lock);
dput(to);
@@ -187,19 +189,16 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
struct dentry *cursor = file->private_data;
- struct list_head *anchor = &dentry->d_subdirs;
struct dentry *next = NULL;
- struct list_head *p;
+ struct hlist_node **p;
if (!dir_emit_dots(file, ctx))
return 0;
if (ctx->pos == 2)
- p = anchor;
- else if (!list_empty(&cursor->d_child))
- p = &cursor->d_child;
+ p = &dentry->d_children.first;
else
- return 0;
+ p = &cursor->d_sib.next;
while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
@@ -207,13 +206,12 @@ int dcache_readdir(struct file *file, struct dir_context *ctx)
fs_umode_to_dtype(d_inode(next)->i_mode)))
break;
ctx->pos++;
- p = &next->d_child;
+ p = &next->d_sib.next;
}
spin_lock(&dentry->d_lock);
+ hlist_del_init(&cursor->d_sib);
if (next)
- list_move_tail(&cursor->d_child, &next->d_child);
- else
- list_del_init(&cursor->d_child);
+ hlist_add_before(&cursor->d_sib, &next->d_sib);
spin_unlock(&dentry->d_lock);
dput(next);
@@ -500,12 +498,11 @@ const struct file_operations simple_offset_dir_operations = {
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
- struct dentry *child = NULL;
- struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs;
+ struct dentry *child = NULL, *d;
spin_lock(&parent->d_lock);
- while ((p = p->next) != &parent->d_subdirs) {
- struct dentry *d = container_of(p, struct dentry, d_child);
+ d = prev ? d_next_sibling(prev) : d_first_child(parent);
+ hlist_for_each_entry_from(d, d_sib) {
if (simple_positive(d)) {
spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(d))
@@ -666,7 +663,7 @@ int simple_empty(struct dentry *dentry)
int ret = 0;
spin_lock(&dentry->d_lock);
- list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+ hlist_for_each_entry(child, &dentry->d_children, d_sib) {
spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
if (simple_positive(child)) {
spin_unlock(&child->d_lock);
@@ -920,7 +917,6 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
const struct tree_descr *files)
{
struct inode *inode;
- struct dentry *root;
struct dentry *dentry;
int i;
@@ -943,8 +939,8 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
set_nlink(inode, 2);
- root = d_make_root(inode);
- if (!root)
+ s->s_root = d_make_root(inode);
+ if (!s->s_root)
return -ENOMEM;
for (i = 0; !files->name || files->name[0]; i++, files++) {
if (!files->name)
@@ -956,13 +952,13 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
"with an index of 1!\n", __func__,
s->s_type->name);
- dentry = d_alloc_name(root, files->name);
+ dentry = d_alloc_name(s->s_root, files->name);
if (!dentry)
- goto out;
+ return -ENOMEM;
inode = new_inode(s);
if (!inode) {
dput(dentry);
- goto out;
+ return -ENOMEM;
}
inode->i_mode = S_IFREG | files->mode;
simple_inode_init_ts(inode);
@@ -970,13 +966,7 @@ int simple_fill_super(struct super_block *s, unsigned long magic,
inode->i_ino = i;
d_add(dentry, inode);
}
- s->s_root = root;
return 0;
-out:
- d_genocide(root);
- shrink_dcache_parent(root);
- dput(root);
- return -ENOMEM;
}
EXPORT_SYMBOL(simple_fill_super);
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 81be07c1d3d1..ce5862482097 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -345,10 +345,10 @@ static int lockd_get(void)
serv->sv_maxconn = nlm_max_connections;
error = svc_set_num_threads(serv, NULL, 1);
- /* The thread now holds the only reference */
- svc_put(serv);
- if (error < 0)
+ if (error < 0) {
+ svc_destroy(&serv);
return error;
+ }
nlmsvc_serv = serv;
register_inetaddr_notifier(&lockd_inetaddr_notifier);
@@ -372,11 +372,9 @@ static void lockd_put(void)
unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
#endif
- svc_get(nlmsvc_serv);
svc_set_num_threads(nlmsvc_serv, NULL, 0);
- svc_put(nlmsvc_serv);
timer_delete_sync(&nlmsvc_retry);
- nlmsvc_serv = NULL;
+ svc_destroy(&nlmsvc_serv);
dprintk("lockd_down: service destroyed\n");
}
@@ -475,7 +473,6 @@ static struct ctl_table nlm_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
#endif /* CONFIG_SYSCTL */
diff --git a/fs/locks.c b/fs/locks.c
index 46d88b9e222c..cc7c117ee192 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -111,7 +111,6 @@ static struct ctl_table locks_sysctls[] = {
.proc_handler = proc_dointvec,
},
#endif /* CONFIG_MMU */
- {}
};
static int __init init_fs_locks_sysctls(void)
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 62c313fc9a49..a224cf222570 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -26,12 +26,6 @@ const struct file_operations minix_dir_operations = {
.fsync = generic_file_fsync,
};
-static inline void dir_put_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
/*
* Return the offset into page `page_nr' of the last valid
* byte in that page, plus one.
@@ -70,13 +64,14 @@ static int minix_handle_dirsync(struct inode *dir)
return err;
}
-static struct page * dir_get_page(struct inode *dir, unsigned long n)
+static void *dir_get_page(struct inode *dir, unsigned long n, struct page **p)
{
struct address_space *mapping = dir->i_mapping;
struct page *page = read_mapping_page(mapping, n, NULL);
- if (!IS_ERR(page))
- kmap(page);
- return page;
+ if (IS_ERR(page))
+ return ERR_CAST(page);
+ *p = page;
+ return kmap_local_page(page);
}
static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
@@ -104,11 +99,11 @@ static int minix_readdir(struct file *file, struct dir_context *ctx)
for ( ; n < npages; n++, offset = 0) {
char *p, *kaddr, *limit;
- struct page *page = dir_get_page(inode, n);
+ struct page *page;
- if (IS_ERR(page))
+ kaddr = dir_get_page(inode, n, &page);
+ if (IS_ERR(kaddr))
continue;
- kaddr = (char *)page_address(page);
p = kaddr+offset;
limit = kaddr + minix_last_byte(inode, n) - chunk_size;
for ( ; p <= limit; p = minix_next_entry(p, sbi)) {
@@ -127,13 +122,13 @@ static int minix_readdir(struct file *file, struct dir_context *ctx)
unsigned l = strnlen(name, sbi->s_namelen);
if (!dir_emit(ctx, name, l,
inumber, DT_UNKNOWN)) {
- dir_put_page(page);
+ unmap_and_put_page(page, p);
return 0;
}
}
ctx->pos += chunk_size;
}
- dir_put_page(page);
+ unmap_and_put_page(page, kaddr);
}
return 0;
}
@@ -173,11 +168,10 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page)
for (n = 0; n < npages; n++) {
char *kaddr, *limit;
- page = dir_get_page(dir, n);
- if (IS_ERR(page))
+ kaddr = dir_get_page(dir, n, &page);
+ if (IS_ERR(kaddr))
continue;
- kaddr = (char*)page_address(page);
limit = kaddr + minix_last_byte(dir, n) - sbi->s_dirsize;
for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
if (sbi->s_version == MINIX_V3) {
@@ -194,7 +188,7 @@ minix_dirent *minix_find_entry(struct dentry *dentry, struct page **res_page)
if (namecompare(namelen, sbi->s_namelen, name, namx))
goto found;
}
- dir_put_page(page);
+ unmap_and_put_page(page, kaddr);
}
return NULL;
@@ -229,12 +223,10 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
for (n = 0; n <= npages; n++) {
char *limit, *dir_end;
- page = dir_get_page(dir, n);
- err = PTR_ERR(page);
- if (IS_ERR(page))
- goto out;
+ kaddr = dir_get_page(dir, n, &page);
+ if (IS_ERR(kaddr))
+ return PTR_ERR(kaddr);
lock_page(page);
- kaddr = (char*)page_address(page);
dir_end = kaddr + minix_last_byte(dir, n);
limit = kaddr + PAGE_SIZE - sbi->s_dirsize;
for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
@@ -262,13 +254,13 @@ int minix_add_link(struct dentry *dentry, struct inode *inode)
goto out_unlock;
}
unlock_page(page);
- dir_put_page(page);
+ unmap_and_put_page(page, kaddr);
}
BUG();
return -EINVAL;
got_it:
- pos = page_offset(page) + p - (char *)page_address(page);
+ pos = page_offset(page) + offset_in_page(p);
err = minix_prepare_chunk(page, pos, sbi->s_dirsize);
if (err)
goto out_unlock;
@@ -285,8 +277,7 @@ got_it:
mark_inode_dirty(dir);
err = minix_handle_dirsync(dir);
out_put:
- dir_put_page(page);
-out:
+ unmap_and_put_page(page, kaddr);
return err;
out_unlock:
unlock_page(page);
@@ -296,8 +287,7 @@ out_unlock:
int minix_delete_entry(struct minix_dir_entry *de, struct page *page)
{
struct inode *inode = page->mapping->host;
- char *kaddr = page_address(page);
- loff_t pos = page_offset(page) + (char*)de - kaddr;
+ loff_t pos = page_offset(page) + offset_in_page(de);
struct minix_sb_info *sbi = minix_sb(inode->i_sb);
unsigned len = sbi->s_dirsize;
int err;
@@ -333,7 +323,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)
goto fail;
}
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_page(page);
memset(kaddr, 0, PAGE_SIZE);
if (sbi->s_version == MINIX_V3) {
@@ -353,7 +343,7 @@ int minix_make_empty(struct inode *inode, struct inode *dir)
de->inode = dir->i_ino;
strcpy(de->name, "..");
}
- kunmap_atomic(kaddr);
+ kunmap_local(kaddr);
dir_commit_chunk(page, 0, 2 * sbi->s_dirsize);
err = minix_handle_dirsync(inode);
@@ -370,17 +360,16 @@ int minix_empty_dir(struct inode * inode)
struct page *page = NULL;
unsigned long i, npages = dir_pages(inode);
struct minix_sb_info *sbi = minix_sb(inode->i_sb);
- char *name;
+ char *name, *kaddr;
__u32 inumber;
for (i = 0; i < npages; i++) {
- char *p, *kaddr, *limit;
+ char *p, *limit;
- page = dir_get_page(inode, i);
- if (IS_ERR(page))
+ kaddr = dir_get_page(inode, i, &page);
+ if (IS_ERR(kaddr))
continue;
- kaddr = (char *)page_address(page);
limit = kaddr + minix_last_byte(inode, i) - sbi->s_dirsize;
for (p = kaddr; p <= limit; p = minix_next_entry(p, sbi)) {
if (sbi->s_version == MINIX_V3) {
@@ -406,12 +395,12 @@ int minix_empty_dir(struct inode * inode)
goto not_empty;
}
}
- dir_put_page(page);
+ unmap_and_put_page(page, kaddr);
}
return 1;
not_empty:
- dir_put_page(page);
+ unmap_and_put_page(page, kaddr);
return 0;
}
@@ -421,8 +410,7 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page,
{
struct inode *dir = page->mapping->host;
struct minix_sb_info *sbi = minix_sb(dir->i_sb);
- loff_t pos = page_offset(page) +
- (char *)de-(char*)page_address(page);
+ loff_t pos = page_offset(page) + offset_in_page(de);
int err;
lock_page(page);
@@ -443,15 +431,12 @@ int minix_set_link(struct minix_dir_entry *de, struct page *page,
struct minix_dir_entry * minix_dotdot (struct inode *dir, struct page **p)
{
- struct page *page = dir_get_page(dir, 0);
struct minix_sb_info *sbi = minix_sb(dir->i_sb);
- struct minix_dir_entry *de = NULL;
+ struct minix_dir_entry *de = dir_get_page(dir, 0, p);
- if (!IS_ERR(page)) {
- de = minix_next_entry(page_address(page), sbi);
- *p = page;
- }
- return de;
+ if (!IS_ERR(de))
+ return minix_next_entry(de, sbi);
+ return NULL;
}
ino_t minix_inode_by_name(struct dentry *dentry)
@@ -469,7 +454,7 @@ ino_t minix_inode_by_name(struct dentry *dentry)
res = ((minix3_dirent *) de)->inode;
else
res = de->inode;
- dir_put_page(page);
+ unmap_and_put_page(page, de);
}
return res;
}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index f8af6c3ae336..73f37f298087 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -17,6 +17,7 @@
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/highuid.h>
+#include <linux/mpage.h>
#include <linux/vfs.h>
#include <linux/writeback.h>
@@ -397,9 +398,10 @@ static int minix_get_block(struct inode *inode, sector_t block,
return V2_minix_get_block(inode, block, bh_result, create);
}
-static int minix_writepage(struct page *page, struct writeback_control *wbc)
+static int minix_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page, minix_get_block, wbc);
+ return mpage_writepages(mapping, wbc, minix_get_block);
}
static int minix_read_folio(struct file *file, struct folio *folio)
@@ -444,9 +446,10 @@ static const struct address_space_operations minix_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = minix_read_folio,
- .writepage = minix_writepage,
+ .writepages = minix_writepages,
.write_begin = minix_write_begin,
.write_end = generic_write_end,
+ .migrate_folio = buffer_migrate_folio,
.bmap = minix_bmap,
.direct_IO = noop_direct_IO
};
diff --git a/fs/minix/namei.c b/fs/minix/namei.c
index 114084d5636a..d6031acc34f0 100644
--- a/fs/minix/namei.c
+++ b/fs/minix/namei.c
@@ -149,8 +149,7 @@ static int minix_unlink(struct inode * dir, struct dentry *dentry)
if (!de)
return -ENOENT;
err = minix_delete_entry(de, page);
- kunmap(page);
- put_page(page);
+ unmap_and_put_page(page, de);
if (err)
return err;
@@ -242,13 +241,10 @@ static int minix_rename(struct mnt_idmap *idmap,
inode_dec_link_count(old_dir);
}
out_dir:
- if (dir_de) {
- kunmap(dir_page);
- put_page(dir_page);
- }
+ if (dir_de)
+ unmap_and_put_page(dir_page, dir_de);
out_old:
- kunmap(old_page);
- put_page(old_page);
+ unmap_and_put_page(old_page, old_de);
out:
return err;
}
diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c
index 57d1dedf3f8f..64c5205e2b5e 100644
--- a/fs/mnt_idmapping.c
+++ b/fs/mnt_idmapping.c
@@ -9,8 +9,16 @@
#include "internal.h"
+/*
+ * Outside of this file vfs{g,u}id_t are always created from k{g,u}id_t,
+ * never from raw values. These are just internal helpers.
+ */
+#define VFSUIDT_INIT_RAW(val) (vfsuid_t){ val }
+#define VFSGIDT_INIT_RAW(val) (vfsgid_t){ val }
+
struct mnt_idmap {
- struct user_namespace *owner;
+ struct uid_gid_map uid_map;
+ struct uid_gid_map gid_map;
refcount_t count;
};
@@ -20,25 +28,11 @@ struct mnt_idmap {
* mapped to {g,u}id 1, [...], {g,u}id 1000 to {g,u}id 1000, [...].
*/
struct mnt_idmap nop_mnt_idmap = {
- .owner = &init_user_ns,
.count = REFCOUNT_INIT(1),
};
EXPORT_SYMBOL_GPL(nop_mnt_idmap);
/**
- * check_fsmapping - check whether an mount idmapping is allowed
- * @idmap: idmap of the relevent mount
- * @sb: super block of the filesystem
- *
- * Return: true if @idmap is allowed, false if not.
- */
-bool check_fsmapping(const struct mnt_idmap *idmap,
- const struct super_block *sb)
-{
- return idmap->owner != sb->s_user_ns;
-}
-
-/**
* initial_idmapping - check whether this is the initial mapping
* @ns: idmapping to check
*
@@ -53,26 +47,6 @@ static inline bool initial_idmapping(const struct user_namespace *ns)
}
/**
- * no_idmapping - check whether we can skip remapping a kuid/gid
- * @mnt_userns: the mount's idmapping
- * @fs_userns: the filesystem's idmapping
- *
- * This function can be used to check whether a remapping between two
- * idmappings is required.
- * An idmapped mount is a mount that has an idmapping attached to it that
- * is different from the filsystem's idmapping and the initial idmapping.
- * If the initial mapping is used or the idmapping of the mount and the
- * filesystem are identical no remapping is required.
- *
- * Return: true if remapping can be skipped, false if not.
- */
-static inline bool no_idmapping(const struct user_namespace *mnt_userns,
- const struct user_namespace *fs_userns)
-{
- return initial_idmapping(mnt_userns) || mnt_userns == fs_userns;
-}
-
-/**
* make_vfsuid - map a filesystem kuid according to an idmapping
* @idmap: the mount's idmapping
* @fs_userns: the filesystem's idmapping
@@ -81,8 +55,8 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns,
* Take a @kuid and remap it from @fs_userns into @idmap. Use this
* function when preparing a @kuid to be reported to userspace.
*
- * If no_idmapping() determines that this is not an idmapped mount we can
- * simply return @kuid unchanged.
+ * If initial_idmapping() determines that this is not an idmapped mount
+ * we can simply return @kuid unchanged.
* If initial_idmapping() tells us that the filesystem is not mounted with an
* idmapping we know the value of @kuid won't change when calling
* from_kuid() so we can simply retrieve the value via __kuid_val()
@@ -94,13 +68,12 @@ static inline bool no_idmapping(const struct user_namespace *mnt_userns,
*/
vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
- struct user_namespace *fs_userns,
- kuid_t kuid)
+ struct user_namespace *fs_userns,
+ kuid_t kuid)
{
uid_t uid;
- struct user_namespace *mnt_userns = idmap->owner;
- if (no_idmapping(mnt_userns, fs_userns))
+ if (idmap == &nop_mnt_idmap)
return VFSUIDT_INIT(kuid);
if (initial_idmapping(fs_userns))
uid = __kuid_val(kuid);
@@ -108,7 +81,7 @@ vfsuid_t make_vfsuid(struct mnt_idmap *idmap,
uid = from_kuid(fs_userns, kuid);
if (uid == (uid_t)-1)
return INVALID_VFSUID;
- return VFSUIDT_INIT(make_kuid(mnt_userns, uid));
+ return VFSUIDT_INIT_RAW(map_id_down(&idmap->uid_map, uid));
}
EXPORT_SYMBOL_GPL(make_vfsuid);
@@ -121,8 +94,8 @@ EXPORT_SYMBOL_GPL(make_vfsuid);
* Take a @kgid and remap it from @fs_userns into @idmap. Use this
* function when preparing a @kgid to be reported to userspace.
*
- * If no_idmapping() determines that this is not an idmapped mount we can
- * simply return @kgid unchanged.
+ * If initial_idmapping() determines that this is not an idmapped mount
+ * we can simply return @kgid unchanged.
* If initial_idmapping() tells us that the filesystem is not mounted with an
* idmapping we know the value of @kgid won't change when calling
* from_kgid() so we can simply retrieve the value via __kgid_val()
@@ -136,9 +109,8 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
struct user_namespace *fs_userns, kgid_t kgid)
{
gid_t gid;
- struct user_namespace *mnt_userns = idmap->owner;
- if (no_idmapping(mnt_userns, fs_userns))
+ if (idmap == &nop_mnt_idmap)
return VFSGIDT_INIT(kgid);
if (initial_idmapping(fs_userns))
gid = __kgid_val(kgid);
@@ -146,7 +118,7 @@ vfsgid_t make_vfsgid(struct mnt_idmap *idmap,
gid = from_kgid(fs_userns, kgid);
if (gid == (gid_t)-1)
return INVALID_VFSGID;
- return VFSGIDT_INIT(make_kgid(mnt_userns, gid));
+ return VFSGIDT_INIT_RAW(map_id_down(&idmap->gid_map, gid));
}
EXPORT_SYMBOL_GPL(make_vfsgid);
@@ -165,11 +137,10 @@ kuid_t from_vfsuid(struct mnt_idmap *idmap,
struct user_namespace *fs_userns, vfsuid_t vfsuid)
{
uid_t uid;
- struct user_namespace *mnt_userns = idmap->owner;
- if (no_idmapping(mnt_userns, fs_userns))
+ if (idmap == &nop_mnt_idmap)
return AS_KUIDT(vfsuid);
- uid = from_kuid(mnt_userns, AS_KUIDT(vfsuid));
+ uid = map_id_up(&idmap->uid_map, __vfsuid_val(vfsuid));
if (uid == (uid_t)-1)
return INVALID_UID;
if (initial_idmapping(fs_userns))
@@ -193,11 +164,10 @@ kgid_t from_vfsgid(struct mnt_idmap *idmap,
struct user_namespace *fs_userns, vfsgid_t vfsgid)
{
gid_t gid;
- struct user_namespace *mnt_userns = idmap->owner;
- if (no_idmapping(mnt_userns, fs_userns))
+ if (idmap == &nop_mnt_idmap)
return AS_KGIDT(vfsgid);
- gid = from_kgid(mnt_userns, AS_KGIDT(vfsgid));
+ gid = map_id_up(&idmap->gid_map, __vfsgid_val(vfsgid));
if (gid == (gid_t)-1)
return INVALID_GID;
if (initial_idmapping(fs_userns))
@@ -228,16 +198,91 @@ int vfsgid_in_group_p(vfsgid_t vfsgid)
#endif
EXPORT_SYMBOL_GPL(vfsgid_in_group_p);
+static int copy_mnt_idmap(struct uid_gid_map *map_from,
+ struct uid_gid_map *map_to)
+{
+ struct uid_gid_extent *forward, *reverse;
+ u32 nr_extents = READ_ONCE(map_from->nr_extents);
+ /* Pairs with smp_wmb() when writing the idmapping. */
+ smp_rmb();
+
+ /*
+ * Don't blindly copy @map_to into @map_from if nr_extents is
+ * smaller or equal to UID_GID_MAP_MAX_BASE_EXTENTS. Since we
+ * read @nr_extents someone could have written an idmapping and
+ * then we might end up with inconsistent data. So just don't do
+ * anything at all.
+ */
+ if (nr_extents == 0)
+ return 0;
+
+ /*
+ * Here we know that nr_extents is greater than zero which means
+ * a map has been written. Since idmappings can't be changed
+ * once they have been written we know that we can safely copy
+ * from @map_to into @map_from.
+ */
+
+ if (nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) {
+ *map_to = *map_from;
+ return 0;
+ }
+
+ forward = kmemdup(map_from->forward,
+ nr_extents * sizeof(struct uid_gid_extent),
+ GFP_KERNEL_ACCOUNT);
+ if (!forward)
+ return -ENOMEM;
+
+ reverse = kmemdup(map_from->reverse,
+ nr_extents * sizeof(struct uid_gid_extent),
+ GFP_KERNEL_ACCOUNT);
+ if (!reverse) {
+ kfree(forward);
+ return -ENOMEM;
+ }
+
+ /*
+ * The idmapping isn't exposed anywhere so we don't need to care
+ * about ordering between extent pointers and @nr_extents
+ * initialization.
+ */
+ map_to->forward = forward;
+ map_to->reverse = reverse;
+ map_to->nr_extents = nr_extents;
+ return 0;
+}
+
+static void free_mnt_idmap(struct mnt_idmap *idmap)
+{
+ if (idmap->uid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+ kfree(idmap->uid_map.forward);
+ kfree(idmap->uid_map.reverse);
+ }
+ if (idmap->gid_map.nr_extents > UID_GID_MAP_MAX_BASE_EXTENTS) {
+ kfree(idmap->gid_map.forward);
+ kfree(idmap->gid_map.reverse);
+ }
+ kfree(idmap);
+}
+
struct mnt_idmap *alloc_mnt_idmap(struct user_namespace *mnt_userns)
{
struct mnt_idmap *idmap;
+ int ret;
idmap = kzalloc(sizeof(struct mnt_idmap), GFP_KERNEL_ACCOUNT);
if (!idmap)
return ERR_PTR(-ENOMEM);
- idmap->owner = get_user_ns(mnt_userns);
refcount_set(&idmap->count, 1);
+ ret = copy_mnt_idmap(&mnt_userns->uid_map, &idmap->uid_map);
+ if (!ret)
+ ret = copy_mnt_idmap(&mnt_userns->gid_map, &idmap->gid_map);
+ if (ret) {
+ free_mnt_idmap(idmap);
+ idmap = ERR_PTR(ret);
+ }
return idmap;
}
@@ -267,9 +312,7 @@ EXPORT_SYMBOL_GPL(mnt_idmap_get);
*/
void mnt_idmap_put(struct mnt_idmap *idmap)
{
- if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count)) {
- put_user_ns(idmap->owner);
- kfree(idmap);
- }
+ if (idmap != &nop_mnt_idmap && refcount_dec_and_test(&idmap->count))
+ free_mnt_idmap(idmap);
}
EXPORT_SYMBOL_GPL(mnt_idmap_put);
diff --git a/fs/mount.h b/fs/mount.h
index 130c07c2f8d2..4a42fc68f4cc 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -8,19 +8,13 @@
struct mnt_namespace {
struct ns_common ns;
struct mount * root;
- /*
- * Traversal and modification of .list is protected by either
- * - taking namespace_sem for write, OR
- * - taking namespace_sem for read AND taking .ns_lock.
- */
- struct list_head list;
- spinlock_t ns_lock;
+ struct rb_root mounts; /* Protected by namespace_sem */
struct user_namespace *user_ns;
struct ucounts *ucounts;
u64 seq; /* Sequence number to prevent loops */
wait_queue_head_t poll;
u64 event;
- unsigned int mounts; /* # of mounts in the namespace */
+ unsigned int nr_mounts; /* # of mounts in the namespace */
unsigned int pending_mounts;
} __randomize_layout;
@@ -55,7 +49,10 @@ struct mount {
struct list_head mnt_child; /* and going through their mnt_child */
struct list_head mnt_instance; /* mount instance on sb->s_mounts */
const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */
- struct list_head mnt_list;
+ union {
+ struct rb_node mnt_node; /* Under ns->mounts */
+ struct list_head mnt_list;
+ };
struct list_head mnt_expire; /* link in fs-specific expiry list */
struct list_head mnt_share; /* circular list of shared mounts */
struct list_head mnt_slave_list;/* list of slave mounts */
@@ -72,7 +69,8 @@ struct mount {
struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks;
__u32 mnt_fsnotify_mask;
#endif
- int mnt_id; /* mount identifier */
+ int mnt_id; /* mount identifier, reused */
+ u64 mnt_id_unique; /* mount ID unique until reboot */
int mnt_group_id; /* peer group identifier */
int mnt_expiry_mark; /* true if marked for expiry */
struct hlist_head mnt_pins;
@@ -127,7 +125,6 @@ struct proc_mounts {
struct mnt_namespace *ns;
struct path root;
int (*show)(struct seq_file *, struct vfsmount *);
- struct mount cursor;
};
extern const struct seq_operations mounts_op;
@@ -146,4 +143,12 @@ static inline bool is_anon_ns(struct mnt_namespace *ns)
return ns->seq == 0;
}
+static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list)
+{
+ WARN_ON(!(mnt->mnt.mnt_flags & MNT_ONRB));
+ mnt->mnt.mnt_flags &= ~MNT_ONRB;
+ rb_erase(&mnt->mnt_node, &mnt->mnt_ns->mounts);
+ list_add_tail(&mnt->mnt_list, dt_list);
+}
+
extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
diff --git a/fs/mpage.c b/fs/mpage.c
index ffb064ed9d04..738882e0766d 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -166,7 +166,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
sector_t block_in_file;
sector_t last_block;
sector_t last_block_in_file;
- sector_t blocks[MAX_BUF_PER_PAGE];
+ sector_t first_block;
unsigned page_block;
unsigned first_hole = blocks_per_page;
struct block_device *bdev = NULL;
@@ -205,6 +205,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
unsigned map_offset = block_in_file - args->first_logical_block;
unsigned last = nblocks - map_offset;
+ first_block = map_bh->b_blocknr + map_offset;
for (relative_block = 0; ; relative_block++) {
if (relative_block == last) {
clear_buffer_mapped(map_bh);
@@ -212,8 +213,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
}
if (page_block == blocks_per_page)
break;
- blocks[page_block] = map_bh->b_blocknr + map_offset +
- relative_block;
page_block++;
block_in_file++;
}
@@ -259,7 +258,9 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
- if (page_block && blocks[page_block-1] != map_bh->b_blocknr-1)
+ if (!page_block)
+ first_block = map_bh->b_blocknr;
+ else if (first_block + page_block != map_bh->b_blocknr)
goto confused;
nblocks = map_bh->b_size >> blkbits;
for (relative_block = 0; ; relative_block++) {
@@ -268,7 +269,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
break;
} else if (page_block == blocks_per_page)
break;
- blocks[page_block] = map_bh->b_blocknr+relative_block;
page_block++;
block_in_file++;
}
@@ -289,7 +289,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
/*
* This folio will go to BIO. Do we need to send this BIO off first?
*/
- if (args->bio && (args->last_block_in_bio != blocks[0] - 1))
+ if (args->bio && (args->last_block_in_bio != first_block - 1))
args->bio = mpage_bio_submit_read(args->bio);
alloc_new:
@@ -298,7 +298,7 @@ alloc_new:
gfp);
if (args->bio == NULL)
goto confused;
- args->bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
+ args->bio->bi_iter.bi_sector = first_block << (blkbits - 9);
}
length = first_hole << blkbits;
@@ -313,7 +313,7 @@ alloc_new:
(first_hole != blocks_per_page))
args->bio = mpage_bio_submit_read(args->bio);
else
- args->last_block_in_bio = blocks[blocks_per_page - 1];
+ args->last_block_in_bio = first_block + blocks_per_page - 1;
out:
return args->bio;
@@ -430,13 +430,13 @@ struct mpage_data {
* We have our BIO, so we can now mark the buffers clean. Make
* sure to only clean buffers which we know we'll be writing.
*/
-static void clean_buffers(struct page *page, unsigned first_unmapped)
+static void clean_buffers(struct folio *folio, unsigned first_unmapped)
{
unsigned buffer_counter = 0;
- struct buffer_head *bh, *head;
- if (!page_has_buffers(page))
+ struct buffer_head *bh, *head = folio_buffers(folio);
+
+ if (!head)
return;
- head = page_buffers(page);
bh = head;
do {
@@ -451,18 +451,8 @@ static void clean_buffers(struct page *page, unsigned first_unmapped)
* read_folio would fail to serialize with the bh and it would read from
* disk before we reach the platter.
*/
- if (buffer_heads_over_limit && PageUptodate(page))
- try_to_free_buffers(page_folio(page));
-}
-
-/*
- * For situations where we want to clean all buffers attached to a page.
- * We don't need to calculate how many buffers are attached to the page,
- * we just need to specify a number larger than the maximum number of buffers.
- */
-void clean_page_buffers(struct page *page)
-{
- clean_buffers(page, ~0U);
+ if (buffer_heads_over_limit && folio_test_uptodate(folio))
+ try_to_free_buffers(folio);
}
static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
@@ -476,7 +466,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
sector_t last_block;
sector_t block_in_file;
- sector_t blocks[MAX_BUF_PER_PAGE];
+ sector_t first_block;
unsigned page_block;
unsigned first_unmapped = blocks_per_page;
struct block_device *bdev = NULL;
@@ -514,10 +504,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
if (!buffer_dirty(bh) || !buffer_uptodate(bh))
goto confused;
if (page_block) {
- if (bh->b_blocknr != blocks[page_block-1] + 1)
+ if (bh->b_blocknr != first_block + page_block)
goto confused;
+ } else {
+ first_block = bh->b_blocknr;
}
- blocks[page_block++] = bh->b_blocknr;
+ page_block++;
boundary = buffer_boundary(bh);
if (boundary) {
boundary_block = bh->b_blocknr;
@@ -566,10 +558,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
boundary_bdev = map_bh.b_bdev;
}
if (page_block) {
- if (map_bh.b_blocknr != blocks[page_block-1] + 1)
+ if (map_bh.b_blocknr != first_block + page_block)
goto confused;
+ } else {
+ first_block = map_bh.b_blocknr;
}
- blocks[page_block++] = map_bh.b_blocknr;
+ page_block++;
boundary = buffer_boundary(&map_bh);
bdev = map_bh.b_bdev;
if (block_in_file == last_block)
@@ -601,7 +595,7 @@ page_is_mapped:
/*
* This page will go to BIO. Do we need to send this BIO off first?
*/
- if (bio && mpd->last_block_in_bio != blocks[0] - 1)
+ if (bio && mpd->last_block_in_bio != first_block - 1)
bio = mpage_bio_submit_write(bio);
alloc_new:
@@ -609,7 +603,7 @@ alloc_new:
bio = bio_alloc(bdev, BIO_MAX_VECS,
REQ_OP_WRITE | wbc_to_write_flags(wbc),
GFP_NOFS);
- bio->bi_iter.bi_sector = blocks[0] << (blkbits - 9);
+ bio->bi_iter.bi_sector = first_block << (blkbits - 9);
wbc_init_bio(wbc, bio);
}
@@ -625,7 +619,7 @@ alloc_new:
goto alloc_new;
}
- clean_buffers(&folio->page, first_unmapped);
+ clean_buffers(folio, first_unmapped);
BUG_ON(folio_test_writeback(folio));
folio_start_writeback(folio);
@@ -637,7 +631,7 @@ alloc_new:
boundary_block, 1 << blkbits);
}
} else {
- mpd->last_block_in_bio = blocks[blocks_per_page - 1];
+ mpd->last_block_in_bio = first_block + blocks_per_page - 1;
}
goto out;
@@ -648,7 +642,7 @@ confused:
/*
* The caller has a ref on the inode, so *mapping is stable
*/
- ret = block_write_full_page(&folio->page, mpd->get_block, wbc);
+ ret = block_write_full_folio(folio, wbc, mpd->get_block);
mapping_set_error(mapping, ret);
out:
mpd->bio = bio;
diff --git a/fs/namei.c b/fs/namei.c
index 71c13b2990b4..9342fa6a38c2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -289,7 +289,7 @@ EXPORT_SYMBOL(putname);
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
static int check_acl(struct mnt_idmap *idmap,
struct inode *inode, int mask)
@@ -334,7 +334,7 @@ static int check_acl(struct mnt_idmap *idmap,
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
static int acl_permission_check(struct mnt_idmap *idmap,
struct inode *inode, int mask)
@@ -395,7 +395,7 @@ static int acl_permission_check(struct mnt_idmap *idmap,
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int generic_permission(struct mnt_idmap *idmap, struct inode *inode,
int mask)
@@ -1071,7 +1071,6 @@ static struct ctl_table namei_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_TWO,
},
- { }
};
static int __init init_fs_namei_sysctls(void)
@@ -1718,7 +1717,11 @@ static inline int may_lookup(struct mnt_idmap *idmap,
{
if (nd->flags & LOOKUP_RCU) {
int err = inode_permission(idmap, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
- if (err != -ECHILD || !try_to_unlazy(nd))
+ if (!err) // success, keep going
+ return 0;
+ if (!try_to_unlazy(nd))
+ return -ECHILD; // redo it all non-lazy
+ if (err != -ECHILD) // hard error
return err;
}
return inode_permission(idmap, nd->inode, MAY_EXEC);
@@ -2467,7 +2470,7 @@ static int handle_lookup_down(struct nameidata *nd)
return PTR_ERR(step_into(nd, WALK_NOFOLLOW, nd->path.dentry));
}
-/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
+/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path)
{
const char *s = path_init(nd, flags);
@@ -2522,7 +2525,7 @@ int filename_lookup(int dfd, struct filename *name, unsigned flags,
return retval;
}
-/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
+/* Returns 0 and nd will be valid on success; Returns error, otherwise. */
static int path_parentat(struct nameidata *nd, unsigned flags,
struct path *parent)
{
@@ -2573,13 +2576,13 @@ static int filename_parentat(int dfd, struct filename *name,
}
/* does lookup, returns the object with parent locked */
-static struct dentry *__kern_path_locked(struct filename *name, struct path *path)
+static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path)
{
struct dentry *d;
struct qstr last;
int type, error;
- error = filename_parentat(AT_FDCWD, name, 0, path, &last, &type);
+ error = filename_parentat(dfd, name, 0, path, &last, &type);
if (error)
return ERR_PTR(error);
if (unlikely(type != LAST_NORM)) {
@@ -2598,12 +2601,22 @@ static struct dentry *__kern_path_locked(struct filename *name, struct path *pat
struct dentry *kern_path_locked(const char *name, struct path *path)
{
struct filename *filename = getname_kernel(name);
- struct dentry *res = __kern_path_locked(filename, path);
+ struct dentry *res = __kern_path_locked(AT_FDCWD, filename, path);
putname(filename);
return res;
}
+struct dentry *user_path_locked_at(int dfd, const char __user *name, struct path *path)
+{
+ struct filename *filename = getname(name);
+ struct dentry *res = __kern_path_locked(dfd, filename, path);
+
+ putname(filename);
+ return res;
+}
+EXPORT_SYMBOL(user_path_locked_at);
+
int kern_path(const char *name, unsigned int flags, struct path *path)
{
struct filename *filename = getname_kernel(name);
@@ -3014,27 +3027,37 @@ static inline int may_create(struct mnt_idmap *idmap,
return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
}
+// p1 != p2, both are on the same filesystem, ->s_vfs_rename_mutex is held
static struct dentry *lock_two_directories(struct dentry *p1, struct dentry *p2)
{
- struct dentry *p;
+ struct dentry *p = p1, *q = p2, *r;
- p = d_ancestor(p2, p1);
- if (p) {
+ while ((r = p->d_parent) != p2 && r != p)
+ p = r;
+ if (r == p2) {
+ // p is a child of p2 and an ancestor of p1 or p1 itself
inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
- inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
+ inode_lock_nested(p1->d_inode, I_MUTEX_PARENT2);
return p;
}
-
- p = d_ancestor(p1, p2);
- if (p) {
+ // p is the root of connected component that contains p1
+ // p2 does not occur on the path from p to p1
+ while ((r = q->d_parent) != p1 && r != p && r != q)
+ q = r;
+ if (r == p1) {
+ // q is a child of p1 and an ancestor of p2 or p2 itself
inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
- inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
- return p;
+ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
+ return q;
+ } else if (likely(r == p)) {
+ // both p2 and p1 are descendents of p
+ inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+ inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
+ return NULL;
+ } else { // no common ancestor at the time we'd been called
+ mutex_unlock(&p1->d_sb->s_vfs_rename_mutex);
+ return ERR_PTR(-EXDEV);
}
-
- lock_two_inodes(p1->d_inode, p2->d_inode,
- I_MUTEX_PARENT, I_MUTEX_PARENT2);
- return NULL;
}
/*
@@ -3158,7 +3181,7 @@ static inline umode_t vfs_prepare_mode(struct mnt_idmap *idmap,
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool want_excl)
@@ -3646,7 +3669,7 @@ static int do_open(struct nameidata *nd,
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
static int vfs_tmpfile(struct mnt_idmap *idmap,
const struct path *parentpath,
@@ -3785,10 +3808,7 @@ static struct file *path_openat(struct nameidata *nd,
WARN_ON(1);
error = -EINVAL;
}
- if (unlikely(file->f_mode & FMODE_OPENED))
- fput(file);
- else
- release_empty_file(file);
+ fput(file);
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
@@ -3954,7 +3974,7 @@ EXPORT_SYMBOL(user_path_create);
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, dev_t dev)
@@ -4080,7 +4100,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode)
@@ -4161,7 +4181,7 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry)
@@ -4290,7 +4310,7 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, struct inode **delegated_inode)
@@ -4443,7 +4463,7 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, const char *oldname)
@@ -4535,7 +4555,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
* the vfsmount must be passed through @idmap. This function will then take
* care to map the inode according to @idmap before checking permissions.
* On non-idmapped mounts or if permission checking is to be performed on the
- * raw inode simply passs @nop_mnt_idmap.
+ * raw inode simply pass @nop_mnt_idmap.
*/
int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
struct inode *dir, struct dentry *new_dentry,
@@ -4716,11 +4736,12 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
*
* a) we can get into loop creation.
* b) race potential - two innocent renames can create a loop together.
- * That's where 4.4 screws up. Current fix: serialization on
+ * That's where 4.4BSD screws up. Current fix: serialization on
* sb->s_vfs_rename_mutex. We might be more accurate, but that's another
* story.
- * c) we have to lock _four_ objects - parents and victim (if it exists),
- * and source.
+ * c) we may have to lock up to _four_ objects - parents and victim (if it exists),
+ * and source (if it's a non-directory or a subdirectory that moves to
+ * different parent).
* And that - after we got ->i_mutex on parents (until then we don't know
* whether the target exists). Solution: try to be smart with locking
* order for inodes. We rely on the fact that tree topology may change
@@ -4752,6 +4773,7 @@ int vfs_rename(struct renamedata *rd)
bool new_is_dir = false;
unsigned max_links = new_dir->i_sb->s_max_links;
struct name_snapshot old_name;
+ bool lock_old_subdir, lock_new_subdir;
if (source == target)
return 0;
@@ -4805,15 +4827,32 @@ int vfs_rename(struct renamedata *rd)
take_dentry_name_snapshot(&old_name, old_dentry);
dget(new_dentry);
/*
- * Lock all moved children. Moved directories may need to change parent
- * pointer so they need the lock to prevent against concurrent
- * directory changes moving parent pointer. For regular files we've
- * historically always done this. The lockdep locking subclasses are
- * somewhat arbitrary but RENAME_EXCHANGE in particular can swap
- * regular files and directories so it's difficult to tell which
- * subclasses to use.
+ * Lock children.
+ * The source subdirectory needs to be locked on cross-directory
+ * rename or cross-directory exchange since its parent changes.
+ * The target subdirectory needs to be locked on cross-directory
+ * exchange due to parent change and on any rename due to becoming
+ * a victim.
+ * Non-directories need locking in all cases (for NFS reasons);
+ * they get locked after any subdirectories (in inode address order).
+ *
+ * NOTE: WE ONLY LOCK UNRELATED DIRECTORIES IN CROSS-DIRECTORY CASE.
+ * NEVER, EVER DO THAT WITHOUT ->s_vfs_rename_mutex.
*/
- lock_two_inodes(source, target, I_MUTEX_NORMAL, I_MUTEX_NONDIR2);
+ lock_old_subdir = new_dir != old_dir;
+ lock_new_subdir = new_dir != old_dir || !(flags & RENAME_EXCHANGE);
+ if (is_dir) {
+ if (lock_old_subdir)
+ inode_lock_nested(source, I_MUTEX_CHILD);
+ if (target && (!new_is_dir || lock_new_subdir))
+ inode_lock(target);
+ } else if (new_is_dir) {
+ if (lock_new_subdir)
+ inode_lock_nested(target, I_MUTEX_CHILD);
+ inode_lock(source);
+ } else {
+ lock_two_nondirectories(source, target);
+ }
error = -EPERM;
if (IS_SWAPFILE(source) || (target && IS_SWAPFILE(target)))
@@ -4861,8 +4900,9 @@ int vfs_rename(struct renamedata *rd)
d_exchange(old_dentry, new_dentry);
}
out:
- inode_unlock(source);
- if (target)
+ if (!is_dir || lock_old_subdir)
+ inode_unlock(source);
+ if (target && (!new_is_dir || lock_new_subdir))
inode_unlock(target);
dput(new_dentry);
if (!error) {
@@ -4933,6 +4973,10 @@ retry:
retry_deleg:
trap = lock_rename(new_path.dentry, old_path.dentry);
+ if (IS_ERR(trap)) {
+ error = PTR_ERR(trap);
+ goto exit_lock_rename;
+ }
old_dentry = lookup_one_qstr_excl(&old_last, old_path.dentry,
lookup_flags);
@@ -5000,6 +5044,7 @@ exit4:
dput(old_dentry);
exit3:
unlock_rename(new_path.dentry, old_path.dentry);
+exit_lock_rename:
if (delegated_inode) {
error = break_deleg_wait(&delegated_inode);
if (!error)
diff --git a/fs/namespace.c b/fs/namespace.c
index fbf0e596fcd3..5a51315c6678 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -32,6 +32,7 @@
#include <linux/fs_context.h>
#include <linux/shmem_fs.h>
#include <linux/mnt_idmapping.h>
+#include <linux/nospec.h>
#include "pnode.h"
#include "internal.h"
@@ -68,6 +69,9 @@ static u64 event;
static DEFINE_IDA(mnt_id_ida);
static DEFINE_IDA(mnt_group_ida);
+/* Don't allow confusion with old 32bit mount ID */
+static atomic64_t mnt_id_ctr = ATOMIC64_INIT(1ULL << 32);
+
static struct hlist_head *mount_hashtable __ro_after_init;
static struct hlist_head *mountpoint_hashtable __ro_after_init;
static struct kmem_cache *mnt_cache __ro_after_init;
@@ -131,6 +135,7 @@ static int mnt_alloc_id(struct mount *mnt)
if (res < 0)
return res;
mnt->mnt_id = res;
+ mnt->mnt_id_unique = atomic64_inc_return(&mnt_id_ctr);
return 0;
}
@@ -730,21 +735,6 @@ struct vfsmount *lookup_mnt(const struct path *path)
return m;
}
-static inline void lock_ns_list(struct mnt_namespace *ns)
-{
- spin_lock(&ns->ns_lock);
-}
-
-static inline void unlock_ns_list(struct mnt_namespace *ns)
-{
- spin_unlock(&ns->ns_lock);
-}
-
-static inline bool mnt_is_cursor(struct mount *mnt)
-{
- return mnt->mnt.mnt_flags & MNT_CURSOR;
-}
-
/*
* __is_local_mountpoint - Test to see if dentry is a mountpoint in the
* current mount namespace.
@@ -763,19 +753,15 @@ static inline bool mnt_is_cursor(struct mount *mnt)
bool __is_local_mountpoint(struct dentry *dentry)
{
struct mnt_namespace *ns = current->nsproxy->mnt_ns;
- struct mount *mnt;
+ struct mount *mnt, *n;
bool is_covered = false;
down_read(&namespace_sem);
- lock_ns_list(ns);
- list_for_each_entry(mnt, &ns->list, mnt_list) {
- if (mnt_is_cursor(mnt))
- continue;
+ rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
is_covered = (mnt->mnt_mountpoint == dentry);
if (is_covered)
break;
}
- unlock_ns_list(ns);
up_read(&namespace_sem);
return is_covered;
@@ -1022,6 +1008,30 @@ void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct m
mnt_add_count(old_parent, -1);
}
+static inline struct mount *node_to_mount(struct rb_node *node)
+{
+ return node ? rb_entry(node, struct mount, mnt_node) : NULL;
+}
+
+static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt)
+{
+ struct rb_node **link = &ns->mounts.rb_node;
+ struct rb_node *parent = NULL;
+
+ WARN_ON(mnt->mnt.mnt_flags & MNT_ONRB);
+ mnt->mnt_ns = ns;
+ while (*link) {
+ parent = *link;
+ if (mnt->mnt_id_unique < node_to_mount(parent)->mnt_id_unique)
+ link = &parent->rb_left;
+ else
+ link = &parent->rb_right;
+ }
+ rb_link_node(&mnt->mnt_node, parent, link);
+ rb_insert_color(&mnt->mnt_node, &ns->mounts);
+ mnt->mnt.mnt_flags |= MNT_ONRB;
+}
+
/*
* vfsmount lock must be held for write
*/
@@ -1035,12 +1045,13 @@ static void commit_tree(struct mount *mnt)
BUG_ON(parent == mnt);
list_add_tail(&head, &mnt->mnt_list);
- list_for_each_entry(m, &head, mnt_list)
- m->mnt_ns = n;
+ while (!list_empty(&head)) {
+ m = list_first_entry(&head, typeof(*m), mnt_list);
+ list_del(&m->mnt_list);
- list_splice(&head, n->list.prev);
-
- n->mounts += n->pending_mounts;
+ mnt_add_to_ns(n, m);
+ }
+ n->nr_mounts += n->pending_mounts;
n->pending_mounts = 0;
__attach_mnt(mnt, parent);
@@ -1188,7 +1199,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
}
mnt->mnt.mnt_flags = old->mnt.mnt_flags;
- mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL);
+ mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL|MNT_ONRB);
atomic_inc(&sb->s_active);
mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt));
@@ -1413,65 +1424,57 @@ struct vfsmount *mnt_clone_internal(const struct path *path)
return &p->mnt;
}
-#ifdef CONFIG_PROC_FS
-static struct mount *mnt_list_next(struct mnt_namespace *ns,
- struct list_head *p)
+/*
+ * Returns the mount which either has the specified mnt_id, or has the next
+ * smallest id afer the specified one.
+ */
+static struct mount *mnt_find_id_at(struct mnt_namespace *ns, u64 mnt_id)
{
- struct mount *mnt, *ret = NULL;
+ struct rb_node *node = ns->mounts.rb_node;
+ struct mount *ret = NULL;
- lock_ns_list(ns);
- list_for_each_continue(p, &ns->list) {
- mnt = list_entry(p, typeof(*mnt), mnt_list);
- if (!mnt_is_cursor(mnt)) {
- ret = mnt;
- break;
+ while (node) {
+ struct mount *m = node_to_mount(node);
+
+ if (mnt_id <= m->mnt_id_unique) {
+ ret = node_to_mount(node);
+ if (mnt_id == m->mnt_id_unique)
+ break;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
}
}
- unlock_ns_list(ns);
-
return ret;
}
+#ifdef CONFIG_PROC_FS
+
/* iterator; we want it to have access to namespace_sem, thus here... */
static void *m_start(struct seq_file *m, loff_t *pos)
{
struct proc_mounts *p = m->private;
- struct list_head *prev;
down_read(&namespace_sem);
- if (!*pos) {
- prev = &p->ns->list;
- } else {
- prev = &p->cursor.mnt_list;
-
- /* Read after we'd reached the end? */
- if (list_empty(prev))
- return NULL;
- }
- return mnt_list_next(p->ns, prev);
+ return mnt_find_id_at(p->ns, *pos);
}
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct proc_mounts *p = m->private;
- struct mount *mnt = v;
+ struct mount *next = NULL, *mnt = v;
+ struct rb_node *node = rb_next(&mnt->mnt_node);
++*pos;
- return mnt_list_next(p->ns, &mnt->mnt_list);
+ if (node) {
+ next = node_to_mount(node);
+ *pos = next->mnt_id_unique;
+ }
+ return next;
}
static void m_stop(struct seq_file *m, void *v)
{
- struct proc_mounts *p = m->private;
- struct mount *mnt = v;
-
- lock_ns_list(p->ns);
- if (mnt)
- list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list);
- else
- list_del_init(&p->cursor.mnt_list);
- unlock_ns_list(p->ns);
up_read(&namespace_sem);
}
@@ -1489,14 +1492,6 @@ const struct seq_operations mounts_op = {
.show = m_show,
};
-void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor)
-{
- down_read(&namespace_sem);
- lock_ns_list(ns);
- list_del(&cursor->mnt_list);
- unlock_ns_list(ns);
- up_read(&namespace_sem);
-}
#endif /* CONFIG_PROC_FS */
/**
@@ -1638,7 +1633,10 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
/* Gather the mounts to umount */
for (p = mnt; p; p = next_mnt(p, mnt)) {
p->mnt.mnt_flags |= MNT_UMOUNT;
- list_move(&p->mnt_list, &tmp_list);
+ if (p->mnt.mnt_flags & MNT_ONRB)
+ move_from_ns(p, &tmp_list);
+ else
+ list_move(&p->mnt_list, &tmp_list);
}
/* Hide the mounts from mnt_mounts */
@@ -1658,7 +1656,7 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how)
list_del_init(&p->mnt_list);
ns = p->mnt_ns;
if (ns) {
- ns->mounts--;
+ ns->nr_mounts--;
__touch_mnt_namespace(ns);
}
p->mnt_ns = NULL;
@@ -1784,14 +1782,16 @@ static int do_umount(struct mount *mnt, int flags)
event++;
if (flags & MNT_DETACH) {
- if (!list_empty(&mnt->mnt_list))
+ if (mnt->mnt.mnt_flags & MNT_ONRB ||
+ !list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE);
retval = 0;
} else {
shrink_submounts(mnt);
retval = -EBUSY;
if (!propagate_mount_busy(mnt, 2)) {
- if (!list_empty(&mnt->mnt_list))
+ if (mnt->mnt.mnt_flags & MNT_ONRB ||
+ !list_empty(&mnt->mnt_list))
umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC);
retval = 0;
}
@@ -2209,9 +2209,9 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt)
unsigned int mounts = 0;
struct mount *p;
- if (ns->mounts >= max)
+ if (ns->nr_mounts >= max)
return -ENOSPC;
- max -= ns->mounts;
+ max -= ns->nr_mounts;
if (ns->pending_mounts >= max)
return -ENOSPC;
max -= ns->pending_mounts;
@@ -2355,8 +2355,12 @@ static int attach_recursive_mnt(struct mount *source_mnt,
touch_mnt_namespace(source_mnt->mnt_ns);
} else {
if (source_mnt->mnt_ns) {
+ LIST_HEAD(head);
+
/* move from anon - the caller will destroy */
- list_del_init(&source_mnt->mnt_ns->list);
+ for (p = source_mnt; p; p = next_mnt(p, source_mnt))
+ move_from_ns(p, &head);
+ list_del_init(&head);
}
if (beneath)
mnt_set_mountpoint_beneath(source_mnt, top_mnt, smp);
@@ -2667,11 +2671,10 @@ static struct file *open_detached_copy(struct path *path, bool recursive)
lock_mount_hash();
for (p = mnt; p; p = next_mnt(p, mnt)) {
- p->mnt_ns = ns;
- ns->mounts++;
+ mnt_add_to_ns(ns, p);
+ ns->nr_mounts++;
}
ns->root = mnt;
- list_add_tail(&ns->list, &mnt->mnt_list);
mntget(&mnt->mnt);
unlock_mount_hash();
namespace_unlock();
@@ -2875,7 +2878,12 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
if (IS_ERR(fc))
return PTR_ERR(fc);
+ /*
+ * Indicate to the filesystem that the remount request is coming
+ * from the legacy mount system call.
+ */
fc->oldapi = true;
+
err = parse_monolithic_mount_data(fc, data);
if (!err) {
down_write(&sb->s_umount);
@@ -3026,6 +3034,7 @@ static inline bool path_overmounted(const struct path *path)
* can_move_mount_beneath - check that we can mount beneath the top mount
* @from: mount to mount beneath
* @to: mount under which to mount
+ * @mp: mountpoint of @to
*
* - Make sure that @to->dentry is actually the root of a mount under
* which we can mount another mount.
@@ -3324,6 +3333,12 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags,
if (IS_ERR(fc))
return PTR_ERR(fc);
+ /*
+ * Indicate to the filesystem that the mount request is coming
+ * from the legacy mount system call.
+ */
+ fc->oldapi = true;
+
if (subtype)
err = vfs_parse_fs_string(fc, "subtype",
subtype, strlen(subtype));
@@ -3734,9 +3749,8 @@ static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool a
if (!anon)
new_ns->seq = atomic64_add_return(1, &mnt_ns_seq);
refcount_set(&new_ns->ns.count, 1);
- INIT_LIST_HEAD(&new_ns->list);
+ new_ns->mounts = RB_ROOT;
init_waitqueue_head(&new_ns->poll);
- spin_lock_init(&new_ns->ns_lock);
new_ns->user_ns = get_user_ns(user_ns);
new_ns->ucounts = ucounts;
return new_ns;
@@ -3783,7 +3797,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
unlock_mount_hash();
}
new_ns->root = new;
- list_add_tail(&new_ns->list, &new->mnt_list);
/*
* Second pass: switch the tsk->fs->* elements and mark new vfsmounts
@@ -3793,8 +3806,8 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
p = old;
q = new;
while (p) {
- q->mnt_ns = new_ns;
- new_ns->mounts++;
+ mnt_add_to_ns(new_ns, q);
+ new_ns->nr_mounts++;
if (new_fs) {
if (&p->mnt == new_fs->root.mnt) {
new_fs->root.mnt = mntget(&q->mnt);
@@ -3836,10 +3849,9 @@ struct dentry *mount_subtree(struct vfsmount *m, const char *name)
mntput(m);
return ERR_CAST(ns);
}
- mnt->mnt_ns = ns;
ns->root = mnt;
- ns->mounts++;
- list_add(&mnt->mnt_list, &ns->list);
+ ns->nr_mounts++;
+ mnt_add_to_ns(ns, mnt);
err = vfs_path_lookup(m->mnt_root, m,
name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
@@ -4017,10 +4029,9 @@ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags,
goto err_path;
}
mnt = real_mount(newmount.mnt);
- mnt->mnt_ns = ns;
ns->root = mnt;
- ns->mounts = 1;
- list_add(&mnt->mnt_list, &ns->list);
+ ns->nr_mounts = 1;
+ mnt_add_to_ns(ns, mnt);
mntget(newmount.mnt);
/* Attach to an apparent O_PATH fd with a note that we need to unmount
@@ -4288,7 +4299,7 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt)
* Creating an idmapped mount with the filesystem wide idmapping
* doesn't make sense so block that. We don't allow mushy semantics.
*/
- if (!check_fsmapping(kattr->mnt_idmap, m->mnt_sb))
+ if (kattr->mnt_userns == m->mnt_sb->s_user_ns)
return -EINVAL;
/*
@@ -4461,10 +4472,15 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr)
/*
* If this is an attached mount make sure it's located in the callers
* mount namespace. If it's not don't let the caller interact with it.
- * If this is a detached mount make sure it has an anonymous mount
- * namespace attached to it, i.e. we've created it via OPEN_TREE_CLONE.
+ *
+ * If this mount doesn't have a parent it's most often simply a
+ * detached mount with an anonymous mount namespace. IOW, something
+ * that's simply not attached yet. But there are apparently also users
+ * that do change mount properties on the rootfs itself. That obviously
+ * neither has a parent nor is it a detached mount so we cannot
+ * unconditionally check for detached mounts.
*/
- if (!(mnt_has_parent(mnt) ? check_mnt(mnt) : is_anon_ns(mnt->mnt_ns)))
+ if ((mnt_has_parent(mnt) || !is_anon_ns(mnt->mnt_ns)) && !check_mnt(mnt))
goto out;
/*
@@ -4676,6 +4692,444 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path,
return err;
}
+int show_path(struct seq_file *m, struct dentry *root)
+{
+ if (root->d_sb->s_op->show_path)
+ return root->d_sb->s_op->show_path(m, root);
+
+ seq_dentry(m, root, " \t\n\\");
+ return 0;
+}
+
+static struct vfsmount *lookup_mnt_in_ns(u64 id, struct mnt_namespace *ns)
+{
+ struct mount *mnt = mnt_find_id_at(ns, id);
+
+ if (!mnt || mnt->mnt_id_unique != id)
+ return NULL;
+
+ return &mnt->mnt;
+}
+
+struct kstatmount {
+ struct statmount __user *buf;
+ size_t bufsize;
+ struct vfsmount *mnt;
+ u64 mask;
+ struct path root;
+ struct statmount sm;
+ struct seq_file seq;
+};
+
+static u64 mnt_to_attr_flags(struct vfsmount *mnt)
+{
+ unsigned int mnt_flags = READ_ONCE(mnt->mnt_flags);
+ u64 attr_flags = 0;
+
+ if (mnt_flags & MNT_READONLY)
+ attr_flags |= MOUNT_ATTR_RDONLY;
+ if (mnt_flags & MNT_NOSUID)
+ attr_flags |= MOUNT_ATTR_NOSUID;
+ if (mnt_flags & MNT_NODEV)
+ attr_flags |= MOUNT_ATTR_NODEV;
+ if (mnt_flags & MNT_NOEXEC)
+ attr_flags |= MOUNT_ATTR_NOEXEC;
+ if (mnt_flags & MNT_NODIRATIME)
+ attr_flags |= MOUNT_ATTR_NODIRATIME;
+ if (mnt_flags & MNT_NOSYMFOLLOW)
+ attr_flags |= MOUNT_ATTR_NOSYMFOLLOW;
+
+ if (mnt_flags & MNT_NOATIME)
+ attr_flags |= MOUNT_ATTR_NOATIME;
+ else if (mnt_flags & MNT_RELATIME)
+ attr_flags |= MOUNT_ATTR_RELATIME;
+ else
+ attr_flags |= MOUNT_ATTR_STRICTATIME;
+
+ if (is_idmapped_mnt(mnt))
+ attr_flags |= MOUNT_ATTR_IDMAP;
+
+ return attr_flags;
+}
+
+static u64 mnt_to_propagation_flags(struct mount *m)
+{
+ u64 propagation = 0;
+
+ if (IS_MNT_SHARED(m))
+ propagation |= MS_SHARED;
+ if (IS_MNT_SLAVE(m))
+ propagation |= MS_SLAVE;
+ if (IS_MNT_UNBINDABLE(m))
+ propagation |= MS_UNBINDABLE;
+ if (!propagation)
+ propagation |= MS_PRIVATE;
+
+ return propagation;
+}
+
+static void statmount_sb_basic(struct kstatmount *s)
+{
+ struct super_block *sb = s->mnt->mnt_sb;
+
+ s->sm.mask |= STATMOUNT_SB_BASIC;
+ s->sm.sb_dev_major = MAJOR(sb->s_dev);
+ s->sm.sb_dev_minor = MINOR(sb->s_dev);
+ s->sm.sb_magic = sb->s_magic;
+ s->sm.sb_flags = sb->s_flags & (SB_RDONLY|SB_SYNCHRONOUS|SB_DIRSYNC|SB_LAZYTIME);
+}
+
+static void statmount_mnt_basic(struct kstatmount *s)
+{
+ struct mount *m = real_mount(s->mnt);
+
+ s->sm.mask |= STATMOUNT_MNT_BASIC;
+ s->sm.mnt_id = m->mnt_id_unique;
+ s->sm.mnt_parent_id = m->mnt_parent->mnt_id_unique;
+ s->sm.mnt_id_old = m->mnt_id;
+ s->sm.mnt_parent_id_old = m->mnt_parent->mnt_id;
+ s->sm.mnt_attr = mnt_to_attr_flags(&m->mnt);
+ s->sm.mnt_propagation = mnt_to_propagation_flags(m);
+ s->sm.mnt_peer_group = IS_MNT_SHARED(m) ? m->mnt_group_id : 0;
+ s->sm.mnt_master = IS_MNT_SLAVE(m) ? m->mnt_master->mnt_group_id : 0;
+}
+
+static void statmount_propagate_from(struct kstatmount *s)
+{
+ struct mount *m = real_mount(s->mnt);
+
+ s->sm.mask |= STATMOUNT_PROPAGATE_FROM;
+ if (IS_MNT_SLAVE(m))
+ s->sm.propagate_from = get_dominating_id(m, &current->fs->root);
+}
+
+static int statmount_mnt_root(struct kstatmount *s, struct seq_file *seq)
+{
+ int ret;
+ size_t start = seq->count;
+
+ ret = show_path(seq, s->mnt->mnt_root);
+ if (ret)
+ return ret;
+
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ /*
+ * Unescape the result. It would be better if supplied string was not
+ * escaped in the first place, but that's a pretty invasive change.
+ */
+ seq->buf[seq->count] = '\0';
+ seq->count = start;
+ seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL));
+ return 0;
+}
+
+static int statmount_mnt_point(struct kstatmount *s, struct seq_file *seq)
+{
+ struct vfsmount *mnt = s->mnt;
+ struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
+ int err;
+
+ err = seq_path_root(seq, &mnt_path, &s->root, "");
+ return err == SEQ_SKIP ? 0 : err;
+}
+
+static int statmount_fs_type(struct kstatmount *s, struct seq_file *seq)
+{
+ struct super_block *sb = s->mnt->mnt_sb;
+
+ seq_puts(seq, sb->s_type->name);
+ return 0;
+}
+
+static int statmount_string(struct kstatmount *s, u64 flag)
+{
+ int ret;
+ size_t kbufsize;
+ struct seq_file *seq = &s->seq;
+ struct statmount *sm = &s->sm;
+
+ switch (flag) {
+ case STATMOUNT_FS_TYPE:
+ sm->fs_type = seq->count;
+ ret = statmount_fs_type(s, seq);
+ break;
+ case STATMOUNT_MNT_ROOT:
+ sm->mnt_root = seq->count;
+ ret = statmount_mnt_root(s, seq);
+ break;
+ case STATMOUNT_MNT_POINT:
+ sm->mnt_point = seq->count;
+ ret = statmount_mnt_point(s, seq);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ return -EINVAL;
+ }
+
+ if (unlikely(check_add_overflow(sizeof(*sm), seq->count, &kbufsize)))
+ return -EOVERFLOW;
+ if (kbufsize >= s->bufsize)
+ return -EOVERFLOW;
+
+ /* signal a retry */
+ if (unlikely(seq_has_overflowed(seq)))
+ return -EAGAIN;
+
+ if (ret)
+ return ret;
+
+ seq->buf[seq->count++] = '\0';
+ sm->mask |= flag;
+ return 0;
+}
+
+static int copy_statmount_to_user(struct kstatmount *s)
+{
+ struct statmount *sm = &s->sm;
+ struct seq_file *seq = &s->seq;
+ char __user *str = ((char __user *)s->buf) + sizeof(*sm);
+ size_t copysize = min_t(size_t, s->bufsize, sizeof(*sm));
+
+ if (seq->count && copy_to_user(str, seq->buf, seq->count))
+ return -EFAULT;
+
+ /* Return the number of bytes copied to the buffer */
+ sm->size = copysize + seq->count;
+ if (copy_to_user(s->buf, sm, copysize))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int do_statmount(struct kstatmount *s)
+{
+ struct mount *m = real_mount(s->mnt);
+ int err;
+
+ /*
+ * Don't trigger audit denials. We just want to determine what
+ * mounts to show users.
+ */
+ if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
+ !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ err = security_sb_statfs(s->mnt->mnt_root);
+ if (err)
+ return err;
+
+ if (s->mask & STATMOUNT_SB_BASIC)
+ statmount_sb_basic(s);
+
+ if (s->mask & STATMOUNT_MNT_BASIC)
+ statmount_mnt_basic(s);
+
+ if (s->mask & STATMOUNT_PROPAGATE_FROM)
+ statmount_propagate_from(s);
+
+ if (s->mask & STATMOUNT_FS_TYPE)
+ err = statmount_string(s, STATMOUNT_FS_TYPE);
+
+ if (!err && s->mask & STATMOUNT_MNT_ROOT)
+ err = statmount_string(s, STATMOUNT_MNT_ROOT);
+
+ if (!err && s->mask & STATMOUNT_MNT_POINT)
+ err = statmount_string(s, STATMOUNT_MNT_POINT);
+
+ if (err)
+ return err;
+
+ return 0;
+}
+
+static inline bool retry_statmount(const long ret, size_t *seq_size)
+{
+ if (likely(ret != -EAGAIN))
+ return false;
+ if (unlikely(check_mul_overflow(*seq_size, 2, seq_size)))
+ return false;
+ if (unlikely(*seq_size > MAX_RW_COUNT))
+ return false;
+ return true;
+}
+
+static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq,
+ struct statmount __user *buf, size_t bufsize,
+ size_t seq_size)
+{
+ if (!access_ok(buf, bufsize))
+ return -EFAULT;
+
+ memset(ks, 0, sizeof(*ks));
+ ks->mask = kreq->param;
+ ks->buf = buf;
+ ks->bufsize = bufsize;
+ ks->seq.size = seq_size;
+ ks->seq.buf = kvmalloc(seq_size, GFP_KERNEL_ACCOUNT);
+ if (!ks->seq.buf)
+ return -ENOMEM;
+ return 0;
+}
+
+static int copy_mnt_id_req(const struct mnt_id_req __user *req,
+ struct mnt_id_req *kreq)
+{
+ int ret;
+ size_t usize;
+
+ BUILD_BUG_ON(sizeof(struct mnt_id_req) != MNT_ID_REQ_SIZE_VER0);
+
+ ret = get_user(usize, &req->size);
+ if (ret)
+ return -EFAULT;
+ if (unlikely(usize > PAGE_SIZE))
+ return -E2BIG;
+ if (unlikely(usize < MNT_ID_REQ_SIZE_VER0))
+ return -EINVAL;
+ memset(kreq, 0, sizeof(*kreq));
+ ret = copy_struct_from_user(kreq, sizeof(*kreq), req, usize);
+ if (ret)
+ return ret;
+ if (kreq->spare != 0)
+ return -EINVAL;
+ return 0;
+}
+
+SYSCALL_DEFINE4(statmount, const struct mnt_id_req __user *, req,
+ struct statmount __user *, buf, size_t, bufsize,
+ unsigned int, flags)
+{
+ struct vfsmount *mnt;
+ struct mnt_id_req kreq;
+ struct kstatmount ks;
+ /* We currently support retrieval of 3 strings. */
+ size_t seq_size = 3 * PATH_MAX;
+ int ret;
+
+ if (flags)
+ return -EINVAL;
+
+ ret = copy_mnt_id_req(req, &kreq);
+ if (ret)
+ return ret;
+
+retry:
+ ret = prepare_kstatmount(&ks, &kreq, buf, bufsize, seq_size);
+ if (ret)
+ return ret;
+
+ down_read(&namespace_sem);
+ mnt = lookup_mnt_in_ns(kreq.mnt_id, current->nsproxy->mnt_ns);
+ if (!mnt) {
+ up_read(&namespace_sem);
+ kvfree(ks.seq.buf);
+ return -ENOENT;
+ }
+
+ ks.mnt = mnt;
+ get_fs_root(current->fs, &ks.root);
+ ret = do_statmount(&ks);
+ path_put(&ks.root);
+ up_read(&namespace_sem);
+
+ if (!ret)
+ ret = copy_statmount_to_user(&ks);
+ kvfree(ks.seq.buf);
+ if (retry_statmount(ret, &seq_size))
+ goto retry;
+ return ret;
+}
+
+static struct mount *listmnt_next(struct mount *curr)
+{
+ return node_to_mount(rb_next(&curr->mnt_node));
+}
+
+static ssize_t do_listmount(struct mount *first, struct path *orig,
+ u64 mnt_parent_id, u64 __user *mnt_ids,
+ size_t nr_mnt_ids, const struct path *root)
+{
+ struct mount *r;
+ ssize_t ret;
+
+ /*
+ * Don't trigger audit denials. We just want to determine what
+ * mounts to show users.
+ */
+ if (!is_path_reachable(real_mount(orig->mnt), orig->dentry, root) &&
+ !ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = security_sb_statfs(orig->dentry);
+ if (ret)
+ return ret;
+
+ for (ret = 0, r = first; r && nr_mnt_ids; r = listmnt_next(r)) {
+ if (r->mnt_id_unique == mnt_parent_id)
+ continue;
+ if (!is_path_reachable(r, r->mnt.mnt_root, orig))
+ continue;
+ if (put_user(r->mnt_id_unique, mnt_ids))
+ return -EFAULT;
+ mnt_ids++;
+ nr_mnt_ids--;
+ ret++;
+ }
+ return ret;
+}
+
+SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *,
+ mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
+{
+ struct mnt_namespace *ns = current->nsproxy->mnt_ns;
+ struct mnt_id_req kreq;
+ struct mount *first;
+ struct path root, orig;
+ u64 mnt_parent_id, last_mnt_id;
+ const size_t maxcount = (size_t)-1 >> 3;
+ ssize_t ret;
+
+ if (flags)
+ return -EINVAL;
+
+ if (unlikely(nr_mnt_ids > maxcount))
+ return -EFAULT;
+
+ if (!access_ok(mnt_ids, nr_mnt_ids * sizeof(*mnt_ids)))
+ return -EFAULT;
+
+ ret = copy_mnt_id_req(req, &kreq);
+ if (ret)
+ return ret;
+ mnt_parent_id = kreq.mnt_id;
+ last_mnt_id = kreq.param;
+
+ down_read(&namespace_sem);
+ get_fs_root(current->fs, &root);
+ if (mnt_parent_id == LSMT_ROOT) {
+ orig = root;
+ } else {
+ ret = -ENOENT;
+ orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
+ if (!orig.mnt)
+ goto err;
+ orig.dentry = orig.mnt->mnt_root;
+ }
+ if (!last_mnt_id)
+ first = node_to_mount(rb_first(&ns->mounts));
+ else
+ first = mnt_find_id_at(ns, last_mnt_id + 1);
+
+ ret = do_listmount(first, &orig, mnt_parent_id, mnt_ids, nr_mnt_ids, &root);
+err:
+ path_put(&root);
+ up_read(&namespace_sem);
+ return ret;
+}
+
+
static void __init init_mount_tree(void)
{
struct vfsmount *mnt;
@@ -4691,10 +5145,9 @@ static void __init init_mount_tree(void)
if (IS_ERR(ns))
panic("Can't allocate initial namespace");
m = real_mount(mnt);
- m->mnt_ns = ns;
ns->root = m;
- ns->mounts = 1;
- list_add(&m->mnt_list, &ns->list);
+ ns->nr_mounts = 1;
+ mnt_add_to_ns(ns, m);
init_task.nsproxy->mnt_ns = ns;
get_mnt_ns(ns);
@@ -4821,18 +5274,14 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
int *new_mnt_flags)
{
int new_flags = *new_mnt_flags;
- struct mount *mnt;
+ struct mount *mnt, *n;
bool visible = false;
down_read(&namespace_sem);
- lock_ns_list(ns);
- list_for_each_entry(mnt, &ns->list, mnt_list) {
+ rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) {
struct mount *child;
int mnt_flags;
- if (mnt_is_cursor(mnt))
- continue;
-
if (mnt->mnt.mnt_sb->s_type != sb->s_type)
continue;
@@ -4880,7 +5329,6 @@ static bool mnt_already_visible(struct mnt_namespace *ns,
next: ;
}
found:
- unlock_ns_list(ns);
up_read(&namespace_sem);
return visible;
}
@@ -5010,7 +5458,6 @@ static struct ctl_table fs_namespace_sysctls[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ONE,
},
- { }
};
static int __init init_fs_namespace_sysctls(void)
diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig
index b4db21022cb4..bec805e0c44c 100644
--- a/fs/netfs/Kconfig
+++ b/fs/netfs/Kconfig
@@ -21,3 +21,42 @@ config NETFS_STATS
multi-CPU system these may be on cachelines that keep bouncing
between CPUs. On the other hand, the stats are very useful for
debugging purposes. Saying 'Y' here is recommended.
+
+config FSCACHE
+ bool "General filesystem local caching manager"
+ depends on NETFS_SUPPORT
+ help
+ This option enables a generic filesystem caching manager that can be
+ used by various network and other filesystems to cache data locally.
+ Different sorts of caches can be plugged in, depending on the
+ resources available.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
+
+config FSCACHE_STATS
+ bool "Gather statistical information on local caching"
+ depends on FSCACHE && PROC_FS
+ select NETFS_STATS
+ help
+ This option causes statistical information to be gathered on local
+ caching and exported through file:
+
+ /proc/fs/fscache/stats
+
+ The gathering of statistics adds a certain amount of overhead to
+ execution as there are a quite a few stats gathered, and on a
+ multi-CPU system these may be on cachelines that keep bouncing
+ between CPUs. On the other hand, the stats are very useful for
+ debugging purposes. Saying 'Y' here is recommended.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
+
+config FSCACHE_DEBUG
+ bool "Debug FS-Cache"
+ depends on FSCACHE
+ help
+ This permits debugging to be dynamically enabled in the local caching
+ management module. If this is set, the debugging output may be
+ enabled by setting bits in /sys/modules/fscache/parameter/debug.
+
+ See Documentation/filesystems/caching/fscache.rst for more information.
diff --git a/fs/netfs/Makefile b/fs/netfs/Makefile
index 386d6fb92793..d4d1d799819e 100644
--- a/fs/netfs/Makefile
+++ b/fs/netfs/Makefile
@@ -2,11 +2,29 @@
netfs-y := \
buffered_read.o \
+ buffered_write.o \
+ direct_read.o \
+ direct_write.o \
io.o \
iterator.o \
+ locking.o \
main.o \
- objects.o
+ misc.o \
+ objects.o \
+ output.o
netfs-$(CONFIG_NETFS_STATS) += stats.o
-obj-$(CONFIG_NETFS_SUPPORT) := netfs.o
+netfs-$(CONFIG_FSCACHE) += \
+ fscache_cache.o \
+ fscache_cookie.o \
+ fscache_io.o \
+ fscache_main.o \
+ fscache_volume.o
+
+ifeq ($(CONFIG_PROC_FS),y)
+netfs-$(CONFIG_FSCACHE) += fscache_proc.o
+endif
+netfs-$(CONFIG_FSCACHE_STATS) += fscache_stats.o
+
+obj-$(CONFIG_NETFS_SUPPORT) += netfs.o
diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 2cd3ccf4c439..3298c29b5548 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -16,6 +16,7 @@
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
+ struct netfs_folio *finfo;
struct folio *folio;
pgoff_t start_page = rreq->start / PAGE_SIZE;
pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
@@ -63,6 +64,7 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
break;
}
if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
+ trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
folio_start_fscache(folio);
folio_started = true;
}
@@ -86,11 +88,20 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
if (!pg_failed) {
flush_dcache_folio(folio);
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ trace_netfs_folio(folio, netfs_folio_trace_filled_gaps);
+ if (finfo->netfs_group)
+ folio_change_private(folio, finfo->netfs_group);
+ else
+ folio_detach_private(folio);
+ kfree(finfo);
+ }
folio_mark_uptodate(folio);
}
if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
- if (folio_index(folio) == rreq->no_unlock_folio &&
+ if (folio->index == rreq->no_unlock_folio &&
test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
_debug("no unlock");
else
@@ -147,6 +158,15 @@ static void netfs_rreq_expand(struct netfs_io_request *rreq,
}
}
+/*
+ * Begin an operation, and fetch the stored zero point value from the cookie if
+ * available.
+ */
+static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_inode *ctx)
+{
+ return fscache_begin_read_operation(&rreq->cache_resources, netfs_i_cookie(ctx));
+}
+
/**
* netfs_readahead - Helper to manage a read request
* @ractl: The description of the readahead request
@@ -180,11 +200,9 @@ void netfs_readahead(struct readahead_control *ractl)
if (IS_ERR(rreq))
return;
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto cleanup_free;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto cleanup_free;
netfs_stat(&netfs_n_rh_readahead);
trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
@@ -192,6 +210,10 @@ void netfs_readahead(struct readahead_control *ractl)
netfs_rreq_expand(rreq, ractl);
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &ractl->mapping->i_pages,
+ rreq->start, rreq->len);
+
/* Drop the refs on the folios here rather than in the cache or
* filesystem. The locks will be dropped in netfs_rreq_unlock().
*/
@@ -199,6 +221,7 @@ void netfs_readahead(struct readahead_control *ractl)
;
netfs_begin_read(rreq, false);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
return;
cleanup_free:
@@ -223,12 +246,13 @@ EXPORT_SYMBOL(netfs_readahead);
*/
int netfs_read_folio(struct file *file, struct folio *folio)
{
- struct address_space *mapping = folio_file_mapping(folio);
+ struct address_space *mapping = folio->mapping;
struct netfs_io_request *rreq;
struct netfs_inode *ctx = netfs_inode(mapping->host);
+ struct folio *sink = NULL;
int ret;
- _enter("%lx", folio_index(folio));
+ _enter("%lx", folio->index);
rreq = netfs_alloc_request(mapping, file,
folio_file_pos(folio), folio_size(folio),
@@ -238,15 +262,64 @@ int netfs_read_folio(struct file *file, struct folio *folio)
goto alloc_error;
}
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto discard;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto discard;
netfs_stat(&netfs_n_rh_readpage);
trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
- return netfs_begin_read(rreq, true);
+
+ /* Set up the output buffer */
+ if (folio_test_dirty(folio)) {
+ /* Handle someone trying to read from an unflushed streaming
+ * write. We fiddle the buffer so that a gap at the beginning
+ * and/or a gap at the end get copied to, but the middle is
+ * discarded.
+ */
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct bio_vec *bvec;
+ unsigned int from = finfo->dirty_offset;
+ unsigned int to = from + finfo->dirty_len;
+ unsigned int off = 0, i = 0;
+ size_t flen = folio_size(folio);
+ size_t nr_bvec = flen / PAGE_SIZE + 2;
+ size_t part;
+
+ ret = -ENOMEM;
+ bvec = kmalloc_array(nr_bvec, sizeof(*bvec), GFP_KERNEL);
+ if (!bvec)
+ goto discard;
+
+ sink = folio_alloc(GFP_KERNEL, 0);
+ if (!sink)
+ goto discard;
+
+ trace_netfs_folio(folio, netfs_folio_trace_read_gaps);
+
+ rreq->direct_bv = bvec;
+ rreq->direct_bv_count = nr_bvec;
+ if (from > 0) {
+ bvec_set_folio(&bvec[i++], folio, from, 0);
+ off = from;
+ }
+ while (off < to) {
+ part = min_t(size_t, to - off, PAGE_SIZE);
+ bvec_set_folio(&bvec[i++], sink, part, 0);
+ off += part;
+ }
+ if (to < flen)
+ bvec_set_folio(&bvec[i++], folio, flen - to, to);
+ iov_iter_bvec(&rreq->iter, ITER_DEST, bvec, i, rreq->len);
+ } else {
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+ }
+
+ ret = netfs_begin_read(rreq, true);
+ if (sink)
+ folio_put(sink);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ return ret < 0 ? ret : 0;
discard:
netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
@@ -387,14 +460,12 @@ retry:
ret = PTR_ERR(rreq);
goto error;
}
- rreq->no_unlock_folio = folio_index(folio);
+ rreq->no_unlock_folio = folio->index;
__set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
- if (ctx->ops->begin_cache_operation) {
- ret = ctx->ops->begin_cache_operation(rreq);
- if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
- goto error_put;
- }
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto error_put;
netfs_stat(&netfs_n_rh_write_begin);
trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);
@@ -405,6 +476,10 @@ retry:
ractl._nr_pages = folio_nr_pages(folio);
netfs_rreq_expand(rreq, &ractl);
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+
/* We hold the folio locks, so we can drop the references */
folio_get(folio);
while (readahead_folio(&ractl))
@@ -413,6 +488,7 @@ retry:
ret = netfs_begin_read(rreq, true);
if (ret < 0)
goto error;
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
have_folio:
ret = folio_wait_fscache_killable(folio);
@@ -434,3 +510,124 @@ error:
return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
+
+/*
+ * Preload the data into a page we're proposing to write into.
+ */
+int netfs_prefetch_for_write(struct file *file, struct folio *folio,
+ size_t offset, size_t len)
+{
+ struct netfs_io_request *rreq;
+ struct address_space *mapping = folio->mapping;
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
+ unsigned long long start = folio_pos(folio);
+ size_t flen = folio_size(folio);
+ int ret;
+
+ _enter("%zx @%llx", flen, start);
+
+ ret = -ENOMEM;
+
+ rreq = netfs_alloc_request(mapping, file, start, flen,
+ NETFS_READ_FOR_WRITE);
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
+ goto error;
+ }
+
+ rreq->no_unlock_folio = folio->index;
+ __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);
+ ret = netfs_begin_cache_read(rreq, ctx);
+ if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
+ goto error_put;
+
+ netfs_stat(&netfs_n_rh_write_begin);
+ trace_netfs_read(rreq, start, flen, netfs_read_trace_prefetch_for_write);
+
+ /* Set up the output buffer */
+ iov_iter_xarray(&rreq->iter, ITER_DEST, &mapping->i_pages,
+ rreq->start, rreq->len);
+
+ ret = netfs_begin_read(rreq, true);
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ return ret;
+
+error_put:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
+error:
+ _leave(" = %d", ret);
+ return ret;
+}
+
+/**
+ * netfs_buffered_read_iter - Filesystem buffered I/O read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the ->read_iter() routine for all filesystems that can use the page
+ * cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
+ * returned when no data can be read without waiting for I/O requests to
+ * complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
+ * shall be made for the read or for readahead. When no data can be read,
+ * -EAGAIN shall be returned. When readahead would be triggered, a partial,
+ * possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ struct netfs_inode *ictx = netfs_inode(inode);
+ ssize_t ret;
+
+ if (WARN_ON_ONCE((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags)))
+ return -EINVAL;
+
+ ret = netfs_start_io_read(inode);
+ if (ret == 0) {
+ ret = filemap_read(iocb, iter, 0);
+ netfs_end_io_read(inode);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(netfs_buffered_read_iter);
+
+/**
+ * netfs_file_read_iter - Generic filesystem read routine
+ * @iocb: kernel I/O control block
+ * @iter: destination for the data read
+ *
+ * This is the ->read_iter() routine for all filesystems that can use the page
+ * cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall be
+ * returned when no data can be read without waiting for I/O requests to
+ * complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O requests
+ * shall be made for the read or for readahead. When no data can be read,
+ * -EAGAIN shall be returned. When readahead would be triggered, a partial,
+ * possibly empty read shall be returned.
+ *
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
+ */
+ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct netfs_inode *ictx = netfs_inode(iocb->ki_filp->f_mapping->host);
+
+ if ((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
+ return netfs_unbuffered_read_iter(iocb, iter);
+
+ return netfs_buffered_read_iter(iocb, iter);
+}
+EXPORT_SYMBOL(netfs_file_read_iter);
diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c
new file mode 100644
index 000000000000..9a0d32e4b422
--- /dev/null
+++ b/fs/netfs/buffered_write.c
@@ -0,0 +1,1257 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/*
+ * Determined write method. Adjust netfs_folio_traces if this is changed.
+ */
+enum netfs_how_to_modify {
+ NETFS_FOLIO_IS_UPTODATE, /* Folio is uptodate already */
+ NETFS_JUST_PREFETCH, /* We have to read the folio anyway */
+ NETFS_WHOLE_FOLIO_MODIFY, /* We're going to overwrite the whole folio */
+ NETFS_MODIFY_AND_CLEAR, /* We can assume there is no data to be downloaded. */
+ NETFS_STREAMING_WRITE, /* Store incomplete data in non-uptodate page. */
+ NETFS_STREAMING_WRITE_CONT, /* Continue streaming write. */
+ NETFS_FLUSH_CONTENT, /* Flush incompatible content. */
+};
+
+static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq);
+
+static void netfs_set_group(struct folio *folio, struct netfs_group *netfs_group)
+{
+ if (netfs_group && !folio_get_private(folio))
+ folio_attach_private(folio, netfs_get_group(netfs_group));
+}
+
+#if IS_ENABLED(CONFIG_FSCACHE)
+static void netfs_folio_start_fscache(bool caching, struct folio *folio)
+{
+ if (caching)
+ folio_start_fscache(folio);
+}
+#else
+static void netfs_folio_start_fscache(bool caching, struct folio *folio)
+{
+}
+#endif
+
+/*
+ * Decide how we should modify a folio. We might be attempting to do
+ * write-streaming, in which case we don't want to a local RMW cycle if we can
+ * avoid it. If we're doing local caching or content crypto, we award that
+ * priority over avoiding RMW. If the file is open readably, then we also
+ * assume that we may want to read what we wrote.
+ */
+static enum netfs_how_to_modify netfs_how_to_modify(struct netfs_inode *ctx,
+ struct file *file,
+ struct folio *folio,
+ void *netfs_group,
+ size_t flen,
+ size_t offset,
+ size_t len,
+ bool maybe_trouble)
+{
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ loff_t pos = folio_file_pos(folio);
+
+ _enter("");
+
+ if (netfs_folio_group(folio) != netfs_group)
+ return NETFS_FLUSH_CONTENT;
+
+ if (folio_test_uptodate(folio))
+ return NETFS_FOLIO_IS_UPTODATE;
+
+ if (pos >= ctx->zero_point)
+ return NETFS_MODIFY_AND_CLEAR;
+
+ if (!maybe_trouble && offset == 0 && len >= flen)
+ return NETFS_WHOLE_FOLIO_MODIFY;
+
+ if (file->f_mode & FMODE_READ)
+ goto no_write_streaming;
+ if (test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
+ goto no_write_streaming;
+
+ if (netfs_is_cache_enabled(ctx)) {
+ /* We don't want to get a streaming write on a file that loses
+ * caching service temporarily because the backing store got
+ * culled.
+ */
+ if (!test_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags))
+ set_bit(NETFS_ICTX_NO_WRITE_STREAMING, &ctx->flags);
+ goto no_write_streaming;
+ }
+
+ if (!finfo)
+ return NETFS_STREAMING_WRITE;
+
+ /* We can continue a streaming write only if it continues on from the
+ * previous. If it overlaps, we must flush lest we suffer a partial
+ * copy and disjoint dirty regions.
+ */
+ if (offset == finfo->dirty_offset + finfo->dirty_len)
+ return NETFS_STREAMING_WRITE_CONT;
+ return NETFS_FLUSH_CONTENT;
+
+no_write_streaming:
+ if (finfo) {
+ netfs_stat(&netfs_n_wh_wstream_conflict);
+ return NETFS_FLUSH_CONTENT;
+ }
+ return NETFS_JUST_PREFETCH;
+}
+
+/*
+ * Grab a folio for writing and lock it. Attempt to allocate as large a folio
+ * as possible to hold as much of the remaining length as possible in one go.
+ */
+static struct folio *netfs_grab_folio_for_write(struct address_space *mapping,
+ loff_t pos, size_t part)
+{
+ pgoff_t index = pos / PAGE_SIZE;
+ fgf_t fgp_flags = FGP_WRITEBEGIN;
+
+ if (mapping_large_folio_support(mapping))
+ fgp_flags |= fgf_set_order(pos % PAGE_SIZE + part);
+
+ return __filemap_get_folio(mapping, index, fgp_flags,
+ mapping_gfp_mask(mapping));
+}
+
+/**
+ * netfs_perform_write - Copy data into the pagecache.
+ * @iocb: The operation parameters
+ * @iter: The source buffer
+ * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ *
+ * Copy data into pagecache pages attached to the inode specified by @iocb.
+ * The caller must hold appropriate inode locks.
+ *
+ * Dirty pages are tagged with a netfs_folio struct if they're not up to date
+ * to indicate the range modified. Dirty pages may also be tagged with a
+ * netfs-specific grouping such that data from an old group gets flushed before
+ * a new one is started.
+ */
+ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter,
+ struct netfs_group *netfs_group)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct address_space *mapping = inode->i_mapping;
+ struct netfs_inode *ctx = netfs_inode(inode);
+ struct writeback_control wbc = {
+ .sync_mode = WB_SYNC_NONE,
+ .for_sync = true,
+ .nr_to_write = LONG_MAX,
+ .range_start = iocb->ki_pos,
+ .range_end = iocb->ki_pos + iter->count,
+ };
+ struct netfs_io_request *wreq = NULL;
+ struct netfs_folio *finfo;
+ struct folio *folio;
+ enum netfs_how_to_modify howto;
+ enum netfs_folio_trace trace;
+ unsigned int bdp_flags = (iocb->ki_flags & IOCB_SYNC) ? 0: BDP_ASYNC;
+ ssize_t written = 0, ret;
+ loff_t i_size, pos = iocb->ki_pos, from, to;
+ size_t max_chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
+ bool maybe_trouble = false;
+
+ if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) ||
+ iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC))
+ ) {
+ if (pos < i_size_read(inode)) {
+ ret = filemap_write_and_wait_range(mapping, pos, pos + iter->count);
+ if (ret < 0) {
+ goto out;
+ }
+ }
+
+ wbc_attach_fdatawrite_inode(&wbc, mapping->host);
+
+ wreq = netfs_begin_writethrough(iocb, iter->count);
+ if (IS_ERR(wreq)) {
+ wbc_detach_inode(&wbc);
+ ret = PTR_ERR(wreq);
+ wreq = NULL;
+ goto out;
+ }
+ if (!is_sync_kiocb(iocb))
+ wreq->iocb = iocb;
+ wreq->cleanup = netfs_cleanup_buffered_write;
+ }
+
+ do {
+ size_t flen;
+ size_t offset; /* Offset into pagecache folio */
+ size_t part; /* Bytes to write to folio */
+ size_t copied; /* Bytes copied from user */
+
+ ret = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags);
+ if (unlikely(ret < 0))
+ break;
+
+ offset = pos & (max_chunk - 1);
+ part = min(max_chunk - offset, iov_iter_count(iter));
+
+ /* Bring in the user pages that we will copy from _first_ lest
+ * we hit a nasty deadlock on copying from the same page as
+ * we're writing to, without it being marked uptodate.
+ *
+ * Not only is this an optimisation, but it is also required to
+ * check that the address is actually valid, when atomic
+ * usercopies are used below.
+ *
+ * We rely on the page being held onto long enough by the LRU
+ * that we can grab it below if this causes it to be read.
+ */
+ ret = -EFAULT;
+ if (unlikely(fault_in_iov_iter_readable(iter, part) == part))
+ break;
+
+ folio = netfs_grab_folio_for_write(mapping, pos, part);
+ if (IS_ERR(folio)) {
+ ret = PTR_ERR(folio);
+ break;
+ }
+
+ flen = folio_size(folio);
+ offset = pos & (flen - 1);
+ part = min_t(size_t, flen - offset, part);
+
+ if (signal_pending(current)) {
+ ret = written ? -EINTR : -ERESTARTSYS;
+ goto error_folio_unlock;
+ }
+
+ /* See if we need to prefetch the area we're going to modify.
+ * We need to do this before we get a lock on the folio in case
+ * there's more than one writer competing for the same cache
+ * block.
+ */
+ howto = netfs_how_to_modify(ctx, file, folio, netfs_group,
+ flen, offset, part, maybe_trouble);
+ _debug("howto %u", howto);
+ switch (howto) {
+ case NETFS_JUST_PREFETCH:
+ ret = netfs_prefetch_for_write(file, folio, offset, part);
+ if (ret < 0) {
+ _debug("prefetch = %zd", ret);
+ goto error_folio_unlock;
+ }
+ break;
+ case NETFS_FOLIO_IS_UPTODATE:
+ case NETFS_WHOLE_FOLIO_MODIFY:
+ case NETFS_STREAMING_WRITE_CONT:
+ break;
+ case NETFS_MODIFY_AND_CLEAR:
+ zero_user_segment(&folio->page, 0, offset);
+ break;
+ case NETFS_STREAMING_WRITE:
+ ret = -EIO;
+ if (WARN_ON(folio_get_private(folio)))
+ goto error_folio_unlock;
+ break;
+ case NETFS_FLUSH_CONTENT:
+ trace_netfs_folio(folio, netfs_flush_content);
+ from = folio_pos(folio);
+ to = from + folio_size(folio) - 1;
+ folio_unlock(folio);
+ folio_put(folio);
+ ret = filemap_write_and_wait_range(mapping, from, to);
+ if (ret < 0)
+ goto error_folio_unlock;
+ continue;
+ }
+
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_folio(folio);
+
+ copied = copy_folio_from_iter_atomic(folio, offset, part, iter);
+
+ flush_dcache_folio(folio);
+
+ /* Deal with a (partially) failed copy */
+ if (copied == 0) {
+ ret = -EFAULT;
+ goto error_folio_unlock;
+ }
+
+ trace = (enum netfs_folio_trace)howto;
+ switch (howto) {
+ case NETFS_FOLIO_IS_UPTODATE:
+ case NETFS_JUST_PREFETCH:
+ netfs_set_group(folio, netfs_group);
+ break;
+ case NETFS_MODIFY_AND_CLEAR:
+ zero_user_segment(&folio->page, offset + copied, flen);
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ break;
+ case NETFS_WHOLE_FOLIO_MODIFY:
+ if (unlikely(copied < part)) {
+ maybe_trouble = true;
+ iov_iter_revert(iter, copied);
+ copied = 0;
+ goto retry;
+ }
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ break;
+ case NETFS_STREAMING_WRITE:
+ if (offset == 0 && copied == flen) {
+ netfs_set_group(folio, netfs_group);
+ folio_mark_uptodate(folio);
+ trace = netfs_streaming_filled_page;
+ break;
+ }
+ finfo = kzalloc(sizeof(*finfo), GFP_KERNEL);
+ if (!finfo) {
+ iov_iter_revert(iter, copied);
+ ret = -ENOMEM;
+ goto error_folio_unlock;
+ }
+ finfo->netfs_group = netfs_get_group(netfs_group);
+ finfo->dirty_offset = offset;
+ finfo->dirty_len = copied;
+ folio_attach_private(folio, (void *)((unsigned long)finfo |
+ NETFS_FOLIO_INFO));
+ break;
+ case NETFS_STREAMING_WRITE_CONT:
+ finfo = netfs_folio_info(folio);
+ finfo->dirty_len += copied;
+ if (finfo->dirty_offset == 0 && finfo->dirty_len == flen) {
+ if (finfo->netfs_group)
+ folio_change_private(folio, finfo->netfs_group);
+ else
+ folio_detach_private(folio);
+ folio_mark_uptodate(folio);
+ kfree(finfo);
+ trace = netfs_streaming_cont_filled_page;
+ }
+ break;
+ default:
+ WARN(true, "Unexpected modify type %u ix=%lx\n",
+ howto, folio->index);
+ ret = -EIO;
+ goto error_folio_unlock;
+ }
+
+ trace_netfs_folio(folio, trace);
+
+ /* Update the inode size if we moved the EOF marker */
+ i_size = i_size_read(inode);
+ pos += copied;
+ if (pos > i_size) {
+ if (ctx->ops->update_i_size) {
+ ctx->ops->update_i_size(inode, pos);
+ } else {
+ i_size_write(inode, pos);
+#if IS_ENABLED(CONFIG_FSCACHE)
+ fscache_update_cookie(ctx->cache, NULL, &pos);
+#endif
+ }
+ }
+ written += copied;
+
+ if (likely(!wreq)) {
+ folio_mark_dirty(folio);
+ } else {
+ if (folio_test_dirty(folio))
+ /* Sigh. mmap. */
+ folio_clear_dirty_for_io(folio);
+ /* We make multiple writes to the folio... */
+ if (!folio_test_writeback(folio)) {
+ folio_wait_fscache(folio);
+ folio_start_writeback(folio);
+ folio_start_fscache(folio);
+ if (wreq->iter.count == 0)
+ trace_netfs_folio(folio, netfs_folio_trace_wthru);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_wthru_plus);
+ }
+ netfs_advance_writethrough(wreq, copied,
+ offset + copied == flen);
+ }
+ retry:
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
+
+ cond_resched();
+ } while (iov_iter_count(iter));
+
+out:
+ if (unlikely(wreq)) {
+ ret = netfs_end_writethrough(wreq, iocb);
+ wbc_detach_inode(&wbc);
+ if (ret == -EIOCBQUEUED)
+ return ret;
+ }
+
+ iocb->ki_pos += written;
+ _leave(" = %zd [%zd]", written, ret);
+ return written ? written : ret;
+
+error_folio_unlock:
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
+}
+EXPORT_SYMBOL(netfs_perform_write);
+
+/**
+ * netfs_buffered_write_iter_locked - write data to a file
+ * @iocb: IO state structure (file, offset, etc.)
+ * @from: iov_iter with data to write
+ * @netfs_group: Grouping for dirty pages (eg. ceph snaps).
+ *
+ * This function does all the work needed for actually writing data to a
+ * file. It does all basic checks, removes SUID from the file, updates
+ * modification times and calls proper subroutines depending on whether we
+ * do direct IO or a standard buffered write.
+ *
+ * The caller must hold appropriate locks around this function and have called
+ * generic_write_checks() already. The caller is also responsible for doing
+ * any necessary syncing afterwards.
+ *
+ * This function does *not* take care of syncing data in case of O_SYNC write.
+ * A caller has to handle it. This is mainly due to the fact that we want to
+ * avoid syncing under i_rwsem.
+ *
+ * Return:
+ * * number of bytes written, even for truncated writes
+ * * negative error code if no data has been written at all
+ */
+ssize_t netfs_buffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *from,
+ struct netfs_group *netfs_group)
+{
+ struct file *file = iocb->ki_filp;
+ ssize_t ret;
+
+ trace_netfs_write_iter(iocb, from);
+
+ ret = file_remove_privs(file);
+ if (ret)
+ return ret;
+
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
+
+ return netfs_perform_write(iocb, from, netfs_group);
+}
+EXPORT_SYMBOL(netfs_buffered_write_iter_locked);
+
+/**
+ * netfs_file_write_iter - write data to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Perform a write to a file, writing into the pagecache if possible and doing
+ * an unbuffered write instead if not.
+ *
+ * Return:
+ * * Negative error code if no data has been written at all of
+ * vfs_fsync_range() failed for a synchronous write
+ * * Number of bytes written, even for truncated writes
+ */
+ssize_t netfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ ssize_t ret;
+
+ _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+
+ if (!iov_iter_count(from))
+ return 0;
+
+ if ((iocb->ki_flags & IOCB_DIRECT) ||
+ test_bit(NETFS_ICTX_UNBUFFERED, &ictx->flags))
+ return netfs_unbuffered_write_iter(iocb, from);
+
+ ret = netfs_start_io_write(inode);
+ if (ret < 0)
+ return ret;
+
+ ret = generic_write_checks(iocb, from);
+ if (ret > 0)
+ ret = netfs_buffered_write_iter_locked(iocb, from, NULL);
+ netfs_end_io_write(inode);
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_file_write_iter);
+
+/*
+ * Notification that a previously read-only page is about to become writable.
+ * Note that the caller indicates a single page of a multipage folio.
+ */
+vm_fault_t netfs_page_mkwrite(struct vm_fault *vmf, struct netfs_group *netfs_group)
+{
+ struct folio *folio = page_folio(vmf->page);
+ struct file *file = vmf->vma->vm_file;
+ struct inode *inode = file_inode(file);
+ vm_fault_t ret = VM_FAULT_RETRY;
+ int err;
+
+ _enter("%lx", folio->index);
+
+ sb_start_pagefault(inode->i_sb);
+
+ if (folio_wait_writeback_killable(folio))
+ goto out;
+
+ if (folio_lock_killable(folio) < 0)
+ goto out;
+
+ /* Can we see a streaming write here? */
+ if (WARN_ON(!folio_test_uptodate(folio))) {
+ ret = VM_FAULT_SIGBUS | VM_FAULT_LOCKED;
+ goto out;
+ }
+
+ if (netfs_folio_group(folio) != netfs_group) {
+ folio_unlock(folio);
+ err = filemap_fdatawait_range(inode->i_mapping,
+ folio_pos(folio),
+ folio_pos(folio) + folio_size(folio));
+ switch (err) {
+ case 0:
+ ret = VM_FAULT_RETRY;
+ goto out;
+ case -ENOMEM:
+ ret = VM_FAULT_OOM;
+ goto out;
+ default:
+ ret = VM_FAULT_SIGBUS;
+ goto out;
+ }
+ }
+
+ if (folio_test_dirty(folio))
+ trace_netfs_folio(folio, netfs_folio_trace_mkwrite_plus);
+ else
+ trace_netfs_folio(folio, netfs_folio_trace_mkwrite);
+ netfs_set_group(folio, netfs_group);
+ file_update_time(file);
+ ret = VM_FAULT_LOCKED;
+out:
+ sb_end_pagefault(inode->i_sb);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_page_mkwrite);
+
+/*
+ * Kill all the pages in the given range
+ */
+static void netfs_kill_pages(struct address_space *mapping,
+ loff_t start, loff_t len)
+{
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
+
+ _enter("%llx-%llx", start, start + len - 1);
+
+ do {
+ _debug("kill %lx (to %lx)", index, last);
+
+ folio = filemap_get_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ next = index + 1;
+ continue;
+ }
+
+ next = folio_next_index(folio);
+
+ trace_netfs_folio(folio, netfs_folio_trace_kill);
+ folio_clear_uptodate(folio);
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ folio_end_writeback(folio);
+ folio_lock(folio);
+ generic_error_remove_folio(mapping, folio);
+ folio_unlock(folio);
+ folio_put(folio);
+
+ } while (index = next, index <= last);
+
+ _leave("");
+}
+
+/*
+ * Redirty all the pages in a given range.
+ */
+static void netfs_redirty_pages(struct address_space *mapping,
+ loff_t start, loff_t len)
+{
+ struct folio *folio;
+ pgoff_t index = start / PAGE_SIZE;
+ pgoff_t last = (start + len - 1) / PAGE_SIZE, next;
+
+ _enter("%llx-%llx", start, start + len - 1);
+
+ do {
+ _debug("redirty %llx @%llx", len, start);
+
+ folio = filemap_get_folio(mapping, index);
+ if (IS_ERR(folio)) {
+ next = index + 1;
+ continue;
+ }
+
+ next = folio_next_index(folio);
+ trace_netfs_folio(folio, netfs_folio_trace_redirty);
+ filemap_dirty_folio(mapping, folio);
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ folio_end_writeback(folio);
+ folio_put(folio);
+ } while (index = next, index <= last);
+
+ balance_dirty_pages_ratelimited(mapping);
+
+ _leave("");
+}
+
+/*
+ * Completion of write to server
+ */
+static void netfs_pages_written_back(struct netfs_io_request *wreq)
+{
+ struct address_space *mapping = wreq->mapping;
+ struct netfs_folio *finfo;
+ struct netfs_group *group = NULL;
+ struct folio *folio;
+ pgoff_t last;
+ int gcount = 0;
+
+ XA_STATE(xas, &mapping->i_pages, wreq->start / PAGE_SIZE);
+
+ _enter("%llx-%llx", wreq->start, wreq->start + wreq->len);
+
+ rcu_read_lock();
+
+ last = (wreq->start + wreq->len - 1) / PAGE_SIZE;
+ xas_for_each(&xas, folio, last) {
+ WARN(!folio_test_writeback(folio),
+ "bad %zx @%llx page %lx %lx\n",
+ wreq->len, wreq->start, folio->index, last);
+
+ if ((finfo = netfs_folio_info(folio))) {
+ /* Streaming writes cannot be redirtied whilst under
+ * writeback, so discard the streaming record.
+ */
+ folio_detach_private(folio);
+ group = finfo->netfs_group;
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_s);
+ kfree(finfo);
+ } else if ((group = netfs_folio_group(folio))) {
+ /* Need to detach the group pointer if the page didn't
+ * get redirtied. If it has been redirtied, then it
+ * must be within the same group.
+ */
+ if (folio_test_dirty(folio)) {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ goto end_wb;
+ }
+ if (folio_trylock(folio)) {
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_g);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ }
+ folio_unlock(folio);
+ goto end_wb;
+ }
+
+ xas_pause(&xas);
+ rcu_read_unlock();
+ folio_lock(folio);
+ if (!folio_test_dirty(folio)) {
+ folio_detach_private(folio);
+ gcount++;
+ trace_netfs_folio(folio, netfs_folio_trace_clear_g);
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_redirtied);
+ }
+ folio_unlock(folio);
+ rcu_read_lock();
+ } else {
+ trace_netfs_folio(folio, netfs_folio_trace_clear);
+ }
+ end_wb:
+ if (folio_test_fscache(folio))
+ folio_end_fscache(folio);
+ xas_advance(&xas, folio_next_index(folio) - 1);
+ folio_end_writeback(folio);
+ }
+
+ rcu_read_unlock();
+ netfs_put_group_many(group, gcount);
+ _leave("");
+}
+
+/*
+ * Deal with the disposition of the folios that are under writeback to close
+ * out the operation.
+ */
+static void netfs_cleanup_buffered_write(struct netfs_io_request *wreq)
+{
+ struct address_space *mapping = wreq->mapping;
+
+ _enter("");
+
+ switch (wreq->error) {
+ case 0:
+ netfs_pages_written_back(wreq);
+ break;
+
+ default:
+ pr_notice("R=%08x Unexpected error %d\n", wreq->debug_id, wreq->error);
+ fallthrough;
+ case -EACCES:
+ case -EPERM:
+ case -ENOKEY:
+ case -EKEYEXPIRED:
+ case -EKEYREJECTED:
+ case -EKEYREVOKED:
+ case -ENETRESET:
+ case -EDQUOT:
+ case -ENOSPC:
+ netfs_redirty_pages(mapping, wreq->start, wreq->len);
+ break;
+
+ case -EROFS:
+ case -EIO:
+ case -EREMOTEIO:
+ case -EFBIG:
+ case -ENOENT:
+ case -ENOMEDIUM:
+ case -ENXIO:
+ netfs_kill_pages(mapping, wreq->start, wreq->len);
+ break;
+ }
+
+ if (wreq->error)
+ mapping_set_error(mapping, wreq->error);
+ if (wreq->netfs_ops->done)
+ wreq->netfs_ops->done(wreq);
+}
+
+/*
+ * Extend the region to be written back to include subsequent contiguously
+ * dirty pages if possible, but don't sleep while doing so.
+ *
+ * If this page holds new content, then we can include filler zeros in the
+ * writeback.
+ */
+static void netfs_extend_writeback(struct address_space *mapping,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ long *_count,
+ loff_t start,
+ loff_t max_len,
+ bool caching,
+ size_t *_len,
+ size_t *_top)
+{
+ struct netfs_folio *finfo;
+ struct folio_batch fbatch;
+ struct folio *folio;
+ unsigned int i;
+ pgoff_t index = (start + *_len) / PAGE_SIZE;
+ size_t len;
+ void *priv;
+ bool stop = true;
+
+ folio_batch_init(&fbatch);
+
+ do {
+ /* Firstly, we gather up a batch of contiguous dirty pages
+ * under the RCU read lock - but we can't clear the dirty flags
+ * there if any of those pages are mapped.
+ */
+ rcu_read_lock();
+
+ xas_for_each(xas, folio, ULONG_MAX) {
+ stop = true;
+ if (xas_retry(xas, folio))
+ continue;
+ if (xa_is_value(folio))
+ break;
+ if (folio->index != index) {
+ xas_reset(xas);
+ break;
+ }
+
+ if (!folio_try_get_rcu(folio)) {
+ xas_reset(xas);
+ continue;
+ }
+
+ /* Has the folio moved or been split? */
+ if (unlikely(folio != xas_reload(xas))) {
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+ if (!folio_test_dirty(folio) ||
+ folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+
+ stop = false;
+ len = folio_size(folio);
+ priv = folio_get_private(folio);
+ if ((const struct netfs_group *)priv != group) {
+ stop = true;
+ finfo = netfs_folio_info(folio);
+ if (finfo->netfs_group != group ||
+ finfo->dirty_offset > 0) {
+ folio_unlock(folio);
+ folio_put(folio);
+ xas_reset(xas);
+ break;
+ }
+ len = finfo->dirty_len;
+ }
+
+ *_top += folio_size(folio);
+ index += folio_nr_pages(folio);
+ *_count -= folio_nr_pages(folio);
+ *_len += len;
+ if (*_len >= max_len || *_count <= 0)
+ stop = true;
+
+ if (!folio_batch_add(&fbatch, folio))
+ break;
+ if (stop)
+ break;
+ }
+
+ xas_pause(xas);
+ rcu_read_unlock();
+
+ /* Now, if we obtained any folios, we can shift them to being
+ * writable and mark them for caching.
+ */
+ if (!folio_batch_count(&fbatch))
+ break;
+
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ folio = fbatch.folios[i];
+ trace_netfs_folio(folio, netfs_folio_trace_store_plus);
+
+ if (!folio_clear_dirty_for_io(folio))
+ BUG();
+ folio_start_writeback(folio);
+ netfs_folio_start_fscache(caching, folio);
+ folio_unlock(folio);
+ }
+
+ folio_batch_release(&fbatch);
+ cond_resched();
+ } while (!stop);
+}
+
+/*
+ * Synchronously write back the locked page and any subsequent non-locked dirty
+ * pages.
+ */
+static ssize_t netfs_write_back_from_locked_folio(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ struct folio *folio,
+ unsigned long long start,
+ unsigned long long end)
+{
+ struct netfs_io_request *wreq;
+ struct netfs_folio *finfo;
+ struct netfs_inode *ctx = netfs_inode(mapping->host);
+ unsigned long long i_size = i_size_read(&ctx->inode);
+ size_t len, max_len;
+ bool caching = netfs_is_cache_enabled(ctx);
+ long count = wbc->nr_to_write;
+ int ret;
+
+ _enter(",%lx,%llx-%llx,%u", folio->index, start, end, caching);
+
+ wreq = netfs_alloc_request(mapping, NULL, start, folio_size(folio),
+ NETFS_WRITEBACK);
+ if (IS_ERR(wreq)) {
+ folio_unlock(folio);
+ return PTR_ERR(wreq);
+ }
+
+ if (!folio_clear_dirty_for_io(folio))
+ BUG();
+ folio_start_writeback(folio);
+ netfs_folio_start_fscache(caching, folio);
+
+ count -= folio_nr_pages(folio);
+
+ /* Find all consecutive lockable dirty pages that have contiguous
+ * written regions, stopping when we find a page that is not
+ * immediately lockable, is not dirty or is missing, or we reach the
+ * end of the range.
+ */
+ trace_netfs_folio(folio, netfs_folio_trace_store);
+
+ len = wreq->len;
+ finfo = netfs_folio_info(folio);
+ if (finfo) {
+ start += finfo->dirty_offset;
+ if (finfo->dirty_offset + finfo->dirty_len != len) {
+ len = finfo->dirty_len;
+ goto cant_expand;
+ }
+ len = finfo->dirty_len;
+ }
+
+ if (start < i_size) {
+ /* Trim the write to the EOF; the extra data is ignored. Also
+ * put an upper limit on the size of a single storedata op.
+ */
+ max_len = 65536 * 4096;
+ max_len = min_t(unsigned long long, max_len, end - start + 1);
+ max_len = min_t(unsigned long long, max_len, i_size - start);
+
+ if (len < max_len)
+ netfs_extend_writeback(mapping, group, xas, &count, start,
+ max_len, caching, &len, &wreq->upper_len);
+ }
+
+cant_expand:
+ len = min_t(unsigned long long, len, i_size - start);
+
+ /* We now have a contiguous set of dirty pages, each with writeback
+ * set; the first page is still locked at this point, but all the rest
+ * have been unlocked.
+ */
+ folio_unlock(folio);
+ wreq->start = start;
+ wreq->len = len;
+
+ if (start < i_size) {
+ _debug("write back %zx @%llx [%llx]", len, start, i_size);
+
+ /* Speculatively write to the cache. We have to fix this up
+ * later if the store fails.
+ */
+ wreq->cleanup = netfs_cleanup_buffered_write;
+
+ iov_iter_xarray(&wreq->iter, ITER_SOURCE, &mapping->i_pages, start,
+ wreq->upper_len);
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ ret = netfs_begin_write(wreq, true, netfs_write_trace_writeback);
+ if (ret == 0 || ret == -EIOCBQUEUED)
+ wbc->nr_to_write -= len / PAGE_SIZE;
+ } else {
+ _debug("write discard %zx @%llx [%llx]", len, start, i_size);
+
+ /* The dirty region was entirely beyond the EOF. */
+ fscache_clear_page_bits(mapping, start, len, caching);
+ netfs_pages_written_back(wreq);
+ ret = 0;
+ }
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ _leave(" = 1");
+ return 1;
+}
+
+/*
+ * Write a region of pages back to the server
+ */
+static ssize_t netfs_writepages_begin(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ struct xa_state *xas,
+ unsigned long long *_start,
+ unsigned long long end)
+{
+ const struct netfs_folio *finfo;
+ struct folio *folio;
+ unsigned long long start = *_start;
+ ssize_t ret;
+ void *priv;
+ int skips = 0;
+
+ _enter("%llx,%llx,", start, end);
+
+search_again:
+ /* Find the first dirty page in the group. */
+ rcu_read_lock();
+
+ for (;;) {
+ folio = xas_find_marked(xas, end / PAGE_SIZE, PAGECACHE_TAG_DIRTY);
+ if (xas_retry(xas, folio) || xa_is_value(folio))
+ continue;
+ if (!folio)
+ break;
+
+ if (!folio_try_get_rcu(folio)) {
+ xas_reset(xas);
+ continue;
+ }
+
+ if (unlikely(folio != xas_reload(xas))) {
+ folio_put(folio);
+ xas_reset(xas);
+ continue;
+ }
+
+ /* Skip any dirty folio that's not in the group of interest. */
+ priv = folio_get_private(folio);
+ if ((const struct netfs_group *)priv != group) {
+ finfo = netfs_folio_info(folio);
+ if (finfo->netfs_group != group) {
+ folio_put(folio);
+ continue;
+ }
+ }
+
+ xas_pause(xas);
+ break;
+ }
+ rcu_read_unlock();
+ if (!folio)
+ return 0;
+
+ start = folio_pos(folio); /* May regress with THPs */
+
+ _debug("wback %lx", folio->index);
+
+ /* At this point we hold neither the i_pages lock nor the page lock:
+ * the page may be truncated or invalidated (changing page->mapping to
+ * NULL), or even swizzled back from swapper_space to tmpfs file
+ * mapping
+ */
+lock_again:
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ ret = folio_lock_killable(folio);
+ if (ret < 0)
+ return ret;
+ } else {
+ if (!folio_trylock(folio))
+ goto search_again;
+ }
+
+ if (folio->mapping != mapping ||
+ !folio_test_dirty(folio)) {
+ start += folio_size(folio);
+ folio_unlock(folio);
+ goto search_again;
+ }
+
+ if (folio_test_writeback(folio) ||
+ folio_test_fscache(folio)) {
+ folio_unlock(folio);
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ folio_wait_writeback(folio);
+#ifdef CONFIG_FSCACHE
+ folio_wait_fscache(folio);
+#endif
+ goto lock_again;
+ }
+
+ start += folio_size(folio);
+ if (wbc->sync_mode == WB_SYNC_NONE) {
+ if (skips >= 5 || need_resched()) {
+ ret = 0;
+ goto out;
+ }
+ skips++;
+ }
+ goto search_again;
+ }
+
+ ret = netfs_write_back_from_locked_folio(mapping, wbc, group, xas,
+ folio, start, end);
+out:
+ if (ret > 0)
+ *_start = start + ret;
+ _leave(" = %zd [%llx]", ret, *_start);
+ return ret;
+}
+
+/*
+ * Write a region of pages back to the server
+ */
+static int netfs_writepages_region(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct netfs_group *group,
+ unsigned long long *_start,
+ unsigned long long end)
+{
+ ssize_t ret;
+
+ XA_STATE(xas, &mapping->i_pages, *_start / PAGE_SIZE);
+
+ do {
+ ret = netfs_writepages_begin(mapping, wbc, group, &xas,
+ _start, end);
+ if (ret > 0 && wbc->nr_to_write > 0)
+ cond_resched();
+ } while (ret > 0 && wbc->nr_to_write > 0);
+
+ return ret > 0 ? 0 : ret;
+}
+
+/*
+ * write some of the pending data back to the server
+ */
+int netfs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct netfs_group *group = NULL;
+ loff_t start, end;
+ int ret;
+
+ _enter("");
+
+ /* We have to be careful as we can end up racing with setattr()
+ * truncating the pagecache since the caller doesn't take a lock here
+ * to prevent it.
+ */
+
+ if (wbc->range_cyclic && mapping->writeback_index) {
+ start = mapping->writeback_index * PAGE_SIZE;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, LLONG_MAX);
+ if (ret < 0)
+ goto out;
+
+ if (wbc->nr_to_write <= 0) {
+ mapping->writeback_index = start / PAGE_SIZE;
+ goto out;
+ }
+
+ start = 0;
+ end = mapping->writeback_index * PAGE_SIZE;
+ mapping->writeback_index = 0;
+ ret = netfs_writepages_region(mapping, wbc, group, &start, end);
+ if (ret == 0)
+ mapping->writeback_index = start / PAGE_SIZE;
+ } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
+ start = 0;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, LLONG_MAX);
+ if (wbc->nr_to_write > 0 && ret == 0)
+ mapping->writeback_index = start / PAGE_SIZE;
+ } else {
+ start = wbc->range_start;
+ ret = netfs_writepages_region(mapping, wbc, group,
+ &start, wbc->range_end);
+ }
+
+out:
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_writepages);
+
+/*
+ * Deal with the disposition of a laundered folio.
+ */
+static void netfs_cleanup_launder_folio(struct netfs_io_request *wreq)
+{
+ if (wreq->error) {
+ pr_notice("R=%08x Laundering error %d\n", wreq->debug_id, wreq->error);
+ mapping_set_error(wreq->mapping, wreq->error);
+ }
+}
+
+/**
+ * netfs_launder_folio - Clean up a dirty folio that's being invalidated
+ * @folio: The folio to clean
+ *
+ * This is called to write back a folio that's being invalidated when an inode
+ * is getting torn down. Ideally, writepages would be used instead.
+ */
+int netfs_launder_folio(struct folio *folio)
+{
+ struct netfs_io_request *wreq;
+ struct address_space *mapping = folio->mapping;
+ struct netfs_folio *finfo = netfs_folio_info(folio);
+ struct netfs_group *group = netfs_folio_group(folio);
+ struct bio_vec bvec;
+ unsigned long long i_size = i_size_read(mapping->host);
+ unsigned long long start = folio_pos(folio);
+ size_t offset = 0, len;
+ int ret = 0;
+
+ if (finfo) {
+ offset = finfo->dirty_offset;
+ start += offset;
+ len = finfo->dirty_len;
+ } else {
+ len = folio_size(folio);
+ }
+ len = min_t(unsigned long long, len, i_size - start);
+
+ wreq = netfs_alloc_request(mapping, NULL, start, len, NETFS_LAUNDER_WRITE);
+ if (IS_ERR(wreq)) {
+ ret = PTR_ERR(wreq);
+ goto out;
+ }
+
+ if (!folio_clear_dirty_for_io(folio))
+ goto out_put;
+
+ trace_netfs_folio(folio, netfs_folio_trace_launder);
+
+ _debug("launder %llx-%llx", start, start + len - 1);
+
+ /* Speculatively write to the cache. We have to fix this up later if
+ * the store fails.
+ */
+ wreq->cleanup = netfs_cleanup_launder_folio;
+
+ bvec_set_folio(&bvec, folio, len, offset);
+ iov_iter_bvec(&wreq->iter, ITER_SOURCE, &bvec, 1, len);
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ ret = netfs_begin_write(wreq, true, netfs_write_trace_launder);
+
+out_put:
+ folio_detach_private(folio);
+ netfs_put_group(group);
+ kfree(finfo);
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+out:
+ folio_wait_fscache(folio);
+ _leave(" = %d", ret);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_launder_folio);
diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c
new file mode 100644
index 000000000000..ad4370b3935d
--- /dev/null
+++ b/fs/netfs/direct_read.c
@@ -0,0 +1,125 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Direct I/O support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/**
+ * netfs_unbuffered_read_iter_locked - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer. No use is made of the pagecache.
+ *
+ * The caller must hold any appropriate locks.
+ */
+static ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct netfs_io_request *rreq;
+ ssize_t ret;
+ size_t orig_count = iov_iter_count(iter);
+ bool async = !is_sync_kiocb(iocb);
+
+ _enter("");
+
+ if (!orig_count)
+ return 0; /* Don't update atime */
+
+ ret = kiocb_write_and_wait(iocb, orig_count);
+ if (ret < 0)
+ return ret;
+ file_accessed(iocb->ki_filp);
+
+ rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ iocb->ki_pos, orig_count,
+ NETFS_DIO_READ);
+ if (IS_ERR(rreq))
+ return PTR_ERR(rreq);
+
+ netfs_stat(&netfs_n_rh_dio_read);
+ trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_dio_read);
+
+ /* If this is an async op, we have to keep track of the destination
+ * buffer for ourselves as the caller's iterator will be trashed when
+ * we return.
+ *
+ * In such a case, extract an iterator to represent as much of the the
+ * output buffer as we can manage. Note that the extraction might not
+ * be able to allocate a sufficiently large bvec array and may shorten
+ * the request.
+ */
+ if (user_backed_iter(iter)) {
+ ret = netfs_extract_user_iter(iter, rreq->len, &rreq->iter, 0);
+ if (ret < 0)
+ goto out;
+ rreq->direct_bv = (struct bio_vec *)rreq->iter.bvec;
+ rreq->direct_bv_count = ret;
+ rreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
+ rreq->len = iov_iter_count(&rreq->iter);
+ } else {
+ rreq->iter = *iter;
+ rreq->len = orig_count;
+ rreq->direct_bv_unpin = false;
+ iov_iter_advance(iter, orig_count);
+ }
+
+ // TODO: Set up bounce buffer if needed
+
+ if (async)
+ rreq->iocb = iocb;
+
+ ret = netfs_begin_read(rreq, is_sync_kiocb(iocb));
+ if (ret < 0)
+ goto out; /* May be -EIOCBQUEUED */
+ if (!async) {
+ // TODO: Copy from bounce buffer
+ iocb->ki_pos += rreq->transferred;
+ ret = rreq->transferred;
+ }
+
+out:
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_return);
+ if (ret > 0)
+ orig_count -= ret;
+ if (ret != -EIOCBQUEUED)
+ iov_iter_revert(iter, orig_count - iov_iter_count(iter));
+ return ret;
+}
+
+/**
+ * netfs_unbuffered_read_iter - Perform an unbuffered or direct I/O read
+ * @iocb: The I/O control descriptor describing the read
+ * @iter: The output buffer (also specifies read length)
+ *
+ * Perform an unbuffered I/O or direct I/O from the file in @iocb to the
+ * output buffer. No use is made of the pagecache.
+ */
+ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+ struct inode *inode = file_inode(iocb->ki_filp);
+ ssize_t ret;
+
+ if (!iter->count)
+ return 0; /* Don't update atime */
+
+ ret = netfs_start_io_direct(inode);
+ if (ret == 0) {
+ ret = netfs_unbuffered_read_iter_locked(iocb, iter);
+ netfs_end_io_direct(inode);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(netfs_unbuffered_read_iter);
diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c
new file mode 100644
index 000000000000..bee047e20f5d
--- /dev/null
+++ b/fs/netfs/direct_write.c
@@ -0,0 +1,174 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Unbuffered and direct write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/export.h>
+#include <linux/uio.h>
+#include "internal.h"
+
+static void netfs_cleanup_dio_write(struct netfs_io_request *wreq)
+{
+ struct inode *inode = wreq->inode;
+ unsigned long long end = wreq->start + wreq->len;
+
+ if (!wreq->error &&
+ i_size_read(inode) < end) {
+ if (wreq->netfs_ops->update_i_size)
+ wreq->netfs_ops->update_i_size(inode, end);
+ else
+ i_size_write(inode, end);
+ }
+}
+
+/*
+ * Perform an unbuffered write where we may have to do an RMW operation on an
+ * encrypted file. This can also be used for direct I/O writes.
+ */
+static ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *iter,
+ struct netfs_group *netfs_group)
+{
+ struct netfs_io_request *wreq;
+ unsigned long long start = iocb->ki_pos;
+ unsigned long long end = start + iov_iter_count(iter);
+ ssize_t ret, n;
+ bool async = !is_sync_kiocb(iocb);
+
+ _enter("");
+
+ /* We're going to need a bounce buffer if what we transmit is going to
+ * be different in some way to the source buffer, e.g. because it gets
+ * encrypted/compressed or because it needs expanding to a block size.
+ */
+ // TODO
+
+ _debug("uw %llx-%llx", start, end);
+
+ wreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp,
+ start, end - start,
+ iocb->ki_flags & IOCB_DIRECT ?
+ NETFS_DIO_WRITE : NETFS_UNBUFFERED_WRITE);
+ if (IS_ERR(wreq))
+ return PTR_ERR(wreq);
+
+ {
+ /* If this is an async op and we're not using a bounce buffer,
+ * we have to save the source buffer as the iterator is only
+ * good until we return. In such a case, extract an iterator
+ * to represent as much of the the output buffer as we can
+ * manage. Note that the extraction might not be able to
+ * allocate a sufficiently large bvec array and may shorten the
+ * request.
+ */
+ if (async || user_backed_iter(iter)) {
+ n = netfs_extract_user_iter(iter, wreq->len, &wreq->iter, 0);
+ if (n < 0) {
+ ret = n;
+ goto out;
+ }
+ wreq->direct_bv = (struct bio_vec *)wreq->iter.bvec;
+ wreq->direct_bv_count = n;
+ wreq->direct_bv_unpin = iov_iter_extract_will_pin(iter);
+ wreq->len = iov_iter_count(&wreq->iter);
+ } else {
+ wreq->iter = *iter;
+ }
+
+ wreq->io_iter = wreq->iter;
+ }
+
+ /* Copy the data into the bounce buffer and encrypt it. */
+ // TODO
+
+ /* Dispatch the write. */
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ if (async)
+ wreq->iocb = iocb;
+ wreq->cleanup = netfs_cleanup_dio_write;
+ ret = netfs_begin_write(wreq, is_sync_kiocb(iocb),
+ iocb->ki_flags & IOCB_DIRECT ?
+ netfs_write_trace_dio_write :
+ netfs_write_trace_unbuffered_write);
+ if (ret < 0) {
+ _debug("begin = %zd", ret);
+ goto out;
+ }
+
+ if (!async) {
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip);
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+
+ ret = wreq->error;
+ _debug("waited = %zd", ret);
+ if (ret == 0) {
+ ret = wreq->transferred;
+ iocb->ki_pos += ret;
+ }
+ } else {
+ ret = -EIOCBQUEUED;
+ }
+
+out:
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
+
+/**
+ * netfs_unbuffered_write_iter - Unbuffered write to a file
+ * @iocb: IO state structure
+ * @from: iov_iter with data to write
+ *
+ * Do an unbuffered write to a file, writing the data directly to the server
+ * and not lodging the data in the pagecache.
+ *
+ * Return:
+ * * Negative error code if no data has been written at all of
+ * vfs_fsync_range() failed for a synchronous write
+ * * Number of bytes written, even for truncated writes
+ */
+ssize_t netfs_unbuffered_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file->f_mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ unsigned long long end;
+ ssize_t ret;
+
+ _enter("%llx,%zx,%llx", iocb->ki_pos, iov_iter_count(from), i_size_read(inode));
+
+ if (!iov_iter_count(from))
+ return 0;
+
+ trace_netfs_write_iter(iocb, from);
+ netfs_stat(&netfs_n_rh_dio_write);
+
+ ret = netfs_start_io_direct(inode);
+ if (ret < 0)
+ return ret;
+ ret = generic_write_checks(iocb, from);
+ if (ret <= 0)
+ goto out;
+ ret = file_remove_privs(file);
+ if (ret < 0)
+ goto out;
+ ret = file_update_time(file);
+ if (ret < 0)
+ goto out;
+ ret = kiocb_invalidate_pages(iocb, iov_iter_count(from));
+ if (ret < 0)
+ goto out;
+ end = iocb->ki_pos + iov_iter_count(from);
+ if (end > ictx->zero_point)
+ ictx->zero_point = end;
+
+ fscache_invalidate(netfs_i_cookie(ictx), NULL, i_size_read(inode),
+ FSCACHE_INVAL_DIO_WRITE);
+ ret = netfs_unbuffered_write_iter_locked(iocb, from, NULL);
+out:
+ netfs_end_io_direct(inode);
+ return ret;
+}
+EXPORT_SYMBOL(netfs_unbuffered_write_iter);
diff --git a/fs/fscache/cache.c b/fs/netfs/fscache_cache.c
index d645f8b302a2..9397ed39b0b4 100644
--- a/fs/fscache/cache.c
+++ b/fs/netfs/fscache_cache.c
@@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache);
void fscache_put_cache(struct fscache_cache *cache,
enum fscache_cache_trace where)
{
- unsigned int debug_id = cache->debug_id;
+ unsigned int debug_id;
bool zero;
int ref;
if (IS_ERR_OR_NULL(cache))
return;
+ debug_id = cache->debug_id;
zero = __refcount_dec_and_test(&cache->ref, &ref);
trace_fscache_cache(debug_id, ref - 1, where);
diff --git a/fs/fscache/cookie.c b/fs/netfs/fscache_cookie.c
index bce2492186d0..bce2492186d0 100644
--- a/fs/fscache/cookie.c
+++ b/fs/netfs/fscache_cookie.c
diff --git a/fs/netfs/fscache_internal.h b/fs/netfs/fscache_internal.h
new file mode 100644
index 000000000000..a09b948fcef2
--- /dev/null
+++ b/fs/netfs/fscache_internal.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Internal definitions for FS-Cache
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include "internal.h"
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) "FS-Cache: " fmt
diff --git a/fs/fscache/io.c b/fs/netfs/fscache_io.c
index 0d2b8dec8f82..ad572f7ee897 100644
--- a/fs/fscache/io.c
+++ b/fs/netfs/fscache_io.c
@@ -158,46 +158,6 @@ int __fscache_begin_write_operation(struct netfs_cache_resources *cres,
}
EXPORT_SYMBOL(__fscache_begin_write_operation);
-/**
- * fscache_dirty_folio - Mark folio dirty and pin a cache object for writeback
- * @mapping: The mapping the folio belongs to.
- * @folio: The folio being dirtied.
- * @cookie: The cookie referring to the cache object
- *
- * Set the dirty flag on a folio and pin an in-use cache object in memory
- * so that writeback can later write to it. This is intended
- * to be called from the filesystem's ->dirty_folio() method.
- *
- * Return: true if the dirty flag was set on the folio, false otherwise.
- */
-bool fscache_dirty_folio(struct address_space *mapping, struct folio *folio,
- struct fscache_cookie *cookie)
-{
- struct inode *inode = mapping->host;
- bool need_use = false;
-
- _enter("");
-
- if (!filemap_dirty_folio(mapping, folio))
- return false;
- if (!fscache_cookie_valid(cookie))
- return true;
-
- if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
- spin_lock(&inode->i_lock);
- if (!(inode->i_state & I_PINNING_FSCACHE_WB)) {
- inode->i_state |= I_PINNING_FSCACHE_WB;
- need_use = true;
- }
- spin_unlock(&inode->i_lock);
-
- if (need_use)
- fscache_use_cookie(cookie, true);
- }
- return true;
-}
-EXPORT_SYMBOL(fscache_dirty_folio);
-
struct fscache_write_request {
struct netfs_cache_resources cache_resources;
struct address_space *mapping;
@@ -277,7 +237,7 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie,
fscache_access_io_write) < 0)
goto abandon_free;
- ret = cres->ops->prepare_write(cres, &start, &len, i_size, false);
+ ret = cres->ops->prepare_write(cres, &start, &len, len, i_size, false);
if (ret < 0)
goto abandon_end;
diff --git a/fs/fscache/main.c b/fs/netfs/fscache_main.c
index dad85fd84f6f..42e98bb523e3 100644
--- a/fs/fscache/main.c
+++ b/fs/netfs/fscache_main.c
@@ -8,18 +8,9 @@
#define FSCACHE_DEBUG_LEVEL CACHE
#include <linux/module.h>
#include <linux/init.h>
-#define CREATE_TRACE_POINTS
#include "internal.h"
-
-MODULE_DESCRIPTION("FS Cache Manager");
-MODULE_AUTHOR("Red Hat, Inc.");
-MODULE_LICENSE("GPL");
-
-unsigned fscache_debug;
-module_param_named(debug, fscache_debug, uint,
- S_IWUSR | S_IRUGO);
-MODULE_PARM_DESC(fscache_debug,
- "FS-Cache debugging mask");
+#define CREATE_TRACE_POINTS
+#include <trace/events/fscache.h>
EXPORT_TRACEPOINT_SYMBOL(fscache_access_cache);
EXPORT_TRACEPOINT_SYMBOL(fscache_access_volume);
@@ -71,7 +62,7 @@ unsigned int fscache_hash(unsigned int salt, const void *data, size_t len)
/*
* initialise the fs caching module
*/
-static int __init fscache_init(void)
+int __init fscache_init(void)
{
int ret = -ENOMEM;
@@ -92,7 +83,7 @@ static int __init fscache_init(void)
goto error_cookie_jar;
}
- pr_notice("Loaded\n");
+ pr_notice("FS-Cache loaded\n");
return 0;
error_cookie_jar:
@@ -103,19 +94,15 @@ error_wq:
return ret;
}
-fs_initcall(fscache_init);
-
/*
* clean up on module removal
*/
-static void __exit fscache_exit(void)
+void __exit fscache_exit(void)
{
_enter("");
kmem_cache_destroy(fscache_cookie_jar);
fscache_proc_cleanup();
destroy_workqueue(fscache_wq);
- pr_notice("Unloaded\n");
+ pr_notice("FS-Cache unloaded\n");
}
-
-module_exit(fscache_exit);
diff --git a/fs/fscache/proc.c b/fs/netfs/fscache_proc.c
index dc3b0e9c8cce..874d951bc390 100644
--- a/fs/fscache/proc.c
+++ b/fs/netfs/fscache_proc.c
@@ -12,41 +12,34 @@
#include "internal.h"
/*
- * initialise the /proc/fs/fscache/ directory
+ * Add files to /proc/fs/netfs/.
*/
int __init fscache_proc_init(void)
{
- if (!proc_mkdir("fs/fscache", NULL))
- goto error_dir;
+ if (!proc_symlink("fs/fscache", NULL, "netfs"))
+ goto error_sym;
- if (!proc_create_seq("fs/fscache/caches", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/caches", S_IFREG | 0444, NULL,
&fscache_caches_seq_ops))
goto error;
- if (!proc_create_seq("fs/fscache/volumes", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/volumes", S_IFREG | 0444, NULL,
&fscache_volumes_seq_ops))
goto error;
- if (!proc_create_seq("fs/fscache/cookies", S_IFREG | 0444, NULL,
+ if (!proc_create_seq("fs/netfs/cookies", S_IFREG | 0444, NULL,
&fscache_cookies_seq_ops))
goto error;
-
-#ifdef CONFIG_FSCACHE_STATS
- if (!proc_create_single("fs/fscache/stats", S_IFREG | 0444, NULL,
- fscache_stats_show))
- goto error;
-#endif
-
return 0;
error:
remove_proc_entry("fs/fscache", NULL);
-error_dir:
+error_sym:
return -ENOMEM;
}
/*
- * clean up the /proc/fs/fscache/ directory
+ * Clean up the /proc/fs/fscache symlink.
*/
void fscache_proc_cleanup(void)
{
diff --git a/fs/fscache/stats.c b/fs/netfs/fscache_stats.c
index fc94e5e79f1c..add21abdf713 100644
--- a/fs/fscache/stats.c
+++ b/fs/netfs/fscache_stats.c
@@ -48,13 +48,15 @@ atomic_t fscache_n_no_create_space;
EXPORT_SYMBOL(fscache_n_no_create_space);
atomic_t fscache_n_culled;
EXPORT_SYMBOL(fscache_n_culled);
+atomic_t fscache_n_dio_misfit;
+EXPORT_SYMBOL(fscache_n_dio_misfit);
/*
* display the general statistics
*/
-int fscache_stats_show(struct seq_file *m, void *v)
+int fscache_stats_show(struct seq_file *m)
{
- seq_puts(m, "FS-Cache statistics\n");
+ seq_puts(m, "-- FS-Cache statistics --\n");
seq_printf(m, "Cookies: n=%d v=%d vcol=%u voom=%u\n",
atomic_read(&fscache_n_cookies),
atomic_read(&fscache_n_volumes),
@@ -93,10 +95,9 @@ int fscache_stats_show(struct seq_file *m, void *v)
atomic_read(&fscache_n_no_create_space),
atomic_read(&fscache_n_culled));
- seq_printf(m, "IO : rd=%u wr=%u\n",
+ seq_printf(m, "IO : rd=%u wr=%u mis=%u\n",
atomic_read(&fscache_n_read),
- atomic_read(&fscache_n_write));
-
- netfs_stats_show(m);
+ atomic_read(&fscache_n_write),
+ atomic_read(&fscache_n_dio_misfit));
return 0;
}
diff --git a/fs/fscache/volume.c b/fs/netfs/fscache_volume.c
index cdf991bdd9de..cdf991bdd9de 100644
--- a/fs/fscache/volume.c
+++ b/fs/netfs/fscache_volume.c
diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h
index 43fac1b14e40..ec7045d24400 100644
--- a/fs/netfs/internal.h
+++ b/fs/netfs/internal.h
@@ -5,9 +5,13 @@
* Written by David Howells (dhowells@redhat.com)
*/
+#include <linux/slab.h>
+#include <linux/seq_file.h>
#include <linux/netfs.h>
#include <linux/fscache.h>
+#include <linux/fscache-cache.h>
#include <trace/events/netfs.h>
+#include <trace/events/fscache.h>
#ifdef pr_fmt
#undef pr_fmt
@@ -19,6 +23,8 @@
* buffered_read.c
*/
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq);
+int netfs_prefetch_for_write(struct file *file, struct folio *folio,
+ size_t offset, size_t len);
/*
* io.c
@@ -29,6 +35,41 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync);
* main.c
*/
extern unsigned int netfs_debug;
+extern struct list_head netfs_io_requests;
+extern spinlock_t netfs_proc_lock;
+
+#ifdef CONFIG_PROC_FS
+static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq)
+{
+ spin_lock(&netfs_proc_lock);
+ list_add_tail_rcu(&rreq->proc_link, &netfs_io_requests);
+ spin_unlock(&netfs_proc_lock);
+}
+static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq)
+{
+ if (!list_empty(&rreq->proc_link)) {
+ spin_lock(&netfs_proc_lock);
+ list_del_rcu(&rreq->proc_link);
+ spin_unlock(&netfs_proc_lock);
+ }
+}
+#else
+static inline void netfs_proc_add_rreq(struct netfs_io_request *rreq) {}
+static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {}
+#endif
+
+/*
+ * misc.c
+ */
+#define NETFS_FLAG_PUT_MARK BIT(0)
+#define NETFS_FLAG_PAGECACHE_MARK BIT(1)
+int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
+ struct folio *folio, unsigned int flags,
+ gfp_t gfp_mask);
+int netfs_add_folios_to_buffer(struct xarray *buffer,
+ struct address_space *mapping,
+ pgoff_t index, pgoff_t to, gfp_t gfp_mask);
+void netfs_clear_buffer(struct xarray *buffer);
/*
* objects.c
@@ -50,9 +91,20 @@ static inline void netfs_see_request(struct netfs_io_request *rreq,
}
/*
+ * output.c
+ */
+int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
+ enum netfs_write_trace what);
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len);
+int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end);
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb);
+
+/*
* stats.c
*/
#ifdef CONFIG_NETFS_STATS
+extern atomic_t netfs_n_rh_dio_read;
+extern atomic_t netfs_n_rh_dio_write;
extern atomic_t netfs_n_rh_readahead;
extern atomic_t netfs_n_rh_readpage;
extern atomic_t netfs_n_rh_rreq;
@@ -71,7 +123,15 @@ extern atomic_t netfs_n_rh_write_begin;
extern atomic_t netfs_n_rh_write_done;
extern atomic_t netfs_n_rh_write_failed;
extern atomic_t netfs_n_rh_write_zskip;
+extern atomic_t netfs_n_wh_wstream_conflict;
+extern atomic_t netfs_n_wh_upload;
+extern atomic_t netfs_n_wh_upload_done;
+extern atomic_t netfs_n_wh_upload_failed;
+extern atomic_t netfs_n_wh_write;
+extern atomic_t netfs_n_wh_write_done;
+extern atomic_t netfs_n_wh_write_failed;
+int netfs_stats_show(struct seq_file *m, void *v);
static inline void netfs_stat(atomic_t *stat)
{
@@ -103,6 +163,176 @@ static inline bool netfs_is_cache_enabled(struct netfs_inode *ctx)
#endif
}
+/*
+ * Get a ref on a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline struct netfs_group *netfs_get_group(struct netfs_group *netfs_group)
+{
+ if (netfs_group)
+ refcount_inc(&netfs_group->ref);
+ return netfs_group;
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group(struct netfs_group *netfs_group)
+{
+ if (netfs_group && refcount_dec_and_test(&netfs_group->ref))
+ netfs_group->free(netfs_group);
+}
+
+/*
+ * Dispose of a netfs group attached to a dirty page (e.g. a ceph snap).
+ */
+static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr)
+{
+ if (netfs_group && refcount_sub_and_test(nr, &netfs_group->ref))
+ netfs_group->free(netfs_group);
+}
+
+/*
+ * fscache-cache.c
+ */
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_caches_seq_ops;
+#endif
+bool fscache_begin_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
+void fscache_end_cache_access(struct fscache_cache *cache, enum fscache_access_trace why);
+struct fscache_cache *fscache_lookup_cache(const char *name, bool is_cache);
+void fscache_put_cache(struct fscache_cache *cache, enum fscache_cache_trace where);
+
+static inline enum fscache_cache_state fscache_cache_state(const struct fscache_cache *cache)
+{
+ return smp_load_acquire(&cache->state);
+}
+
+static inline bool fscache_cache_is_live(const struct fscache_cache *cache)
+{
+ return fscache_cache_state(cache) == FSCACHE_CACHE_IS_ACTIVE;
+}
+
+static inline void fscache_set_cache_state(struct fscache_cache *cache,
+ enum fscache_cache_state new_state)
+{
+ smp_store_release(&cache->state, new_state);
+
+}
+
+static inline bool fscache_set_cache_state_maybe(struct fscache_cache *cache,
+ enum fscache_cache_state old_state,
+ enum fscache_cache_state new_state)
+{
+ return try_cmpxchg_release(&cache->state, &old_state, new_state);
+}
+
+/*
+ * fscache-cookie.c
+ */
+extern struct kmem_cache *fscache_cookie_jar;
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_cookies_seq_ops;
+#endif
+extern struct timer_list fscache_cookie_lru_timer;
+
+extern void fscache_print_cookie(struct fscache_cookie *cookie, char prefix);
+extern bool fscache_begin_cookie_access(struct fscache_cookie *cookie,
+ enum fscache_access_trace why);
+
+static inline void fscache_see_cookie(struct fscache_cookie *cookie,
+ enum fscache_cookie_trace where)
+{
+ trace_fscache_cookie(cookie->debug_id, refcount_read(&cookie->ref),
+ where);
+}
+
+/*
+ * fscache-main.c
+ */
+extern unsigned int fscache_hash(unsigned int salt, const void *data, size_t len);
+#ifdef CONFIG_FSCACHE
+int __init fscache_init(void);
+void __exit fscache_exit(void);
+#else
+static inline int fscache_init(void) { return 0; }
+static inline void fscache_exit(void) {}
+#endif
+
+/*
+ * fscache-proc.c
+ */
+#ifdef CONFIG_PROC_FS
+extern int __init fscache_proc_init(void);
+extern void fscache_proc_cleanup(void);
+#else
+#define fscache_proc_init() (0)
+#define fscache_proc_cleanup() do {} while (0)
+#endif
+
+/*
+ * fscache-stats.c
+ */
+#ifdef CONFIG_FSCACHE_STATS
+extern atomic_t fscache_n_volumes;
+extern atomic_t fscache_n_volumes_collision;
+extern atomic_t fscache_n_volumes_nomem;
+extern atomic_t fscache_n_cookies;
+extern atomic_t fscache_n_cookies_lru;
+extern atomic_t fscache_n_cookies_lru_expired;
+extern atomic_t fscache_n_cookies_lru_removed;
+extern atomic_t fscache_n_cookies_lru_dropped;
+
+extern atomic_t fscache_n_acquires;
+extern atomic_t fscache_n_acquires_ok;
+extern atomic_t fscache_n_acquires_oom;
+
+extern atomic_t fscache_n_invalidates;
+
+extern atomic_t fscache_n_relinquishes;
+extern atomic_t fscache_n_relinquishes_retire;
+extern atomic_t fscache_n_relinquishes_dropped;
+
+extern atomic_t fscache_n_resizes;
+extern atomic_t fscache_n_resizes_null;
+
+static inline void fscache_stat(atomic_t *stat)
+{
+ atomic_inc(stat);
+}
+
+static inline void fscache_stat_d(atomic_t *stat)
+{
+ atomic_dec(stat);
+}
+
+#define __fscache_stat(stat) (stat)
+
+int fscache_stats_show(struct seq_file *m);
+#else
+
+#define __fscache_stat(stat) (NULL)
+#define fscache_stat(stat) do {} while (0)
+#define fscache_stat_d(stat) do {} while (0)
+
+static inline int fscache_stats_show(struct seq_file *m) { return 0; }
+#endif
+
+/*
+ * fscache-volume.c
+ */
+#ifdef CONFIG_PROC_FS
+extern const struct seq_operations fscache_volumes_seq_ops;
+#endif
+
+struct fscache_volume *fscache_get_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where);
+void fscache_put_volume(struct fscache_volume *volume,
+ enum fscache_volume_trace where);
+bool fscache_begin_volume_access(struct fscache_volume *volume,
+ struct fscache_cookie *cookie,
+ enum fscache_access_trace why);
+void fscache_create_volume(struct fscache_volume *volume, bool wait);
+
/*****************************************************************************/
/*
* debug tracing
@@ -143,3 +373,57 @@ do { \
#define _leave(FMT, ...) no_printk("<== %s()"FMT"", __func__, ##__VA_ARGS__)
#define _debug(FMT, ...) no_printk(FMT, ##__VA_ARGS__)
#endif
+
+/*
+ * assertions
+ */
+#if 1 /* defined(__KDEBUGALL) */
+
+#define ASSERT(X) \
+do { \
+ if (unlikely(!(X))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTCMP(X, OP, Y) \
+do { \
+ if (unlikely(!((X) OP (Y)))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIF(C, X) \
+do { \
+ if (unlikely((C) && !(X))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ BUG(); \
+ } \
+} while (0)
+
+#define ASSERTIFCMP(C, X, OP, Y) \
+do { \
+ if (unlikely((C) && !((X) OP (Y)))) { \
+ pr_err("\n"); \
+ pr_err("Assertion failed\n"); \
+ pr_err("%lx " #OP " %lx is false\n", \
+ (unsigned long)(X), (unsigned long)(Y)); \
+ BUG(); \
+ } \
+} while (0)
+
+#else
+
+#define ASSERT(X) do {} while (0)
+#define ASSERTCMP(X, OP, Y) do {} while (0)
+#define ASSERTIF(C, X) do {} while (0)
+#define ASSERTIFCMP(C, X, OP, Y) do {} while (0)
+
+#endif /* assert or not */
diff --git a/fs/netfs/io.c b/fs/netfs/io.c
index 7f753380e047..4261ad6c55b6 100644
--- a/fs/netfs/io.c
+++ b/fs/netfs/io.c
@@ -21,12 +21,7 @@
*/
static void netfs_clear_unread(struct netfs_io_subrequest *subreq)
{
- struct iov_iter iter;
-
- iov_iter_xarray(&iter, ITER_DEST, &subreq->rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
- iov_iter_zero(iov_iter_count(&iter), &iter);
+ iov_iter_zero(iov_iter_count(&subreq->io_iter), &subreq->io_iter);
}
static void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error,
@@ -46,14 +41,9 @@ static void netfs_read_from_cache(struct netfs_io_request *rreq,
enum netfs_read_from_hole read_hole)
{
struct netfs_cache_resources *cres = &rreq->cache_resources;
- struct iov_iter iter;
netfs_stat(&netfs_n_rh_read);
- iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages,
- subreq->start + subreq->transferred,
- subreq->len - subreq->transferred);
-
- cres->ops->read(cres, subreq->start, &iter, read_hole,
+ cres->ops->read(cres, subreq->start, &subreq->io_iter, read_hole,
netfs_cache_read_terminated, subreq);
}
@@ -88,6 +78,13 @@ static void netfs_read_from_server(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
netfs_stat(&netfs_n_rh_download);
+
+ if (rreq->origin != NETFS_DIO_READ &&
+ iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+ pr_warn("R=%08x[%u] ITER PRE-MISMATCH %zx != %zx-%zx %lx\n",
+ rreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->len,
+ subreq->transferred, subreq->flags);
rreq->netfs_ops->issue_read(subreq);
}
@@ -127,9 +124,10 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq,
/* We might have multiple writes from the same huge
* folio, but we mustn't unlock a folio more than once.
*/
- if (have_unlocked && folio_index(folio) <= unlocked)
+ if (have_unlocked && folio->index <= unlocked)
continue;
- unlocked = folio_index(folio);
+ unlocked = folio_next_index(folio) - 1;
+ trace_netfs_folio(folio, netfs_folio_trace_end_copy);
folio_end_fscache(folio);
have_unlocked = true;
}
@@ -201,7 +199,7 @@ static void netfs_rreq_do_write_to_cache(struct netfs_io_request *rreq)
}
ret = cres->ops->prepare_write(cres, &subreq->start, &subreq->len,
- rreq->i_size, true);
+ subreq->len, rreq->i_size, true);
if (ret < 0) {
trace_netfs_failure(rreq, subreq, ret, netfs_fail_prepare_write);
trace_netfs_sreq(subreq, netfs_sreq_trace_write_skip);
@@ -260,6 +258,30 @@ static void netfs_rreq_short_read(struct netfs_io_request *rreq,
}
/*
+ * Reset the subrequest iterator prior to resubmission.
+ */
+static void netfs_reset_subreq_iter(struct netfs_io_request *rreq,
+ struct netfs_io_subrequest *subreq)
+{
+ size_t remaining = subreq->len - subreq->transferred;
+ size_t count = iov_iter_count(&subreq->io_iter);
+
+ if (count == remaining)
+ return;
+
+ _debug("R=%08x[%u] ITER RESUB-MISMATCH %zx != %zx-%zx-%llx %x\n",
+ rreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->transferred,
+ subreq->len, rreq->i_size,
+ subreq->io_iter.iter_type);
+
+ if (count < remaining)
+ iov_iter_revert(&subreq->io_iter, remaining - count);
+ else
+ iov_iter_advance(&subreq->io_iter, count - remaining);
+}
+
+/*
* Resubmit any short or failed operations. Returns true if we got the rreq
* ref back.
*/
@@ -287,6 +309,7 @@ static bool netfs_rreq_perform_resubmissions(struct netfs_io_request *rreq)
trace_netfs_sreq(subreq, netfs_sreq_trace_download_instead);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
atomic_inc(&rreq->nr_outstanding);
+ netfs_reset_subreq_iter(rreq, subreq);
netfs_read_from_server(rreq, subreq);
} else if (test_bit(NETFS_SREQ_SHORT_IO, &subreq->flags)) {
netfs_rreq_short_read(rreq, subreq);
@@ -321,6 +344,43 @@ static void netfs_rreq_is_still_valid(struct netfs_io_request *rreq)
}
/*
+ * Determine how much we can admit to having read from a DIO read.
+ */
+static void netfs_rreq_assess_dio(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ unsigned int i;
+ size_t transferred = 0;
+
+ for (i = 0; i < rreq->direct_bv_count; i++)
+ flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+ list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
+ if (subreq->error || subreq->transferred == 0)
+ break;
+ transferred += subreq->transferred;
+ if (subreq->transferred < subreq->len)
+ break;
+ }
+
+ for (i = 0; i < rreq->direct_bv_count; i++)
+ flush_dcache_page(rreq->direct_bv[i].bv_page);
+
+ rreq->transferred = transferred;
+ task_io_account_read(transferred);
+
+ if (rreq->iocb) {
+ rreq->iocb->ki_pos += transferred;
+ if (rreq->iocb->ki_complete)
+ rreq->iocb->ki_complete(
+ rreq->iocb, rreq->error ? rreq->error : transferred);
+ }
+ if (rreq->netfs_ops->done)
+ rreq->netfs_ops->done(rreq);
+ inode_dio_end(rreq->inode);
+}
+
+/*
* Assess the state of a read request and decide what to do next.
*
* Note that we could be in an ordinary kernel thread, on a workqueue or in
@@ -340,8 +400,12 @@ again:
return;
}
- netfs_rreq_unlock_folios(rreq);
+ if (rreq->origin != NETFS_DIO_READ)
+ netfs_rreq_unlock_folios(rreq);
+ else
+ netfs_rreq_assess_dio(rreq);
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip);
clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
wake_up_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS);
@@ -399,9 +463,9 @@ void netfs_subreq_terminated(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq = subreq->rreq;
int u;
- _enter("[%u]{%llx,%lx},%zd",
- subreq->debug_index, subreq->start, subreq->flags,
- transferred_or_error);
+ _enter("R=%x[%x]{%llx,%lx},%zd",
+ rreq->debug_id, subreq->debug_index,
+ subreq->start, subreq->flags, transferred_or_error);
switch (subreq->source) {
case NETFS_READ_FROM_CACHE:
@@ -501,15 +565,20 @@ static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_subrequest
*/
static enum netfs_io_source
netfs_rreq_prepare_read(struct netfs_io_request *rreq,
- struct netfs_io_subrequest *subreq)
+ struct netfs_io_subrequest *subreq,
+ struct iov_iter *io_iter)
{
- enum netfs_io_source source;
+ enum netfs_io_source source = NETFS_DOWNLOAD_FROM_SERVER;
+ struct netfs_inode *ictx = netfs_inode(rreq->inode);
+ size_t lsize;
_enter("%llx-%llx,%llx", subreq->start, subreq->start + subreq->len, rreq->i_size);
- source = netfs_cache_prepare_read(subreq, rreq->i_size);
- if (source == NETFS_INVALID_READ)
- goto out;
+ if (rreq->origin != NETFS_DIO_READ) {
+ source = netfs_cache_prepare_read(subreq, rreq->i_size);
+ if (source == NETFS_INVALID_READ)
+ goto out;
+ }
if (source == NETFS_DOWNLOAD_FROM_SERVER) {
/* Call out to the netfs to let it shrink the request to fit
@@ -518,19 +587,52 @@ netfs_rreq_prepare_read(struct netfs_io_request *rreq,
* to make serial calls, it can indicate a short read and then
* we will call it again.
*/
+ if (rreq->origin != NETFS_DIO_READ) {
+ if (subreq->start >= ictx->zero_point) {
+ source = NETFS_FILL_WITH_ZEROES;
+ goto set;
+ }
+ if (subreq->len > ictx->zero_point - subreq->start)
+ subreq->len = ictx->zero_point - subreq->start;
+ }
if (subreq->len > rreq->i_size - subreq->start)
subreq->len = rreq->i_size - subreq->start;
+ if (rreq->rsize && subreq->len > rreq->rsize)
+ subreq->len = rreq->rsize;
if (rreq->netfs_ops->clamp_length &&
!rreq->netfs_ops->clamp_length(subreq)) {
source = NETFS_INVALID_READ;
goto out;
}
+
+ if (subreq->max_nr_segs) {
+ lsize = netfs_limit_iter(io_iter, 0, subreq->len,
+ subreq->max_nr_segs);
+ if (subreq->len > lsize) {
+ subreq->len = lsize;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_limited);
+ }
+ }
}
- if (WARN_ON(subreq->len == 0))
+set:
+ if (subreq->len > rreq->len)
+ pr_warn("R=%08x[%u] SREQ>RREQ %zx > %zx\n",
+ rreq->debug_id, subreq->debug_index,
+ subreq->len, rreq->len);
+
+ if (WARN_ON(subreq->len == 0)) {
source = NETFS_INVALID_READ;
+ goto out;
+ }
+ subreq->source = source;
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+
+ subreq->io_iter = *io_iter;
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+ iov_iter_advance(io_iter, subreq->len);
out:
subreq->source = source;
trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
@@ -541,6 +643,7 @@ out:
* Slice off a piece of a read request and submit an I/O request for it.
*/
static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
+ struct iov_iter *io_iter,
unsigned int *_debug_index)
{
struct netfs_io_subrequest *subreq;
@@ -552,7 +655,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
subreq->debug_index = (*_debug_index)++;
subreq->start = rreq->start + rreq->submitted;
- subreq->len = rreq->len - rreq->submitted;
+ subreq->len = io_iter->count;
_debug("slice %llx,%zx,%zx", subreq->start, subreq->len, rreq->submitted);
list_add_tail(&subreq->rreq_link, &rreq->subrequests);
@@ -565,7 +668,7 @@ static bool netfs_rreq_submit_slice(struct netfs_io_request *rreq,
* (the starts must coincide), in which case, we go around the loop
* again and ask it to download the next piece.
*/
- source = netfs_rreq_prepare_read(rreq, subreq);
+ source = netfs_rreq_prepare_read(rreq, subreq, io_iter);
if (source == NETFS_INVALID_READ)
goto subreq_failed;
@@ -603,6 +706,7 @@ subreq_failed:
*/
int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
{
+ struct iov_iter io_iter;
unsigned int debug_index = 0;
int ret;
@@ -611,50 +715,73 @@ int netfs_begin_read(struct netfs_io_request *rreq, bool sync)
if (rreq->len == 0) {
pr_err("Zero-sized read [R=%x]\n", rreq->debug_id);
- netfs_put_request(rreq, false, netfs_rreq_trace_put_zero_len);
return -EIO;
}
- INIT_WORK(&rreq->work, netfs_rreq_work);
+ if (rreq->origin == NETFS_DIO_READ)
+ inode_dio_begin(rreq->inode);
- if (sync)
- netfs_get_request(rreq, netfs_rreq_trace_get_hold);
+ // TODO: Use bounce buffer if requested
+ rreq->io_iter = rreq->iter;
+
+ INIT_WORK(&rreq->work, netfs_rreq_work);
/* Chop the read into slices according to what the cache and the netfs
* want and submit each one.
*/
+ netfs_get_request(rreq, netfs_rreq_trace_get_for_outstanding);
atomic_set(&rreq->nr_outstanding, 1);
+ io_iter = rreq->io_iter;
do {
- if (!netfs_rreq_submit_slice(rreq, &debug_index))
+ _debug("submit %llx + %zx >= %llx",
+ rreq->start, rreq->submitted, rreq->i_size);
+ if (rreq->origin == NETFS_DIO_READ &&
+ rreq->start + rreq->submitted >= rreq->i_size)
+ break;
+ if (!netfs_rreq_submit_slice(rreq, &io_iter, &debug_index))
+ break;
+ if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) &&
+ test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags))
break;
} while (rreq->submitted < rreq->len);
+ if (!rreq->submitted) {
+ netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit);
+ if (rreq->origin == NETFS_DIO_READ)
+ inode_dio_end(rreq->inode);
+ ret = 0;
+ goto out;
+ }
+
if (sync) {
- /* Keep nr_outstanding incremented so that the ref always belongs to
- * us, and the service code isn't punted off to a random thread pool to
- * process.
+ /* Keep nr_outstanding incremented so that the ref always
+ * belongs to us, and the service code isn't punted off to a
+ * random thread pool to process. Note that this might start
+ * further work, such as writing to the cache.
*/
- for (;;) {
- wait_var_event(&rreq->nr_outstanding,
- atomic_read(&rreq->nr_outstanding) == 1);
+ wait_var_event(&rreq->nr_outstanding,
+ atomic_read(&rreq->nr_outstanding) == 1);
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_assess(rreq, false);
- if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags))
- break;
- cond_resched();
- }
+
+ trace_netfs_rreq(rreq, netfs_rreq_trace_wait_ip);
+ wait_on_bit(&rreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
ret = rreq->error;
- if (ret == 0 && rreq->submitted < rreq->len) {
+ if (ret == 0 && rreq->submitted < rreq->len &&
+ rreq->origin != NETFS_DIO_READ) {
trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read);
ret = -EIO;
}
- netfs_put_request(rreq, false, netfs_rreq_trace_put_hold);
} else {
/* If we decrement nr_outstanding to 0, the ref belongs to us. */
if (atomic_dec_and_test(&rreq->nr_outstanding))
netfs_rreq_assess(rreq, false);
- ret = 0;
+ ret = -EIOCBQUEUED;
}
+
+out:
return ret;
}
diff --git a/fs/netfs/iterator.c b/fs/netfs/iterator.c
index 2ff07ba655a0..b781bbbf1d8d 100644
--- a/fs/netfs/iterator.c
+++ b/fs/netfs/iterator.c
@@ -101,3 +101,100 @@ ssize_t netfs_extract_user_iter(struct iov_iter *orig, size_t orig_len,
return npages;
}
EXPORT_SYMBOL_GPL(netfs_extract_user_iter);
+
+/*
+ * Select the span of a bvec iterator we're going to use. Limit it by both maximum
+ * size and maximum number of segments. Returns the size of the span in bytes.
+ */
+static size_t netfs_limit_bvec(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ const struct bio_vec *bvecs = iter->bvec;
+ unsigned int nbv = iter->nr_segs, ix = 0, nsegs = 0;
+ size_t len, span = 0, n = iter->count;
+ size_t skip = iter->iov_offset + start_offset;
+
+ if (WARN_ON(!iov_iter_is_bvec(iter)) ||
+ WARN_ON(start_offset > n) ||
+ n == 0)
+ return 0;
+
+ while (n && ix < nbv && skip) {
+ len = bvecs[ix].bv_len;
+ if (skip < len)
+ break;
+ skip -= len;
+ n -= len;
+ ix++;
+ }
+
+ while (n && ix < nbv) {
+ len = min3(n, bvecs[ix].bv_len - skip, max_size);
+ span += len;
+ nsegs++;
+ ix++;
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+ skip = 0;
+ n -= len;
+ }
+
+ return min(span, max_size);
+}
+
+/*
+ * Select the span of an xarray iterator we're going to use. Limit it by both
+ * maximum size and maximum number of segments. It is assumed that segments
+ * can be larger than a page in size, provided they're physically contiguous.
+ * Returns the size of the span in bytes.
+ */
+static size_t netfs_limit_xarray(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ struct folio *folio;
+ unsigned int nsegs = 0;
+ loff_t pos = iter->xarray_start + iter->iov_offset;
+ pgoff_t index = pos / PAGE_SIZE;
+ size_t span = 0, n = iter->count;
+
+ XA_STATE(xas, iter->xarray, index);
+
+ if (WARN_ON(!iov_iter_is_xarray(iter)) ||
+ WARN_ON(start_offset > n) ||
+ n == 0)
+ return 0;
+ max_size = min(max_size, n - start_offset);
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, ULONG_MAX) {
+ size_t offset, flen, len;
+ if (xas_retry(&xas, folio))
+ continue;
+ if (WARN_ON(xa_is_value(folio)))
+ break;
+ if (WARN_ON(folio_test_hugetlb(folio)))
+ break;
+
+ flen = folio_size(folio);
+ offset = offset_in_folio(folio, pos);
+ len = min(max_size, flen - offset);
+ span += len;
+ nsegs++;
+ if (span >= max_size || nsegs >= max_segs)
+ break;
+ }
+
+ rcu_read_unlock();
+ return min(span, max_size);
+}
+
+size_t netfs_limit_iter(const struct iov_iter *iter, size_t start_offset,
+ size_t max_size, size_t max_segs)
+{
+ if (iov_iter_is_bvec(iter))
+ return netfs_limit_bvec(iter, start_offset, max_size, max_segs);
+ if (iov_iter_is_xarray(iter))
+ return netfs_limit_xarray(iter, start_offset, max_size, max_segs);
+ BUG();
+}
+EXPORT_SYMBOL(netfs_limit_iter);
diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c
new file mode 100644
index 000000000000..75dc52a49b3a
--- /dev/null
+++ b/fs/netfs/locking.c
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * I/O and data path helper functionality.
+ *
+ * Borrowed from NFS Copyright (c) 2016 Trond Myklebust
+ */
+
+#include <linux/kernel.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+/*
+ * inode_dio_wait_interruptible - wait for outstanding DIO requests to finish
+ * @inode: inode to wait for
+ *
+ * Waits for all pending direct I/O requests to finish so that we can
+ * proceed with a truncate or equivalent operation.
+ *
+ * Must be called under a lock that serializes taking new references
+ * to i_dio_count, usually by inode->i_mutex.
+ */
+static int inode_dio_wait_interruptible(struct inode *inode)
+{
+ if (!atomic_read(&inode->i_dio_count))
+ return 0;
+
+ wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
+ DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
+
+ for (;;) {
+ prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE);
+ if (!atomic_read(&inode->i_dio_count))
+ break;
+ if (signal_pending(current))
+ break;
+ schedule();
+ }
+ finish_wait(wq, &q.wq_entry);
+
+ return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0;
+}
+
+/* Call with exclusively locked inode->i_rwsem */
+static int netfs_block_o_direct(struct netfs_inode *ictx)
+{
+ if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags))
+ return 0;
+ clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ return inode_dio_wait_interruptible(&ictx->inode);
+}
+
+/**
+ * netfs_start_io_read - declare the file is being used for buffered reads
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is unset,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that buffered read operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas direct I/O
+ * operations need to wait to grab an exclusive lock in order to set
+ * NETFS_ICTX_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. the reads.
+ */
+int netfs_start_io_read(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+
+ /* Be an optimist! */
+ if (down_read_interruptible(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) == 0)
+ return 0;
+ up_read(&inode->i_rwsem);
+
+ /* Slow path.... */
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (netfs_block_o_direct(ictx) < 0) {
+ up_write(&inode->i_rwsem);
+ return -ERESTARTSYS;
+ }
+ downgrade_write(&inode->i_rwsem);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_read);
+
+/**
+ * netfs_end_io_read - declare that the buffered read operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_read(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_read(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_read);
+
+/**
+ * netfs_start_io_write - declare the file is being used for buffered writes
+ * @inode: file inode
+ *
+ * Declare that a buffered read operation is about to start, and ensure
+ * that we block all direct I/O.
+ */
+int netfs_start_io_write(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (netfs_block_o_direct(ictx) < 0) {
+ up_write(&inode->i_rwsem);
+ return -ERESTARTSYS;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_write);
+
+/**
+ * netfs_end_io_write - declare that the buffered write operation is done
+ * @inode: file inode
+ *
+ * Declare that a buffered write operation is done, and release the
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_write(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_write(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_write);
+
+/* Call with exclusively locked inode->i_rwsem */
+static int netfs_block_buffered(struct inode *inode)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+ int ret;
+
+ if (!test_bit(NETFS_ICTX_ODIRECT, &ictx->flags)) {
+ set_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ if (inode->i_mapping->nrpages != 0) {
+ unmap_mapping_range(inode->i_mapping, 0, 0, 0);
+ ret = filemap_fdatawait(inode->i_mapping);
+ if (ret < 0) {
+ clear_bit(NETFS_ICTX_ODIRECT, &ictx->flags);
+ return ret;
+ }
+ }
+ }
+ return 0;
+}
+
+/**
+ * netfs_start_io_direct - declare the file is being used for direct i/o
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is about to start, and ensure
+ * that we block all buffered I/O.
+ * On exit, the function ensures that the NETFS_ICTX_ODIRECT flag is set,
+ * and holds a shared lock on inode->i_rwsem to ensure that the flag
+ * cannot be changed.
+ * In practice, this means that direct I/O operations are allowed to
+ * execute in parallel, thanks to the shared lock, whereas buffered I/O
+ * operations need to wait to grab an exclusive lock in order to clear
+ * NETFS_ICTX_ODIRECT.
+ * Note that buffered writes and truncates both take a write lock on
+ * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT.
+ */
+int netfs_start_io_direct(struct inode *inode)
+ __acquires(inode->i_rwsem)
+{
+ struct netfs_inode *ictx = netfs_inode(inode);
+ int ret;
+
+ /* Be an optimist! */
+ if (down_read_interruptible(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ if (test_bit(NETFS_ICTX_ODIRECT, &ictx->flags) != 0)
+ return 0;
+ up_read(&inode->i_rwsem);
+
+ /* Slow path.... */
+ if (down_write_killable(&inode->i_rwsem) < 0)
+ return -ERESTARTSYS;
+ ret = netfs_block_buffered(inode);
+ if (ret < 0) {
+ up_write(&inode->i_rwsem);
+ return ret;
+ }
+ downgrade_write(&inode->i_rwsem);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_start_io_direct);
+
+/**
+ * netfs_end_io_direct - declare that the direct i/o operation is done
+ * @inode: file inode
+ *
+ * Declare that a direct I/O operation is done, and release the shared
+ * lock on inode->i_rwsem.
+ */
+void netfs_end_io_direct(struct inode *inode)
+ __releases(inode->i_rwsem)
+{
+ up_read(&inode->i_rwsem);
+}
+EXPORT_SYMBOL(netfs_end_io_direct);
diff --git a/fs/netfs/main.c b/fs/netfs/main.c
index 068568702957..5e77618a7940 100644
--- a/fs/netfs/main.c
+++ b/fs/netfs/main.c
@@ -7,6 +7,8 @@
#include <linux/module.h>
#include <linux/export.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
#include <trace/events/netfs.h>
@@ -15,6 +17,113 @@ MODULE_DESCRIPTION("Network fs support");
MODULE_AUTHOR("Red Hat, Inc.");
MODULE_LICENSE("GPL");
+EXPORT_TRACEPOINT_SYMBOL(netfs_sreq);
+
unsigned netfs_debug;
module_param_named(debug, netfs_debug, uint, S_IWUSR | S_IRUGO);
MODULE_PARM_DESC(netfs_debug, "Netfs support debugging mask");
+
+#ifdef CONFIG_PROC_FS
+LIST_HEAD(netfs_io_requests);
+DEFINE_SPINLOCK(netfs_proc_lock);
+
+static const char *netfs_origins[nr__netfs_io_origin] = {
+ [NETFS_READAHEAD] = "RA",
+ [NETFS_READPAGE] = "RP",
+ [NETFS_READ_FOR_WRITE] = "RW",
+ [NETFS_WRITEBACK] = "WB",
+ [NETFS_WRITETHROUGH] = "WT",
+ [NETFS_LAUNDER_WRITE] = "LW",
+ [NETFS_UNBUFFERED_WRITE] = "UW",
+ [NETFS_DIO_READ] = "DR",
+ [NETFS_DIO_WRITE] = "DW",
+};
+
+/*
+ * Generate a list of I/O requests in /proc/fs/netfs/requests
+ */
+static int netfs_requests_seq_show(struct seq_file *m, void *v)
+{
+ struct netfs_io_request *rreq;
+
+ if (v == &netfs_io_requests) {
+ seq_puts(m,
+ "REQUEST OR REF FL ERR OPS COVERAGE\n"
+ "======== == === == ==== === =========\n"
+ );
+ return 0;
+ }
+
+ rreq = list_entry(v, struct netfs_io_request, proc_link);
+ seq_printf(m,
+ "%08x %s %3d %2lx %4d %3d @%04llx %zx/%zx",
+ rreq->debug_id,
+ netfs_origins[rreq->origin],
+ refcount_read(&rreq->ref),
+ rreq->flags,
+ rreq->error,
+ atomic_read(&rreq->nr_outstanding),
+ rreq->start, rreq->submitted, rreq->len);
+ seq_putc(m, '\n');
+ return 0;
+}
+
+static void *netfs_requests_seq_start(struct seq_file *m, loff_t *_pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+ return seq_list_start_head(&netfs_io_requests, *_pos);
+}
+
+static void *netfs_requests_seq_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+ return seq_list_next(v, &netfs_io_requests, _pos);
+}
+
+static void netfs_requests_seq_stop(struct seq_file *m, void *v)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+static const struct seq_operations netfs_requests_seq_ops = {
+ .start = netfs_requests_seq_start,
+ .next = netfs_requests_seq_next,
+ .stop = netfs_requests_seq_stop,
+ .show = netfs_requests_seq_show,
+};
+#endif /* CONFIG_PROC_FS */
+
+static int __init netfs_init(void)
+{
+ int ret = -ENOMEM;
+
+ if (!proc_mkdir("fs/netfs", NULL))
+ goto error;
+ if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL,
+ &netfs_requests_seq_ops))
+ goto error_proc;
+#ifdef CONFIG_FSCACHE_STATS
+ if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL,
+ netfs_stats_show))
+ goto error_proc;
+#endif
+
+ ret = fscache_init();
+ if (ret < 0)
+ goto error_proc;
+ return 0;
+
+error_proc:
+ remove_proc_entry("fs/netfs", NULL);
+error:
+ return ret;
+}
+fs_initcall(netfs_init);
+
+static void __exit netfs_exit(void)
+{
+ fscache_exit();
+ remove_proc_entry("fs/netfs", NULL);
+}
+module_exit(netfs_exit);
diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c
new file mode 100644
index 000000000000..90051ced8e2a
--- /dev/null
+++ b/fs/netfs/misc.c
@@ -0,0 +1,260 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Miscellaneous routines.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/swap.h>
+#include "internal.h"
+
+/*
+ * Attach a folio to the buffer and maybe set marks on it to say that we need
+ * to put the folio later and twiddle the pagecache flags.
+ */
+int netfs_xa_store_and_mark(struct xarray *xa, unsigned long index,
+ struct folio *folio, unsigned int flags,
+ gfp_t gfp_mask)
+{
+ XA_STATE_ORDER(xas, xa, index, folio_order(folio));
+
+retry:
+ xas_lock(&xas);
+ for (;;) {
+ xas_store(&xas, folio);
+ if (!xas_error(&xas))
+ break;
+ xas_unlock(&xas);
+ if (!xas_nomem(&xas, gfp_mask))
+ return xas_error(&xas);
+ goto retry;
+ }
+
+ if (flags & NETFS_FLAG_PUT_MARK)
+ xas_set_mark(&xas, NETFS_BUF_PUT_MARK);
+ if (flags & NETFS_FLAG_PAGECACHE_MARK)
+ xas_set_mark(&xas, NETFS_BUF_PAGECACHE_MARK);
+ xas_unlock(&xas);
+ return xas_error(&xas);
+}
+
+/*
+ * Create the specified range of folios in the buffer attached to the read
+ * request. The folios are marked with NETFS_BUF_PUT_MARK so that we know that
+ * these need freeing later.
+ */
+int netfs_add_folios_to_buffer(struct xarray *buffer,
+ struct address_space *mapping,
+ pgoff_t index, pgoff_t to, gfp_t gfp_mask)
+{
+ struct folio *folio;
+ int ret;
+
+ if (to + 1 == index) /* Page range is inclusive */
+ return 0;
+
+ do {
+ /* TODO: Figure out what order folio can be allocated here */
+ folio = filemap_alloc_folio(readahead_gfp_mask(mapping), 0);
+ if (!folio)
+ return -ENOMEM;
+ folio->index = index;
+ ret = netfs_xa_store_and_mark(buffer, index, folio,
+ NETFS_FLAG_PUT_MARK, gfp_mask);
+ if (ret < 0) {
+ folio_put(folio);
+ return ret;
+ }
+
+ index += folio_nr_pages(folio);
+ } while (index <= to && index != 0);
+
+ return 0;
+}
+
+/*
+ * Clear an xarray buffer, putting a ref on the folios that have
+ * NETFS_BUF_PUT_MARK set.
+ */
+void netfs_clear_buffer(struct xarray *buffer)
+{
+ struct folio *folio;
+ XA_STATE(xas, buffer, 0);
+
+ rcu_read_lock();
+ xas_for_each_marked(&xas, folio, ULONG_MAX, NETFS_BUF_PUT_MARK) {
+ folio_put(folio);
+ }
+ rcu_read_unlock();
+ xa_destroy(buffer);
+}
+
+/**
+ * netfs_dirty_folio - Mark folio dirty and pin a cache object for writeback
+ * @mapping: The mapping the folio belongs to.
+ * @folio: The folio being dirtied.
+ *
+ * Set the dirty flag on a folio and pin an in-use cache object in memory so
+ * that writeback can later write to it. This is intended to be called from
+ * the filesystem's ->dirty_folio() method.
+ *
+ * Return: true if the dirty flag was set on the folio, false otherwise.
+ */
+bool netfs_dirty_folio(struct address_space *mapping, struct folio *folio)
+{
+ struct inode *inode = mapping->host;
+ struct netfs_inode *ictx = netfs_inode(inode);
+ struct fscache_cookie *cookie = netfs_i_cookie(ictx);
+ bool need_use = false;
+
+ _enter("");
+
+ if (!filemap_dirty_folio(mapping, folio))
+ return false;
+ if (!fscache_cookie_valid(cookie))
+ return true;
+
+ if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+ spin_lock(&inode->i_lock);
+ if (!(inode->i_state & I_PINNING_NETFS_WB)) {
+ inode->i_state |= I_PINNING_NETFS_WB;
+ need_use = true;
+ }
+ spin_unlock(&inode->i_lock);
+
+ if (need_use)
+ fscache_use_cookie(cookie, true);
+ }
+ return true;
+}
+EXPORT_SYMBOL(netfs_dirty_folio);
+
+/**
+ * netfs_unpin_writeback - Unpin writeback resources
+ * @inode: The inode on which the cookie resides
+ * @wbc: The writeback control
+ *
+ * Unpin the writeback resources pinned by netfs_dirty_folio(). This is
+ * intended to be called as/by the netfs's ->write_inode() method.
+ */
+int netfs_unpin_writeback(struct inode *inode, struct writeback_control *wbc)
+{
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+
+ if (wbc->unpinned_netfs_wb)
+ fscache_unuse_cookie(cookie, NULL, NULL);
+ return 0;
+}
+EXPORT_SYMBOL(netfs_unpin_writeback);
+
+/**
+ * netfs_clear_inode_writeback - Clear writeback resources pinned by an inode
+ * @inode: The inode to clean up
+ * @aux: Auxiliary data to apply to the inode
+ *
+ * Clear any writeback resources held by an inode when the inode is evicted.
+ * This must be called before clear_inode() is called.
+ */
+void netfs_clear_inode_writeback(struct inode *inode, const void *aux)
+{
+ struct fscache_cookie *cookie = netfs_i_cookie(netfs_inode(inode));
+
+ if (inode->i_state & I_PINNING_NETFS_WB) {
+ loff_t i_size = i_size_read(inode);
+ fscache_unuse_cookie(cookie, aux, &i_size);
+ }
+}
+EXPORT_SYMBOL(netfs_clear_inode_writeback);
+
+/**
+ * netfs_invalidate_folio - Invalidate or partially invalidate a folio
+ * @folio: Folio proposed for release
+ * @offset: Offset of the invalidated region
+ * @length: Length of the invalidated region
+ *
+ * Invalidate part or all of a folio for a network filesystem. The folio will
+ * be removed afterwards if the invalidated region covers the entire folio.
+ */
+void netfs_invalidate_folio(struct folio *folio, size_t offset, size_t length)
+{
+ struct netfs_folio *finfo = NULL;
+ size_t flen = folio_size(folio);
+
+ _enter("{%lx},%zx,%zx", folio->index, offset, length);
+
+ folio_wait_fscache(folio);
+
+ if (!folio_test_private(folio))
+ return;
+
+ finfo = netfs_folio_info(folio);
+
+ if (offset == 0 && length >= flen)
+ goto erase_completely;
+
+ if (finfo) {
+ /* We have a partially uptodate page from a streaming write. */
+ unsigned int fstart = finfo->dirty_offset;
+ unsigned int fend = fstart + finfo->dirty_len;
+ unsigned int end = offset + length;
+
+ if (offset >= fend)
+ return;
+ if (end <= fstart)
+ return;
+ if (offset <= fstart && end >= fend)
+ goto erase_completely;
+ if (offset <= fstart && end > fstart)
+ goto reduce_len;
+ if (offset > fstart && end >= fend)
+ goto move_start;
+ /* A partial write was split. The caller has already zeroed
+ * it, so just absorb the hole.
+ */
+ }
+ return;
+
+erase_completely:
+ netfs_put_group(netfs_folio_group(folio));
+ folio_detach_private(folio);
+ folio_clear_uptodate(folio);
+ kfree(finfo);
+ return;
+reduce_len:
+ finfo->dirty_len = offset + length - finfo->dirty_offset;
+ return;
+move_start:
+ finfo->dirty_len -= offset - finfo->dirty_offset;
+ finfo->dirty_offset = offset;
+}
+EXPORT_SYMBOL(netfs_invalidate_folio);
+
+/**
+ * netfs_release_folio - Try to release a folio
+ * @folio: Folio proposed for release
+ * @gfp: Flags qualifying the release
+ *
+ * Request release of a folio and clean up its private state if it's not busy.
+ * Returns true if the folio can now be released, false if not
+ */
+bool netfs_release_folio(struct folio *folio, gfp_t gfp)
+{
+ struct netfs_inode *ctx = netfs_inode(folio_inode(folio));
+ unsigned long long end;
+
+ end = folio_pos(folio) + folio_size(folio);
+ if (end > ctx->zero_point)
+ ctx->zero_point = end;
+
+ if (folio_test_private(folio))
+ return false;
+ if (folio_test_fscache(folio)) {
+ if (current_is_kswapd() || !(gfp & __GFP_FS))
+ return false;
+ folio_wait_fscache(folio);
+ }
+
+ fscache_note_page_release(netfs_i_cookie(ctx));
+ return true;
+}
+EXPORT_SYMBOL(netfs_release_folio);
diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c
index e17cdf53f6a7..610ceb5bd86c 100644
--- a/fs/netfs/objects.c
+++ b/fs/netfs/objects.c
@@ -20,14 +20,20 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
struct inode *inode = file ? file_inode(file) : mapping->host;
struct netfs_inode *ctx = netfs_inode(inode);
struct netfs_io_request *rreq;
+ bool is_unbuffered = (origin == NETFS_UNBUFFERED_WRITE ||
+ origin == NETFS_DIO_READ ||
+ origin == NETFS_DIO_WRITE);
+ bool cached = !is_unbuffered && netfs_is_cache_enabled(ctx);
int ret;
- rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
+ rreq = kzalloc(ctx->ops->io_request_size ?: sizeof(struct netfs_io_request),
+ GFP_KERNEL);
if (!rreq)
return ERR_PTR(-ENOMEM);
rreq->start = start;
rreq->len = len;
+ rreq->upper_len = len;
rreq->origin = origin;
rreq->netfs_ops = ctx->ops;
rreq->mapping = mapping;
@@ -35,8 +41,14 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
rreq->i_size = i_size_read(inode);
rreq->debug_id = atomic_inc_return(&debug_ids);
INIT_LIST_HEAD(&rreq->subrequests);
+ INIT_WORK(&rreq->work, NULL);
refcount_set(&rreq->ref, 1);
+
__set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags);
+ if (cached)
+ __set_bit(NETFS_RREQ_WRITE_TO_CACHE, &rreq->flags);
+ if (file && file->f_flags & O_NONBLOCK)
+ __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags);
if (rreq->netfs_ops->init_request) {
ret = rreq->netfs_ops->init_request(rreq, file);
if (ret < 0) {
@@ -45,6 +57,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping,
}
}
+ trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new);
+ netfs_proc_add_rreq(rreq);
netfs_stat(&netfs_n_rh_rreq);
return rreq;
}
@@ -74,33 +88,47 @@ static void netfs_free_request(struct work_struct *work)
{
struct netfs_io_request *rreq =
container_of(work, struct netfs_io_request, work);
+ unsigned int i;
trace_netfs_rreq(rreq, netfs_rreq_trace_free);
+ netfs_proc_del_rreq(rreq);
netfs_clear_subrequests(rreq, false);
if (rreq->netfs_ops->free_request)
rreq->netfs_ops->free_request(rreq);
if (rreq->cache_resources.ops)
rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
- kfree(rreq);
+ if (rreq->direct_bv) {
+ for (i = 0; i < rreq->direct_bv_count; i++) {
+ if (rreq->direct_bv[i].bv_page) {
+ if (rreq->direct_bv_unpin)
+ unpin_user_page(rreq->direct_bv[i].bv_page);
+ }
+ }
+ kvfree(rreq->direct_bv);
+ }
+ kfree_rcu(rreq, rcu);
netfs_stat_d(&netfs_n_rh_rreq);
}
void netfs_put_request(struct netfs_io_request *rreq, bool was_async,
enum netfs_rreq_ref_trace what)
{
- unsigned int debug_id = rreq->debug_id;
+ unsigned int debug_id;
bool dead;
int r;
- dead = __refcount_dec_and_test(&rreq->ref, &r);
- trace_netfs_rreq_ref(debug_id, r - 1, what);
- if (dead) {
- if (was_async) {
- rreq->work.func = netfs_free_request;
- if (!queue_work(system_unbound_wq, &rreq->work))
- BUG();
- } else {
- netfs_free_request(&rreq->work);
+ if (rreq) {
+ debug_id = rreq->debug_id;
+ dead = __refcount_dec_and_test(&rreq->ref, &r);
+ trace_netfs_rreq_ref(debug_id, r - 1, what);
+ if (dead) {
+ if (was_async) {
+ rreq->work.func = netfs_free_request;
+ if (!queue_work(system_unbound_wq, &rreq->work))
+ BUG();
+ } else {
+ netfs_free_request(&rreq->work);
+ }
}
}
}
@@ -112,8 +140,11 @@ struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq
{
struct netfs_io_subrequest *subreq;
- subreq = kzalloc(sizeof(struct netfs_io_subrequest), GFP_KERNEL);
+ subreq = kzalloc(rreq->netfs_ops->io_subrequest_size ?:
+ sizeof(struct netfs_io_subrequest),
+ GFP_KERNEL);
if (subreq) {
+ INIT_WORK(&subreq->work, NULL);
INIT_LIST_HEAD(&subreq->rreq_link);
refcount_set(&subreq->ref, 2);
subreq->rreq = rreq;
@@ -140,6 +171,8 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq,
struct netfs_io_request *rreq = subreq->rreq;
trace_netfs_sreq(subreq, netfs_sreq_trace_free);
+ if (rreq->netfs_ops->free_subrequest)
+ rreq->netfs_ops->free_subrequest(subreq);
kfree(subreq);
netfs_stat_d(&netfs_n_rh_sreq);
netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq);
diff --git a/fs/netfs/output.c b/fs/netfs/output.c
new file mode 100644
index 000000000000..625eb68f3e5a
--- /dev/null
+++ b/fs/netfs/output.c
@@ -0,0 +1,478 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Network filesystem high-level write support.
+ *
+ * Copyright (C) 2023 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/writeback.h>
+#include <linux/pagevec.h>
+#include "internal.h"
+
+/**
+ * netfs_create_write_request - Create a write operation.
+ * @wreq: The write request this is storing from.
+ * @dest: The destination type
+ * @start: Start of the region this write will modify
+ * @len: Length of the modification
+ * @worker: The worker function to handle the write(s)
+ *
+ * Allocate a write operation, set it up and add it to the list on a write
+ * request.
+ */
+struct netfs_io_subrequest *netfs_create_write_request(struct netfs_io_request *wreq,
+ enum netfs_io_source dest,
+ loff_t start, size_t len,
+ work_func_t worker)
+{
+ struct netfs_io_subrequest *subreq;
+
+ subreq = netfs_alloc_subrequest(wreq);
+ if (subreq) {
+ INIT_WORK(&subreq->work, worker);
+ subreq->source = dest;
+ subreq->start = start;
+ subreq->len = len;
+ subreq->debug_index = wreq->subreq_counter++;
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write);
+ break;
+ default:
+ BUG();
+ }
+
+ subreq->io_iter = wreq->io_iter;
+ iov_iter_advance(&subreq->io_iter, subreq->start - wreq->start);
+ iov_iter_truncate(&subreq->io_iter, subreq->len);
+
+ trace_netfs_sreq_ref(wreq->debug_id, subreq->debug_index,
+ refcount_read(&subreq->ref),
+ netfs_sreq_trace_new);
+ atomic_inc(&wreq->nr_outstanding);
+ list_add_tail(&subreq->rreq_link, &wreq->subrequests);
+ trace_netfs_sreq(subreq, netfs_sreq_trace_prepare);
+ }
+
+ return subreq;
+}
+EXPORT_SYMBOL(netfs_create_write_request);
+
+/*
+ * Process a completed write request once all the component operations have
+ * been completed.
+ */
+static void netfs_write_terminated(struct netfs_io_request *wreq, bool was_async)
+{
+ struct netfs_io_subrequest *subreq;
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+ size_t transferred = 0;
+
+ _enter("R=%x[]", wreq->debug_id);
+
+ trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+ list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
+ if (subreq->error || subreq->transferred == 0)
+ break;
+ transferred += subreq->transferred;
+ if (subreq->transferred < subreq->len)
+ break;
+ }
+ wreq->transferred = transferred;
+
+ list_for_each_entry(subreq, &wreq->subrequests, rreq_link) {
+ if (!subreq->error)
+ continue;
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ /* Depending on the type of failure, this may prevent
+ * writeback completion unless we're in disconnected
+ * mode.
+ */
+ if (!wreq->error)
+ wreq->error = subreq->error;
+ break;
+
+ case NETFS_WRITE_TO_CACHE:
+ /* Failure doesn't prevent writeback completion unless
+ * we're in disconnected mode.
+ */
+ if (subreq->error != -ENOBUFS)
+ ctx->ops->invalidate_cache(wreq);
+ break;
+
+ default:
+ WARN_ON_ONCE(1);
+ if (!wreq->error)
+ wreq->error = -EIO;
+ return;
+ }
+ }
+
+ wreq->cleanup(wreq);
+
+ if (wreq->origin == NETFS_DIO_WRITE &&
+ wreq->mapping->nrpages) {
+ pgoff_t first = wreq->start >> PAGE_SHIFT;
+ pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+ invalidate_inode_pages2_range(wreq->mapping, first, last);
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_end(wreq->inode);
+
+ _debug("finished");
+ trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip);
+ clear_bit_unlock(NETFS_RREQ_IN_PROGRESS, &wreq->flags);
+ wake_up_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS);
+
+ if (wreq->iocb) {
+ wreq->iocb->ki_pos += transferred;
+ if (wreq->iocb->ki_complete)
+ wreq->iocb->ki_complete(
+ wreq->iocb, wreq->error ? wreq->error : transferred);
+ }
+
+ netfs_clear_subrequests(wreq, was_async);
+ netfs_put_request(wreq, was_async, netfs_rreq_trace_put_complete);
+}
+
+/*
+ * Deal with the completion of writing the data to the cache.
+ */
+void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error,
+ bool was_async)
+{
+ struct netfs_io_subrequest *subreq = _op;
+ struct netfs_io_request *wreq = subreq->rreq;
+ unsigned int u;
+
+ _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error);
+
+ switch (subreq->source) {
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_done);
+ break;
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_done);
+ break;
+ case NETFS_INVALID_WRITE:
+ break;
+ default:
+ BUG();
+ }
+
+ if (IS_ERR_VALUE(transferred_or_error)) {
+ subreq->error = transferred_or_error;
+ trace_netfs_failure(wreq, subreq, transferred_or_error,
+ netfs_fail_write);
+ goto failed;
+ }
+
+ if (WARN(transferred_or_error > subreq->len - subreq->transferred,
+ "Subreq excess write: R%x[%x] %zd > %zu - %zu",
+ wreq->debug_id, subreq->debug_index,
+ transferred_or_error, subreq->len, subreq->transferred))
+ transferred_or_error = subreq->len - subreq->transferred;
+
+ subreq->error = 0;
+ subreq->transferred += transferred_or_error;
+
+ if (iov_iter_count(&subreq->io_iter) != subreq->len - subreq->transferred)
+ pr_warn("R=%08x[%u] ITER POST-MISMATCH %zx != %zx-%zx %x\n",
+ wreq->debug_id, subreq->debug_index,
+ iov_iter_count(&subreq->io_iter), subreq->len,
+ subreq->transferred, subreq->io_iter.iter_type);
+
+ if (subreq->transferred < subreq->len)
+ goto incomplete;
+
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+out:
+ trace_netfs_sreq(subreq, netfs_sreq_trace_terminated);
+
+ /* If we decrement nr_outstanding to 0, the ref belongs to us. */
+ u = atomic_dec_return(&wreq->nr_outstanding);
+ if (u == 0)
+ netfs_write_terminated(wreq, was_async);
+ else if (u == 1)
+ wake_up_var(&wreq->nr_outstanding);
+
+ netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated);
+ return;
+
+incomplete:
+ if (transferred_or_error == 0) {
+ if (__test_and_set_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags)) {
+ subreq->error = -ENODATA;
+ goto failed;
+ }
+ } else {
+ __clear_bit(NETFS_SREQ_NO_PROGRESS, &subreq->flags);
+ }
+
+ __set_bit(NETFS_SREQ_SHORT_IO, &subreq->flags);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
+ goto out;
+
+failed:
+ switch (subreq->source) {
+ case NETFS_WRITE_TO_CACHE:
+ netfs_stat(&netfs_n_wh_write_failed);
+ set_bit(NETFS_RREQ_INCOMPLETE_IO, &wreq->flags);
+ break;
+ case NETFS_UPLOAD_TO_SERVER:
+ netfs_stat(&netfs_n_wh_upload_failed);
+ set_bit(NETFS_RREQ_FAILED, &wreq->flags);
+ wreq->error = subreq->error;
+ break;
+ default:
+ break;
+ }
+ goto out;
+}
+EXPORT_SYMBOL(netfs_write_subrequest_terminated);
+
+static void netfs_write_to_cache_op(struct netfs_io_subrequest *subreq)
+{
+ struct netfs_io_request *wreq = subreq->rreq;
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+
+ trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+
+ cres->ops->write(cres, subreq->start, &subreq->io_iter,
+ netfs_write_subrequest_terminated, subreq);
+}
+
+static void netfs_write_to_cache_op_worker(struct work_struct *work)
+{
+ struct netfs_io_subrequest *subreq =
+ container_of(work, struct netfs_io_subrequest, work);
+
+ netfs_write_to_cache_op(subreq);
+}
+
+/**
+ * netfs_queue_write_request - Queue a write request for attention
+ * @subreq: The write request to be queued
+ *
+ * Queue the specified write request for processing by a worker thread. We
+ * pass the caller's ref on the request to the worker thread.
+ */
+void netfs_queue_write_request(struct netfs_io_subrequest *subreq)
+{
+ if (!queue_work(system_unbound_wq, &subreq->work))
+ netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_wip);
+}
+EXPORT_SYMBOL(netfs_queue_write_request);
+
+/*
+ * Set up a op for writing to the cache.
+ */
+static void netfs_set_up_write_to_cache(struct netfs_io_request *wreq)
+{
+ struct netfs_cache_resources *cres = &wreq->cache_resources;
+ struct netfs_io_subrequest *subreq;
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+ struct fscache_cookie *cookie = netfs_i_cookie(ctx);
+ loff_t start = wreq->start;
+ size_t len = wreq->len;
+ int ret;
+
+ if (!fscache_cookie_enabled(cookie)) {
+ clear_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags);
+ return;
+ }
+
+ _debug("write to cache");
+ ret = fscache_begin_write_operation(cres, cookie);
+ if (ret < 0)
+ return;
+
+ ret = cres->ops->prepare_write(cres, &start, &len, wreq->upper_len,
+ i_size_read(wreq->inode), true);
+ if (ret < 0)
+ return;
+
+ subreq = netfs_create_write_request(wreq, NETFS_WRITE_TO_CACHE, start, len,
+ netfs_write_to_cache_op_worker);
+ if (!subreq)
+ return;
+
+ netfs_write_to_cache_op(subreq);
+}
+
+/*
+ * Begin the process of writing out a chunk of data.
+ *
+ * We are given a write request that holds a series of dirty regions and
+ * (partially) covers a sequence of folios, all of which are present. The
+ * pages must have been marked as writeback as appropriate.
+ *
+ * We need to perform the following steps:
+ *
+ * (1) If encrypting, create an output buffer and encrypt each block of the
+ * data into it, otherwise the output buffer will point to the original
+ * folios.
+ *
+ * (2) If the data is to be cached, set up a write op for the entire output
+ * buffer to the cache, if the cache wants to accept it.
+ *
+ * (3) If the data is to be uploaded (ie. not merely cached):
+ *
+ * (a) If the data is to be compressed, create a compression buffer and
+ * compress the data into it.
+ *
+ * (b) For each destination we want to upload to, set up write ops to write
+ * to that destination. We may need multiple writes if the data is not
+ * contiguous or the span exceeds wsize for a server.
+ */
+int netfs_begin_write(struct netfs_io_request *wreq, bool may_wait,
+ enum netfs_write_trace what)
+{
+ struct netfs_inode *ctx = netfs_inode(wreq->inode);
+
+ _enter("R=%x %llx-%llx f=%lx",
+ wreq->debug_id, wreq->start, wreq->start + wreq->len - 1,
+ wreq->flags);
+
+ trace_netfs_write(wreq, what);
+ if (wreq->len == 0 || wreq->iter.count == 0) {
+ pr_err("Zero-sized write [R=%x]\n", wreq->debug_id);
+ return -EIO;
+ }
+
+ if (wreq->origin == NETFS_DIO_WRITE)
+ inode_dio_begin(wreq->inode);
+
+ wreq->io_iter = wreq->iter;
+
+ /* ->outstanding > 0 carries a ref */
+ netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
+ atomic_set(&wreq->nr_outstanding, 1);
+
+ /* Start the encryption/compression going. We can do that in the
+ * background whilst we generate a list of write ops that we want to
+ * perform.
+ */
+ // TODO: Encrypt or compress the region as appropriate
+
+ /* We need to write all of the region to the cache */
+ if (test_bit(NETFS_RREQ_WRITE_TO_CACHE, &wreq->flags))
+ netfs_set_up_write_to_cache(wreq);
+
+ /* However, we don't necessarily write all of the region to the server.
+ * Caching of reads is being managed this way also.
+ */
+ if (test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
+ ctx->ops->create_write_requests(wreq, wreq->start, wreq->len);
+
+ if (atomic_dec_and_test(&wreq->nr_outstanding))
+ netfs_write_terminated(wreq, false);
+
+ if (!may_wait)
+ return -EIOCBQUEUED;
+
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ return wreq->error;
+}
+
+/*
+ * Begin a write operation for writing through the pagecache.
+ */
+struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len)
+{
+ struct netfs_io_request *wreq;
+ struct file *file = iocb->ki_filp;
+
+ wreq = netfs_alloc_request(file->f_mapping, file, iocb->ki_pos, len,
+ NETFS_WRITETHROUGH);
+ if (IS_ERR(wreq))
+ return wreq;
+
+ trace_netfs_write(wreq, netfs_write_trace_writethrough);
+
+ __set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
+ iov_iter_xarray(&wreq->iter, ITER_SOURCE, &wreq->mapping->i_pages, wreq->start, 0);
+ wreq->io_iter = wreq->iter;
+
+ /* ->outstanding > 0 carries a ref */
+ netfs_get_request(wreq, netfs_rreq_trace_get_for_outstanding);
+ atomic_set(&wreq->nr_outstanding, 1);
+ return wreq;
+}
+
+static void netfs_submit_writethrough(struct netfs_io_request *wreq, bool final)
+{
+ struct netfs_inode *ictx = netfs_inode(wreq->inode);
+ unsigned long long start;
+ size_t len;
+
+ if (!test_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags))
+ return;
+
+ start = wreq->start + wreq->submitted;
+ len = wreq->iter.count - wreq->submitted;
+ if (!final) {
+ len /= wreq->wsize; /* Round to number of maximum packets */
+ len *= wreq->wsize;
+ }
+
+ ictx->ops->create_write_requests(wreq, start, len);
+ wreq->submitted += len;
+}
+
+/*
+ * Advance the state of the write operation used when writing through the
+ * pagecache. Data has been copied into the pagecache that we need to append
+ * to the request. If we've added more than wsize then we need to create a new
+ * subrequest.
+ */
+int netfs_advance_writethrough(struct netfs_io_request *wreq, size_t copied, bool to_page_end)
+{
+ _enter("ic=%zu sb=%zu ws=%u cp=%zu tp=%u",
+ wreq->iter.count, wreq->submitted, wreq->wsize, copied, to_page_end);
+
+ wreq->iter.count += copied;
+ wreq->io_iter.count += copied;
+ if (to_page_end && wreq->io_iter.count - wreq->submitted >= wreq->wsize)
+ netfs_submit_writethrough(wreq, false);
+
+ return wreq->error;
+}
+
+/*
+ * End a write operation used when writing through the pagecache.
+ */
+int netfs_end_writethrough(struct netfs_io_request *wreq, struct kiocb *iocb)
+{
+ int ret = -EIOCBQUEUED;
+
+ _enter("ic=%zu sb=%zu ws=%u",
+ wreq->iter.count, wreq->submitted, wreq->wsize);
+
+ if (wreq->submitted < wreq->io_iter.count)
+ netfs_submit_writethrough(wreq, true);
+
+ if (atomic_dec_and_test(&wreq->nr_outstanding))
+ netfs_write_terminated(wreq, false);
+
+ if (is_sync_kiocb(iocb)) {
+ wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS,
+ TASK_UNINTERRUPTIBLE);
+ ret = wreq->error;
+ }
+
+ netfs_put_request(wreq, false, netfs_rreq_trace_put_return);
+ return ret;
+}
diff --git a/fs/netfs/stats.c b/fs/netfs/stats.c
index 5510a7a14a40..deeba9f9dcf5 100644
--- a/fs/netfs/stats.c
+++ b/fs/netfs/stats.c
@@ -9,6 +9,8 @@
#include <linux/seq_file.h>
#include "internal.h"
+atomic_t netfs_n_rh_dio_read;
+atomic_t netfs_n_rh_dio_write;
atomic_t netfs_n_rh_readahead;
atomic_t netfs_n_rh_readpage;
atomic_t netfs_n_rh_rreq;
@@ -27,32 +29,48 @@ atomic_t netfs_n_rh_write_begin;
atomic_t netfs_n_rh_write_done;
atomic_t netfs_n_rh_write_failed;
atomic_t netfs_n_rh_write_zskip;
+atomic_t netfs_n_wh_wstream_conflict;
+atomic_t netfs_n_wh_upload;
+atomic_t netfs_n_wh_upload_done;
+atomic_t netfs_n_wh_upload_failed;
+atomic_t netfs_n_wh_write;
+atomic_t netfs_n_wh_write_done;
+atomic_t netfs_n_wh_write_failed;
-void netfs_stats_show(struct seq_file *m)
+int netfs_stats_show(struct seq_file *m, void *v)
{
- seq_printf(m, "RdHelp : RA=%u RP=%u WB=%u WBZ=%u rr=%u sr=%u\n",
+ seq_printf(m, "Netfs : DR=%u DW=%u RA=%u RP=%u WB=%u WBZ=%u\n",
+ atomic_read(&netfs_n_rh_dio_read),
+ atomic_read(&netfs_n_rh_dio_write),
atomic_read(&netfs_n_rh_readahead),
atomic_read(&netfs_n_rh_readpage),
atomic_read(&netfs_n_rh_write_begin),
- atomic_read(&netfs_n_rh_write_zskip),
- atomic_read(&netfs_n_rh_rreq),
- atomic_read(&netfs_n_rh_sreq));
- seq_printf(m, "RdHelp : ZR=%u sh=%u sk=%u\n",
+ atomic_read(&netfs_n_rh_write_zskip));
+ seq_printf(m, "Netfs : ZR=%u sh=%u sk=%u\n",
atomic_read(&netfs_n_rh_zero),
atomic_read(&netfs_n_rh_short_read),
atomic_read(&netfs_n_rh_write_zskip));
- seq_printf(m, "RdHelp : DL=%u ds=%u df=%u di=%u\n",
+ seq_printf(m, "Netfs : DL=%u ds=%u df=%u di=%u\n",
atomic_read(&netfs_n_rh_download),
atomic_read(&netfs_n_rh_download_done),
atomic_read(&netfs_n_rh_download_failed),
atomic_read(&netfs_n_rh_download_instead));
- seq_printf(m, "RdHelp : RD=%u rs=%u rf=%u\n",
+ seq_printf(m, "Netfs : RD=%u rs=%u rf=%u\n",
atomic_read(&netfs_n_rh_read),
atomic_read(&netfs_n_rh_read_done),
atomic_read(&netfs_n_rh_read_failed));
- seq_printf(m, "RdHelp : WR=%u ws=%u wf=%u\n",
- atomic_read(&netfs_n_rh_write),
- atomic_read(&netfs_n_rh_write_done),
- atomic_read(&netfs_n_rh_write_failed));
+ seq_printf(m, "Netfs : UL=%u us=%u uf=%u\n",
+ atomic_read(&netfs_n_wh_upload),
+ atomic_read(&netfs_n_wh_upload_done),
+ atomic_read(&netfs_n_wh_upload_failed));
+ seq_printf(m, "Netfs : WR=%u ws=%u wf=%u\n",
+ atomic_read(&netfs_n_wh_write),
+ atomic_read(&netfs_n_wh_write_done),
+ atomic_read(&netfs_n_wh_write_failed));
+ seq_printf(m, "Netfs : rr=%u sr=%u wsc=%u\n",
+ atomic_read(&netfs_n_rh_rreq),
+ atomic_read(&netfs_n_rh_sreq),
+ atomic_read(&netfs_n_wh_wstream_conflict));
+ return fscache_stats_show(m);
}
EXPORT_SYMBOL(netfs_stats_show);
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 01ac733a6320..f7e32d76e34d 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -169,8 +169,8 @@ config ROOT_NFS
config NFS_FSCACHE
bool "Provide NFS client caching support"
- depends on NFS_FS=m && FSCACHE || NFS_FS=y && FSCACHE=y
- select NETFS_SUPPORT
+ depends on NFS_FS=m && NETFS_SUPPORT || NFS_FS=y && NETFS_SUPPORT=y
+ select FSCACHE
help
Say Y here if you want NFS data to be cached locally on disc through
the general filesystem cache manager
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 943aeea1eb16..6be13e0ec170 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -580,6 +580,8 @@ retry:
nfs4_delete_deviceid(node->ld, node->nfs_client, id);
goto retry;
}
+
+ nfs4_put_deviceid_node(node);
return ERR_PTR(-ENODEV);
}
@@ -893,10 +895,9 @@ bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
}
if (pgio->pg_dreq == NULL)
- wb_size = pnfs_num_cont_bytes(pgio->pg_inode,
- req->wb_index);
+ wb_size = pnfs_num_cont_bytes(pgio->pg_inode, req->wb_index);
else
- wb_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+ wb_size = nfs_dreq_bytes_left(pgio->pg_dreq, req_offset(req));
pnfs_generic_pg_init_write(pgio, req, wb_size);
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index f318a05a80e1..c97ebc42ec0f 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -351,6 +351,9 @@ bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
d->map = bl_map_simple;
d->pr_key = v->scsi.pr_key;
+ if (d->len == 0)
+ return -ENODEV;
+
pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
d->bdev_handle->bdev->bd_disk->disk_name, d->pr_key);
diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
index 6c977288cc28..d8d50a88de04 100644
--- a/fs/nfs/blocklayout/rpc_pipefs.c
+++ b/fs/nfs/blocklayout/rpc_pipefs.c
@@ -75,7 +75,7 @@ bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
msg->len = sizeof(*bl_msg) + b->simple.len;
msg->data = kzalloc(msg->len, gfp_mask);
if (!msg->data)
- goto out_free_data;
+ goto out_unlock;
bl_msg = msg->data;
bl_msg->type = BL_DEVICE_MOUNT;
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 4ffa1f469e90..760d27dd7225 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -187,7 +187,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
* Check whether we're already up and running.
*/
if (cb_info->serv)
- return svc_get(cb_info->serv);
+ return cb_info->serv;
/*
* Sanity check: if there's no task,
@@ -245,9 +245,10 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
cb_info->users++;
err_net:
- if (!cb_info->users)
- cb_info->serv = NULL;
- svc_put(serv);
+ if (!cb_info->users) {
+ svc_set_num_threads(cb_info->serv, NULL, 0);
+ svc_destroy(&cb_info->serv);
+ }
err_create:
mutex_unlock(&nfs_callback_mutex);
return ret;
@@ -271,11 +272,9 @@ void nfs_callback_down(int minorversion, struct net *net)
nfs_callback_down_net(minorversion, serv, net);
cb_info->users--;
if (cb_info->users == 0) {
- svc_get(serv);
svc_set_num_threads(serv, NULL, 0);
- svc_put(serv);
dprintk("nfs_callback_down: service destroyed\n");
- cb_info->serv = NULL;
+ svc_destroy(&cb_info->serv);
}
mutex_unlock(&nfs_callback_mutex);
}
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index ccd4f245cae2..650758ee0d5f 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -19,32 +19,14 @@ enum nfs4_callback_procnum {
CB_COMPOUND = 1,
};
-enum nfs4_callback_opnum {
- OP_CB_GETATTR = 3,
- OP_CB_RECALL = 4,
-/* Callback operations new to NFSv4.1 */
- OP_CB_LAYOUTRECALL = 5,
- OP_CB_NOTIFY = 6,
- OP_CB_PUSH_DELEG = 7,
- OP_CB_RECALL_ANY = 8,
- OP_CB_RECALLABLE_OBJ_AVAIL = 9,
- OP_CB_RECALL_SLOT = 10,
- OP_CB_SEQUENCE = 11,
- OP_CB_WANTS_CANCELLED = 12,
- OP_CB_NOTIFY_LOCK = 13,
- OP_CB_NOTIFY_DEVICEID = 14,
-/* Callback operations new to NFSv4.2 */
- OP_CB_OFFLOAD = 15,
- OP_CB_ILLEGAL = 10044,
-};
-
struct nfs4_slot;
struct cb_process_state {
- __be32 drc_status;
struct nfs_client *clp;
struct nfs4_slot *slot;
- u32 minorversion;
struct net *net;
+ u32 minorversion;
+ __be32 drc_status;
+ unsigned int referring_calls;
};
struct cb_compound_hdr_arg {
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 96a4923080ae..76cea34477ae 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -207,7 +207,8 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp,
* Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
*/
static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
- const nfs4_stateid *new)
+ const nfs4_stateid *new,
+ struct cb_process_state *cps)
{
u32 oldseq, newseq;
@@ -221,28 +222,29 @@ static u32 pnfs_check_callback_stateid(struct pnfs_layout_hdr *lo,
newseq = be32_to_cpu(new->seqid);
/* Are we already in a layout recall situation? */
- if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) &&
- lo->plh_return_seq != 0) {
- if (newseq < lo->plh_return_seq)
- return NFS4ERR_OLD_STATEID;
- if (newseq > lo->plh_return_seq)
- return NFS4ERR_DELAY;
- goto out;
- }
+ if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+ return NFS4ERR_DELAY;
- /* Check that the stateid matches what we think it should be. */
+ /*
+ * Check that the stateid matches what we think it should be.
+ * Note that if the server sent us a list of referring calls,
+ * and we know that those have completed, then we trust the
+ * stateid argument is correct.
+ */
oldseq = be32_to_cpu(lo->plh_stateid.seqid);
- if (newseq > oldseq + 1)
+ if (newseq > oldseq + 1 && !cps->referring_calls)
return NFS4ERR_DELAY;
+
/* Crazy server! */
if (newseq <= oldseq)
return NFS4ERR_OLD_STATEID;
-out:
+
return NFS_OK;
}
static u32 initiate_file_draining(struct nfs_client *clp,
- struct cb_layoutrecallargs *args)
+ struct cb_layoutrecallargs *args,
+ struct cb_process_state *cps)
{
struct inode *ino;
struct pnfs_layout_hdr *lo;
@@ -266,7 +268,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
goto out;
}
pnfs_get_layout_hdr(lo);
- rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid);
+ rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid, cps);
if (rv != NFS_OK)
goto unlock;
@@ -326,10 +328,11 @@ static u32 initiate_bulk_draining(struct nfs_client *clp,
}
static u32 do_callback_layoutrecall(struct nfs_client *clp,
- struct cb_layoutrecallargs *args)
+ struct cb_layoutrecallargs *args,
+ struct cb_process_state *cps)
{
if (args->cbl_recall_type == RETURN_FILE)
- return initiate_file_draining(clp, args);
+ return initiate_file_draining(clp, args, cps);
return initiate_bulk_draining(clp, args);
}
@@ -340,11 +343,12 @@ __be32 nfs4_callback_layoutrecall(void *argp, void *resp,
u32 res = NFS4ERR_OP_NOT_IN_SESSION;
if (cps->clp)
- res = do_callback_layoutrecall(cps->clp, args);
+ res = do_callback_layoutrecall(cps->clp, args, cps);
return cpu_to_be32(res);
}
-static void pnfs_recall_all_layouts(struct nfs_client *clp)
+static void pnfs_recall_all_layouts(struct nfs_client *clp,
+ struct cb_process_state *cps)
{
struct cb_layoutrecallargs args;
@@ -352,7 +356,7 @@ static void pnfs_recall_all_layouts(struct nfs_client *clp)
memset(&args, 0, sizeof(args));
args.cbl_recall_type = RETURN_ALL;
/* FIXME we ignore errors, what should we do? */
- do_callback_layoutrecall(clp, &args);
+ do_callback_layoutrecall(clp, &args, cps);
}
__be32 nfs4_callback_devicenotify(void *argp, void *resp,
@@ -450,6 +454,7 @@ static int referring_call_exists(struct nfs_client *clp,
__acquires(lock)
{
int status = 0;
+ int found = 0;
int i, j;
struct nfs4_session *session;
struct nfs4_slot_table *tbl;
@@ -478,11 +483,12 @@ static int referring_call_exists(struct nfs_client *clp,
spin_lock(lock);
if (status)
goto out;
+ found++;
}
}
out:
- return status;
+ return status < 0 ? status : found;
}
__be32 nfs4_callback_sequence(void *argp, void *resp,
@@ -493,6 +499,7 @@ __be32 nfs4_callback_sequence(void *argp, void *resp,
struct nfs4_slot_table *tbl;
struct nfs4_slot *slot;
struct nfs_client *clp;
+ int ret;
int i;
__be32 status = htonl(NFS4ERR_BADSESSION);
@@ -552,11 +559,13 @@ __be32 nfs4_callback_sequence(void *argp, void *resp,
* related callback was received before the response to the original
* call.
*/
- if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists,
- &tbl->slot_tbl_lock) < 0) {
+ ret = referring_call_exists(clp, args->csa_nrclists, args->csa_rclists,
+ &tbl->slot_tbl_lock);
+ if (ret < 0) {
status = htonl(NFS4ERR_DELAY);
goto out_unlock;
}
+ cps->referring_calls = ret;
/*
* RFC5661 20.9.3
@@ -617,7 +626,7 @@ __be32 nfs4_callback_recallany(void *argp, void *resp,
nfs_expire_unused_delegation_types(cps->clp, flags);
if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT))
- pnfs_recall_all_layouts(cps->clp);
+ pnfs_recall_all_layouts(cps->clp, cps);
if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) {
set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &cps->clp->cl_state);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 321af81c456e..9369488f2ed4 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -967,6 +967,11 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp)
nops--;
}
+ if (svc_is_backchannel(rqstp) && cps.clp) {
+ rqstp->bc_to_initval = cps.clp->cl_rpcclient->cl_timeout->to_initval;
+ rqstp->bc_to_retries = cps.clp->cl_rpcclient->cl_timeout->to_retries;
+ }
+
*hdr_res.status = status;
*hdr_res.nops = htonl(nops);
nfs4_cb_free_slot(&cps);
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 44eca51b2808..fbdc9ca80f71 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -246,7 +246,7 @@ void nfs_free_client(struct nfs_client *clp)
put_nfs_version(clp->cl_nfs_mod);
kfree(clp->cl_hostname);
kfree(clp->cl_acceptor);
- kfree(clp);
+ kfree_rcu(clp, rcu);
}
EXPORT_SYMBOL_GPL(nfs_free_client);
@@ -1006,6 +1006,14 @@ struct nfs_server *nfs_alloc_server(void)
}
EXPORT_SYMBOL_GPL(nfs_alloc_server);
+static void delayed_free(struct rcu_head *p)
+{
+ struct nfs_server *server = container_of(p, struct nfs_server, rcu);
+
+ nfs_free_iostats(server->io_stats);
+ kfree(server);
+}
+
/*
* Free up a server record
*/
@@ -1031,10 +1039,9 @@ void nfs_free_server(struct nfs_server *server)
ida_destroy(&server->lockowner_id);
ida_destroy(&server->openowner_id);
- nfs_free_iostats(server->io_stats);
put_cred(server->cred);
- kfree(server);
nfs_release_automount_timer();
+ call_rcu(&server->rcu, delayed_free);
}
EXPORT_SYMBOL_GPL(nfs_free_server);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 13dffe4201e6..ac505671efbd 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1431,9 +1431,9 @@ static bool nfs_verifier_is_delegated(struct dentry *dentry)
static void nfs_set_verifier_locked(struct dentry *dentry, unsigned long verf)
{
struct inode *inode = d_inode(dentry);
- struct inode *dir = d_inode(dentry->d_parent);
+ struct inode *dir = d_inode_rcu(dentry->d_parent);
- if (!nfs_verify_change_attribute(dir, verf))
+ if (!dir || !nfs_verify_change_attribute(dir, verf))
return;
if (inode && NFS_PROTO(inode)->have_delegation(inode, FMODE_READ))
nfs_set_verifier_delegated(&verf);
@@ -2194,6 +2194,8 @@ nfs4_do_lookup_revalidate(struct inode *dir, struct dentry *dentry,
{
struct inode *inode;
+ trace_nfs_lookup_revalidate_enter(dir, dentry, flags);
+
if (!(flags & LOOKUP_OPEN) || (flags & LOOKUP_DIRECTORY))
goto full_reval;
if (d_mountpoint(dentry))
@@ -2963,7 +2965,7 @@ static u64 nfs_access_login_time(const struct task_struct *task,
rcu_read_lock();
for (;;) {
parent = rcu_dereference(task->real_parent);
- pcred = rcu_dereference(parent->cred);
+ pcred = __task_cred(parent);
if (parent == task || cred_fscmp(pcred, cred) != 0)
break;
task = parent;
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f6c74f424691..c03926a1cc73 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -205,9 +205,10 @@ static void nfs_direct_req_release(struct nfs_direct_req *dreq)
kref_put(&dreq->kref, nfs_direct_req_free);
}
-ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq)
+ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset)
{
- return dreq->bytes_left;
+ loff_t start = offset - dreq->io_start;
+ return dreq->max_count - start;
}
EXPORT_SYMBOL_GPL(nfs_dreq_bytes_left);
@@ -368,7 +369,6 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
bytes -= req_len;
requested_bytes += req_len;
pos += req_len;
- dreq->bytes_left -= req_len;
}
nfs_direct_release_pages(pagevec, npages);
kvfree(pagevec);
@@ -440,7 +440,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
goto out;
dreq->inode = inode;
- dreq->bytes_left = dreq->max_count = count;
+ dreq->max_count = count;
dreq->io_start = iocb->ki_pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
@@ -873,7 +873,6 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
bytes -= req_len;
requested_bytes += req_len;
pos += req_len;
- dreq->bytes_left -= req_len;
if (defer) {
nfs_mark_request_commit(req, NULL, &cinfo, 0);
@@ -980,7 +979,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
goto out;
dreq->inode = inode;
- dreq->bytes_left = dreq->max_count = count;
+ dreq->max_count = count;
dreq->io_start = pos;
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 3f9768810427..8577ccf621f5 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -558,7 +558,6 @@ const struct address_space_operations nfs_file_aops = {
.read_folio = nfs_read_folio,
.readahead = nfs_readahead,
.dirty_folio = filemap_dirty_folio,
- .writepage = nfs_writepage,
.writepages = nfs_writepages,
.write_begin = nfs_write_begin,
.write_end = nfs_write_end,
@@ -567,7 +566,7 @@ const struct address_space_operations nfs_file_aops = {
.migrate_folio = nfs_migrate_folio,
.launder_folio = nfs_launder_folio,
.is_dirty_writeback = nfs_check_dirty_writeback,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = nfs_swap_activate,
.swap_deactivate = nfs_swap_deactivate,
.swap_rw = nfs_swap_rw,
diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c
index b05717fe0d4e..2d1bfee225c3 100644
--- a/fs/nfs/fscache.c
+++ b/fs/nfs/fscache.c
@@ -274,12 +274,6 @@ static void nfs_netfs_free_request(struct netfs_io_request *rreq)
put_nfs_open_context(rreq->netfs_priv);
}
-static inline int nfs_netfs_begin_cache_operation(struct netfs_io_request *rreq)
-{
- return fscache_begin_read_operation(&rreq->cache_resources,
- netfs_i_cookie(netfs_inode(rreq->inode)));
-}
-
static struct nfs_netfs_io_data *nfs_netfs_alloc(struct netfs_io_subrequest *sreq)
{
struct nfs_netfs_io_data *netfs;
@@ -387,7 +381,6 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr)
const struct netfs_request_ops nfs_netfs_ops = {
.init_request = nfs_netfs_init_request,
.free_request = nfs_netfs_free_request,
- .begin_cache_operation = nfs_netfs_begin_cache_operation,
.issue_read = nfs_netfs_issue_read,
.clamp_length = nfs_netfs_clamp_length
};
diff --git a/fs/nfs/fscache.h b/fs/nfs/fscache.h
index 5407ab8c8783..e3cb4923316b 100644
--- a/fs/nfs/fscache.h
+++ b/fs/nfs/fscache.h
@@ -80,7 +80,7 @@ static inline void nfs_netfs_put(struct nfs_netfs_io_data *netfs)
}
static inline void nfs_netfs_inode_init(struct nfs_inode *nfsi)
{
- netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops);
+ netfs_inode_init(&nfsi->netfs, &nfs_netfs_ops, false);
}
extern void nfs_netfs_initiate_read(struct nfs_pgio_header *hdr);
extern void nfs_netfs_read_completion(struct nfs_pgio_header *hdr);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9c9cf764f600..e3722ce6722e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -655,7 +655,7 @@ extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry);
/* direct.c */
void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq);
-extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
+extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq, loff_t offset);
/* nfs4proc.c */
extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
@@ -936,7 +936,6 @@ struct nfs_direct_req {
loff_t io_start; /* Start offset for I/O */
ssize_t count, /* bytes actually processed */
max_count, /* max expected count */
- bytes_left, /* bytes left to be sent */
error; /* any reported error */
struct completion completion; /* wait for i/o completion */
diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c
index 2ad66a8922f4..49aaf28a6950 100644
--- a/fs/nfs/nfs42xattr.c
+++ b/fs/nfs/nfs42xattr.c
@@ -132,7 +132,7 @@ nfs4_xattr_entry_lru_add(struct nfs4_xattr_entry *entry)
lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ?
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
- return list_lru_add(lru, &entry->lru);
+ return list_lru_add_obj(lru, &entry->lru);
}
static bool
@@ -143,7 +143,7 @@ nfs4_xattr_entry_lru_del(struct nfs4_xattr_entry *entry)
lru = (entry->flags & NFS4_XATTR_ENTRY_EXTVAL) ?
&nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru;
- return list_lru_del(lru, &entry->lru);
+ return list_lru_del_obj(lru, &entry->lru);
}
/*
@@ -349,7 +349,7 @@ nfs4_xattr_cache_unlink(struct inode *inode)
oldcache = nfsi->xattr_cache;
if (oldcache != NULL) {
- list_lru_del(&nfs4_xattr_cache_lru, &oldcache->lru);
+ list_lru_del_obj(&nfs4_xattr_cache_lru, &oldcache->lru);
oldcache->inode = NULL;
}
nfsi->xattr_cache = NULL;
@@ -474,7 +474,7 @@ nfs4_xattr_get_cache(struct inode *inode, int add)
kref_get(&cache->ref);
nfsi->xattr_cache = cache;
cache->inode = inode;
- list_lru_add(&nfs4_xattr_cache_lru, &cache->lru);
+ list_lru_add_obj(&nfs4_xattr_cache_lru, &cache->lru);
}
spin_unlock(&inode->i_lock);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 02788c3c85e5..e238abc78a13 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -10,6 +10,7 @@
#include <linux/mount.h>
#include <linux/nfs_fs.h>
#include <linux/nfs_ssc.h>
+#include <linux/splice.h>
#include "delegation.h"
#include "internal.h"
#include "iostat.h"
@@ -195,8 +196,8 @@ static ssize_t nfs4_copy_file_range(struct file *file_in, loff_t pos_in,
ret = __nfs4_copy_file_range(file_in, pos_in, file_out, pos_out, count,
flags);
if (ret == -EOPNOTSUPP || ret == -EXDEV)
- ret = generic_copy_file_range(file_in, pos_in, file_out,
- pos_out, count, flags);
+ ret = splice_copy_file_range(file_in, pos_in, file_out,
+ pos_out, count);
return ret;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8a943fffaad5..23819a756508 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -170,6 +170,7 @@ static int nfs4_map_errors(int err)
case -NFS4ERR_RESOURCE:
case -NFS4ERR_LAYOUTTRYLATER:
case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
return -EREMOTEIO;
case -NFS4ERR_WRONGSEC:
case -NFS4ERR_WRONG_CRED:
@@ -558,6 +559,7 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
case -NFS4ERR_GRACE:
case -NFS4ERR_LAYOUTTRYLATER:
case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
exception->delay = 1;
return 0;
@@ -9691,6 +9693,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
status = -EBUSY;
break;
case -NFS4ERR_RECALLCONFLICT:
+ case -NFS4ERR_RETURNCONFLICT:
status = -ERECALLCONFLICT;
break;
case -NFS4ERR_DELEG_REVOKED:
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index e776200e9a11..886a7c4c60b3 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -34,7 +34,6 @@ static struct ctl_table nfs4_cb_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
int nfs4_register_sysctl(void)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index deec76cf5afe..69406e60f391 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1602,7 +1602,8 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_pgio_args *args
static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
{
uint32_t attrs[3] = {
- FATTR4_WORD0_RDATTR_ERROR,
+ FATTR4_WORD0_TYPE
+ | FATTR4_WORD0_RDATTR_ERROR,
FATTR4_WORD1_MOUNTED_ON_FILEID,
};
uint32_t dircount = readdir->count;
@@ -1612,12 +1613,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
unsigned int i;
if (readdir->plus) {
- attrs[0] |= FATTR4_WORD0_TYPE|FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE|
- FATTR4_WORD0_FSID|FATTR4_WORD0_FILEHANDLE|FATTR4_WORD0_FILEID;
- attrs[1] |= FATTR4_WORD1_MODE|FATTR4_WORD1_NUMLINKS|FATTR4_WORD1_OWNER|
- FATTR4_WORD1_OWNER_GROUP|FATTR4_WORD1_RAWDEV|
- FATTR4_WORD1_SPACE_USED|FATTR4_WORD1_TIME_ACCESS|
- FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
+ attrs[0] |= FATTR4_WORD0_CHANGE
+ | FATTR4_WORD0_SIZE
+ | FATTR4_WORD0_FSID
+ | FATTR4_WORD0_FILEHANDLE
+ | FATTR4_WORD0_FILEID;
+ attrs[1] |= FATTR4_WORD1_MODE
+ | FATTR4_WORD1_NUMLINKS
+ | FATTR4_WORD1_OWNER
+ | FATTR4_WORD1_OWNER_GROUP
+ | FATTR4_WORD1_RAWDEV
+ | FATTR4_WORD1_SPACE_USED
+ | FATTR4_WORD1_TIME_ACCESS
+ | FATTR4_WORD1_TIME_METADATA
+ | FATTR4_WORD1_TIME_MODIFY;
attrs[2] |= FATTR4_WORD2_SECURITY_LABEL;
}
/* Use mounted_on_fileid only if the server supports it */
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 4e90ca531176..afedb449b54f 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -400,6 +400,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event,
__field(unsigned long, flags)
__field(dev_t, dev)
__field(u64, dir)
+ __field(u64, fileid)
__string(name, dentry->d_name.name)
),
@@ -407,16 +408,18 @@ DECLARE_EVENT_CLASS(nfs_lookup_event,
__entry->dev = dir->i_sb->s_dev;
__entry->dir = NFS_FILEID(dir);
__entry->flags = flags;
+ __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry));
__assign_str(name, dentry->d_name.name);
),
TP_printk(
- "flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+ "flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu",
__entry->flags,
show_fs_lookup_flags(__entry->flags),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
- __get_str(name)
+ __get_str(name),
+ __entry->fileid
)
);
@@ -444,6 +447,7 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done,
__field(unsigned long, flags)
__field(dev_t, dev)
__field(u64, dir)
+ __field(u64, fileid)
__string(name, dentry->d_name.name)
),
@@ -452,17 +456,19 @@ DECLARE_EVENT_CLASS(nfs_lookup_event_done,
__entry->dir = NFS_FILEID(dir);
__entry->error = error < 0 ? -error : 0;
__entry->flags = flags;
+ __entry->fileid = d_is_negative(dentry) ? 0 : NFS_FILEID(d_inode(dentry));
__assign_str(name, dentry->d_name.name);
),
TP_printk(
- "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s",
+ "error=%ld (%s) flags=0x%lx (%s) name=%02x:%02x:%llu/%s fileid=%llu",
-__entry->error, show_nfs_status(__entry->error),
__entry->flags,
show_fs_lookup_flags(__entry->flags),
MAJOR(__entry->dev), MINOR(__entry->dev),
(unsigned long long)__entry->dir,
- __get_str(name)
+ __get_str(name),
+ __entry->fileid
)
);
@@ -893,7 +899,7 @@ DECLARE_EVENT_CLASS(nfs_rename_event_done,
DEFINE_NFS_RENAME_EVENT(nfs_rename_enter);
DEFINE_NFS_RENAME_EVENT_DONE(nfs_rename_exit);
-DEFINE_NFS_RENAME_EVENT_DONE(nfs_sillyrename_rename);
+DEFINE_NFS_RENAME_EVENT_DONE(nfs_async_rename_done);
TRACE_EVENT(nfs_sillyrename_unlink,
TP_PROTO(
@@ -1539,7 +1545,6 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class,
__field(u32, fhandle)
__field(loff_t, offset)
__field(ssize_t, count)
- __field(ssize_t, bytes_left)
__field(ssize_t, error)
__field(int, flags)
),
@@ -1554,19 +1559,18 @@ DECLARE_EVENT_CLASS(nfs_direct_req_class,
__entry->fhandle = nfs_fhandle_hash(fh);
__entry->offset = dreq->io_start;
__entry->count = dreq->count;
- __entry->bytes_left = dreq->bytes_left;
__entry->error = dreq->error;
__entry->flags = dreq->flags;
),
TP_printk(
"error=%zd fileid=%02x:%02x:%llu fhandle=0x%08x "
- "offset=%lld count=%zd bytes_left=%zd flags=%s",
+ "offset=%lld count=%zd flags=%s",
__entry->error, MAJOR(__entry->dev),
MINOR(__entry->dev),
(unsigned long long)__entry->fileid,
__entry->fhandle, __entry->offset,
- __entry->count, __entry->bytes_left,
+ __entry->count,
nfs_show_direct_req_flags(__entry->flags)
)
);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 21a365357629..0c0fed1ecd0b 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -2733,7 +2733,8 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
if (pgio->pg_dreq == NULL)
rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
else
- rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
+ rd_size = nfs_dreq_bytes_left(pgio->pg_dreq,
+ req_offset(req));
pgio->pg_lseg =
pnfs_update_layout(pgio->pg_inode, nfs_req_openctx(req),
diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c
index f39e2089bc4c..e645be1a3381 100644
--- a/fs/nfs/sysctl.c
+++ b/fs/nfs/sysctl.c
@@ -29,7 +29,6 @@ static struct ctl_table nfs_cb_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- { }
};
int nfs_register_sysctl(void)
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 150a953a8be9..0110299643a2 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -267,7 +267,7 @@ static void nfs_async_rename_done(struct rpc_task *task, void *calldata)
struct inode *new_dir = data->new_dir;
struct dentry *old_dentry = data->old_dentry;
- trace_nfs_sillyrename_rename(old_dir, old_dentry,
+ trace_nfs_async_rename_done(old_dir, old_dentry,
new_dir, data->new_dentry, task->tk_status);
if (!NFS_PROTO(old_dir)->rename_done(task, old_dir, new_dir)) {
rpc_restart_call_prepare(task);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b664caea8b4e..bb79d3a886ae 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -192,13 +192,13 @@ static struct nfs_page *nfs_folio_find_private_request(struct folio *folio)
if (!folio_test_private(folio))
return NULL;
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
req = nfs_folio_private_request(folio);
if (req) {
WARN_ON_ONCE(req->wb_head != req);
kref_get(&req->wb_kref);
}
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
return req;
}
@@ -680,17 +680,6 @@ static int nfs_writepage_locked(struct folio *folio,
return err;
}
-int nfs_writepage(struct page *page, struct writeback_control *wbc)
-{
- struct folio *folio = page_folio(page);
- int ret;
-
- ret = nfs_writepage_locked(folio, wbc);
- if (ret != AOP_WRITEPAGE_ACTIVATE)
- unlock_page(page);
- return ret;
-}
-
static int nfs_writepages_callback(struct folio *folio,
struct writeback_control *wbc, void *data)
{
@@ -769,13 +758,13 @@ static void nfs_inode_add_request(struct nfs_page *req)
* Swap-space should not get truncated. Hence no need to plug the race
* with invalidate/truncate.
*/
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
if (likely(!folio_test_swapcache(folio))) {
set_bit(PG_MAPPED, &req->wb_flags);
folio_set_private(folio);
folio->private = req;
}
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
atomic_long_inc(&nfsi->nrequests);
/* this a head request for a page group - mark it as having an
* extra reference so sub groups can follow suit.
@@ -796,13 +785,13 @@ static void nfs_inode_remove_request(struct nfs_page *req)
struct folio *folio = nfs_page_to_folio(req->wb_head);
struct address_space *mapping = folio_file_mapping(folio);
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
if (likely(folio && !folio_test_swapcache(folio))) {
folio->private = NULL;
folio_clear_private(folio);
clear_bit(PG_MAPPED, &req->wb_head->wb_flags);
}
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
}
if (test_and_clear_bit(PG_INODE_REF, &req->wb_flags)) {
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 43b88eaf0673..272ab8d5c4d7 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -158,3 +158,19 @@ config NFSD_V4_SECURITY_LABEL
If you do not wish to enable fine-grained security labels SELinux or
Smack policies on NFSv4 files, say N.
+
+config NFSD_LEGACY_CLIENT_TRACKING
+ bool "Support legacy NFSv4 client tracking methods (DEPRECATED)"
+ depends on NFSD_V4
+ default n
+ help
+ The NFSv4 server needs to store a small amount of information on
+ stable storage in order to handle state recovery after reboot. Most
+ modern deployments upcall to a userland daemon for this (nfsdcld),
+ but older NFS servers may store information directly in a
+ recoverydir, or spawn a process directly using a usermodehelper
+ upcall.
+
+ These legacy client tracking methods have proven to be probelmatic
+ and will be removed in the future. Say Y here if you need support
+ for them in the interim.
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index ef063f93fde9..9cb7f0c33df5 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -322,7 +322,7 @@ nfsd_file_check_writeback(struct nfsd_file *nf)
static bool nfsd_file_lru_add(struct nfsd_file *nf)
{
set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags);
- if (list_lru_add(&nfsd_file_lru, &nf->nf_lru)) {
+ if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru)) {
trace_nfsd_file_lru_add(nf);
return true;
}
@@ -331,7 +331,7 @@ static bool nfsd_file_lru_add(struct nfsd_file *nf)
static bool nfsd_file_lru_remove(struct nfsd_file *nf)
{
- if (list_lru_del(&nfsd_file_lru, &nf->nf_lru)) {
+ if (list_lru_del_obj(&nfsd_file_lru, &nf->nf_lru)) {
trace_nfsd_file_lru_del(nf);
return true;
}
@@ -717,7 +717,7 @@ nfsd_file_cache_init(void)
return ret;
ret = -ENOMEM;
- nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0);
+ nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", WQ_UNBOUND, 0);
if (!nfsd_filecache_wq)
goto out;
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index ab303a8b77d5..74b4360779a1 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -123,14 +123,9 @@ struct nfsd_net {
u32 clientid_counter;
u32 clverifier_counter;
- struct svc_serv *nfsd_serv;
- /* When a listening socket is added to nfsd, keep_active is set
- * and this justifies a reference on nfsd_serv. This stops
- * nfsd_serv from being freed. When the number of threads is
- * set, keep_active is cleared and the reference is dropped. So
- * when the last thread exits, the service will be destroyed.
- */
- int keep_active;
+ struct svc_info nfsd_info;
+#define nfsd_serv nfsd_info.serv
+
/*
* clientid and stateid data for construction of net unique COPY
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 4039ffcf90ba..926c29879c6a 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -31,6 +31,7 @@
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
+#include <linux/nfs4.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/xprt.h>
#include <linux/sunrpc/svc_xprt.h>
@@ -87,31 +88,6 @@ static void encode_bitmap4(struct xdr_stream *xdr, const __u32 *bitmap,
WARN_ON_ONCE(xdr_stream_encode_uint32_array(xdr, bitmap, len) < 0);
}
-/*
- * nfs_cb_opnum4
- *
- * enum nfs_cb_opnum4 {
- * OP_CB_GETATTR = 3,
- * ...
- * };
- */
-enum nfs_cb_opnum4 {
- OP_CB_GETATTR = 3,
- OP_CB_RECALL = 4,
- OP_CB_LAYOUTRECALL = 5,
- OP_CB_NOTIFY = 6,
- OP_CB_PUSH_DELEG = 7,
- OP_CB_RECALL_ANY = 8,
- OP_CB_RECALLABLE_OBJ_AVAIL = 9,
- OP_CB_RECALL_SLOT = 10,
- OP_CB_SEQUENCE = 11,
- OP_CB_WANTS_CANCELLED = 12,
- OP_CB_NOTIFY_LOCK = 13,
- OP_CB_NOTIFY_DEVICEID = 14,
- OP_CB_OFFLOAD = 15,
- OP_CB_ILLEGAL = 10044
-};
-
static void encode_nfs_cb_opnum4(struct xdr_stream *xdr, enum nfs_cb_opnum4 op)
{
__be32 *p;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 6f2d4aa4970d..14712fa08f76 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -970,8 +970,11 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* To ensure proper ordering, we therefore turn off zero copy if
* the client wants us to do more in this compound:
*/
- if (!nfsd4_last_compound_op(rqstp))
- clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
+ if (!nfsd4_last_compound_op(rqstp)) {
+ struct nfsd4_compoundargs *argp = rqstp->rq_argp;
+
+ argp->splice_ok = false;
+ }
/* check stateid */
status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 3509e73abe1f..2c060e0b1604 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -66,6 +66,7 @@ struct nfsd4_client_tracking_ops {
static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops;
static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2;
+#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
/* Globals */
static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
@@ -720,6 +721,7 @@ static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
.version = 1,
.msglen = 0,
};
+#endif /* CONFIG_NFSD_LEGACY_CLIENT_TRACKING */
/* Globals */
#define NFSD_PIPE_DIR "nfsd"
@@ -731,8 +733,10 @@ struct cld_net {
spinlock_t cn_lock;
struct list_head cn_list;
unsigned int cn_xid;
- bool cn_has_legacy;
struct crypto_shash *cn_tfm;
+#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
+ bool cn_has_legacy;
+#endif
};
struct cld_upcall {
@@ -793,7 +797,6 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
uint8_t cmd, princhashlen;
struct xdr_netobj name, princhash = { .len = 0, .data = NULL };
uint16_t namelen;
- struct cld_net *cn = nn->cld_net;
if (get_user(cmd, &cmsg->cm_cmd)) {
dprintk("%s: error when copying cmd from userspace", __func__);
@@ -833,11 +836,15 @@ __cld_pipe_inprogress_downcall(const struct cld_msg_v2 __user *cmsg,
return PTR_ERR(name.data);
name.len = namelen;
}
+#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
if (name.len > 5 && memcmp(name.data, "hash:", 5) == 0) {
+ struct cld_net *cn = nn->cld_net;
+
name.len = name.len - 5;
memmove(name.data, name.data + 5, name.len);
cn->cn_has_legacy = true;
}
+#endif
if (!nfs4_client_to_reclaim(name, princhash, nn)) {
kfree(name.data);
kfree(princhash.data);
@@ -1010,7 +1017,9 @@ __nfsd4_init_cld_pipe(struct net *net)
}
cn->cn_pipe->dentry = dentry;
+#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
cn->cn_has_legacy = false;
+#endif
nn->cld_net = cn;
return 0;
@@ -1282,10 +1291,6 @@ nfsd4_cld_check(struct nfs4_client *clp)
{
struct nfs4_client_reclaim *crp;
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
- struct cld_net *cn = nn->cld_net;
- int status;
- char dname[HEXDIR_LEN];
- struct xdr_netobj name;
/* did we already find that this client is stable? */
if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags))
@@ -1296,7 +1301,12 @@ nfsd4_cld_check(struct nfs4_client *clp)
if (crp)
goto found;
- if (cn->cn_has_legacy) {
+#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
+ if (nn->cld_net->cn_has_legacy) {
+ int status;
+ char dname[HEXDIR_LEN];
+ struct xdr_netobj name;
+
status = nfs4_make_rec_clidname(dname, &clp->cl_name);
if (status)
return -ENOENT;
@@ -1314,6 +1324,7 @@ nfsd4_cld_check(struct nfs4_client *clp)
goto found;
}
+#endif
return -ENOENT;
found:
crp->cr_clp = clp;
@@ -1327,8 +1338,6 @@ nfsd4_cld_check_v2(struct nfs4_client *clp)
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
struct cld_net *cn = nn->cld_net;
int status;
- char dname[HEXDIR_LEN];
- struct xdr_netobj name;
struct crypto_shash *tfm = cn->cn_tfm;
struct xdr_netobj cksum;
char *principal = NULL;
@@ -1342,7 +1351,11 @@ nfsd4_cld_check_v2(struct nfs4_client *clp)
if (crp)
goto found;
+#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
if (cn->cn_has_legacy) {
+ struct xdr_netobj name;
+ char dname[HEXDIR_LEN];
+
status = nfs4_make_rec_clidname(dname, &clp->cl_name);
if (status)
return -ENOENT;
@@ -1360,6 +1373,7 @@ nfsd4_cld_check_v2(struct nfs4_client *clp)
goto found;
}
+#endif
return -ENOENT;
found:
if (crp->cr_princhash.len) {
@@ -1663,6 +1677,7 @@ static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops_v2 = {
.msglen = sizeof(struct cld_msg_v2),
};
+#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
/* upcall via usermodehelper */
static char cltrack_prog[PATH_MAX] = "/sbin/nfsdcltrack";
module_param_string(cltrack_prog, cltrack_prog, sizeof(cltrack_prog),
@@ -2007,28 +2022,10 @@ static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
.msglen = 0,
};
-int
-nfsd4_client_tracking_init(struct net *net)
+static inline int check_for_legacy_methods(int status, struct net *net)
{
- int status;
- struct path path;
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-
- /* just run the init if it the method is already decided */
- if (nn->client_tracking_ops)
- goto do_init;
-
- /* First, try to use nfsdcld */
- nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
- status = nn->client_tracking_ops->init(net);
- if (!status)
- return status;
- if (status != -ETIMEDOUT) {
- nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0;
- status = nn->client_tracking_ops->init(net);
- if (!status)
- return status;
- }
+ struct path path;
/*
* Next, try the UMH upcall.
@@ -2045,14 +2042,46 @@ nfsd4_client_tracking_init(struct net *net)
nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
if (!status) {
- status = d_is_dir(path.dentry);
+ status = !d_is_dir(path.dentry);
path_put(&path);
- if (!status) {
- status = -EINVAL;
- goto out;
- }
+ if (status)
+ return -ENOTDIR;
+ status = nn->client_tracking_ops->init(net);
+ }
+ return status;
+}
+#else
+static inline int check_for_legacy_methods(int status, struct net *net)
+{
+ return status;
+}
+#endif /* CONFIG_LEGACY_NFSD_CLIENT_TRACKING */
+
+int
+nfsd4_client_tracking_init(struct net *net)
+{
+ struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+ int status;
+
+ /* just run the init if it the method is already decided */
+ if (nn->client_tracking_ops)
+ goto do_init;
+
+ /* First, try to use nfsdcld */
+ nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
+ status = nn->client_tracking_ops->init(net);
+ if (!status)
+ return status;
+ if (status != -ETIMEDOUT) {
+ nn->client_tracking_ops = &nfsd4_cld_tracking_ops_v0;
+ status = nn->client_tracking_ops->init(net);
+ if (!status)
+ return status;
}
+ status = check_for_legacy_methods(status, net);
+ if (status)
+ goto out;
do_init:
status = nn->client_tracking_ops->init(net);
out:
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3edbfa0233e6..7d6c657e0409 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4945,10 +4945,8 @@ nfsd_break_deleg_cb(struct file_lock *fl)
*/
fl->fl_break_time = 0;
- spin_lock(&fp->fi_lock);
fp->fi_had_conflict = true;
nfsd_break_one_deleg(dp);
- spin_unlock(&fp->fi_lock);
return false;
}
@@ -5557,12 +5555,13 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
if (status)
goto out_unlock;
+ status = -EAGAIN;
+ if (fp->fi_had_conflict)
+ goto out_unlock;
+
spin_lock(&state_lock);
spin_lock(&fp->fi_lock);
- if (fp->fi_had_conflict)
- status = -EAGAIN;
- else
- status = hash_delegation_locked(dp, fp);
+ status = hash_delegation_locked(dp, fp);
spin_unlock(&fp->fi_lock);
spin_unlock(&state_lock);
@@ -6575,7 +6574,7 @@ unlock:
spin_unlock(&nn->s2s_cp_lock);
if (!state)
return nfserr_bad_stateid;
- if (!clp && state)
+ if (!clp)
*cps = state;
return 0;
}
@@ -7911,14 +7910,16 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
{
struct file_lock *fl;
int status = false;
- struct nfsd_file *nf = find_any_file(fp);
+ struct nfsd_file *nf;
struct inode *inode;
struct file_lock_context *flctx;
+ spin_lock(&fp->fi_lock);
+ nf = find_any_file_locked(fp);
if (!nf) {
/* Any valid lock stateid should have some sort of access */
WARN_ON_ONCE(1);
- return status;
+ goto out;
}
inode = file_inode(nf->nf_file);
@@ -7934,7 +7935,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
}
spin_unlock(&flctx->flc_lock);
}
- nfsd_file_put(nf);
+out:
+ spin_unlock(&fp->fi_lock);
return status;
}
@@ -7944,10 +7946,8 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
* @cstate: NFSv4 COMPOUND state
* @u: RELEASE_LOCKOWNER arguments
*
- * The lockowner's so_count is bumped when a lock record is added
- * or when copying a conflicting lock. The latter case is brief,
- * but can lead to fleeting false positives when looking for
- * locks-in-use.
+ * Check if theree are any locks still held and if not - free the lockowner
+ * and any lock state that is owned.
*
* Return values:
* %nfs_ok: lockowner released or not found
@@ -7983,10 +7983,13 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
spin_unlock(&clp->cl_lock);
return nfs_ok;
}
- if (atomic_read(&lo->lo_owner.so_count) != 2) {
- spin_unlock(&clp->cl_lock);
- nfs4_put_stateowner(&lo->lo_owner);
- return nfserr_locks_held;
+
+ list_for_each_entry(stp, &lo->lo_owner.so_stateids, st_perstateowner) {
+ if (check_for_locks(stp->st_stid.sc_file, lo)) {
+ spin_unlock(&clp->cl_lock);
+ nfs4_put_stateowner(&lo->lo_owner);
+ return nfserr_locks_held;
+ }
}
unhash_lockowner_locked(lo);
while (!list_empty(&lo->lo_owner.so_stateids)) {
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b499fe9caa32..c719c475a068 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2524,8 +2524,9 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
svc_reserve(argp->rqstp, max_reply + readbytes);
argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
+ argp->splice_ok = nfsd_read_splice_ok(argp->rqstp);
if (readcount > 1 || max_reply > PAGE_SIZE - auth_slack)
- clear_bit(RQ_SPLICE_OK, &argp->rqstp->rq_flags);
+ argp->splice_ok = false;
return true;
}
@@ -4375,12 +4376,13 @@ static __be32
nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr,
union nfsd4_op_u *u)
{
+ struct nfsd4_compoundargs *argp = resp->rqstp->rq_argp;
struct nfsd4_read *read = &u->read;
- bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
- unsigned long maxcount;
struct xdr_stream *xdr = resp->xdr;
- struct file *file;
int starting_len = xdr->buf->len;
+ bool splice_ok = argp->splice_ok;
+ unsigned long maxcount;
+ struct file *file;
__be32 *p;
if (nfserr)
@@ -5201,9 +5203,10 @@ static __be32
nfsd4_encode_read_plus_data(struct nfsd4_compoundres *resp,
struct nfsd4_read *read)
{
- bool splice_ok = test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags);
+ struct nfsd4_compoundargs *argp = resp->rqstp->rq_argp;
struct file *file = read->rd_nf->nf_file;
struct xdr_stream *xdr = resp->xdr;
+ bool splice_ok = argp->splice_ok;
unsigned long maxcount;
__be32 nfserr, *p;
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index d3273a396659..5c1a4a0aa605 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -364,8 +364,6 @@ nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
if (freed > sc->nr_to_scan)
break;
}
-
- trace_nfsd_drc_gc(nn, freed);
return freed;
}
@@ -508,7 +506,6 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
__wsum csum;
struct nfsd_drc_bucket *b;
int type = rqstp->rq_cachetype;
- unsigned long freed;
LIST_HEAD(dispose);
int rtn = RC_DOIT;
@@ -538,8 +535,7 @@ int nfsd_cache_lookup(struct svc_rqst *rqstp, unsigned int start,
nfsd_prune_bucket_locked(nn, b, 3, &dispose);
spin_unlock(&b->cache_lock);
- freed = nfsd_cacherep_dispose(&dispose);
- trace_nfsd_drc_gc(nn, freed);
+ nfsd_cacherep_dispose(&dispose);
nfsd_stats_rc_misses_inc();
atomic_inc(&nn->num_drc_entries);
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 87fed75808ff..f206ca32e7f5 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -48,10 +48,6 @@ enum {
NFSD_MaxBlkSize,
NFSD_MaxConnections,
NFSD_Filecache,
- /*
- * The below MUST come last. Otherwise we leave a hole in nfsd_files[]
- * with !CONFIG_NFSD_V4 and simple_fill_super() goes oops
- */
#ifdef CONFIG_NFSD_V4
NFSD_Leasetime,
NFSD_Gracetime,
@@ -76,7 +72,9 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size);
#ifdef CONFIG_NFSD_V4
static ssize_t write_leasetime(struct file *file, char *buf, size_t size);
static ssize_t write_gracetime(struct file *file, char *buf, size_t size);
+#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
static ssize_t write_recoverydir(struct file *file, char *buf, size_t size);
+#endif
static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size);
#endif
@@ -93,7 +91,9 @@ static ssize_t (*const write_op[])(struct file *, char *, size_t) = {
#ifdef CONFIG_NFSD_V4
[NFSD_Leasetime] = write_leasetime,
[NFSD_Gracetime] = write_gracetime,
+#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
[NFSD_RecoveryDir] = write_recoverydir,
+#endif
[NFSD_V4EndGrace] = write_v4_end_grace,
#endif
};
@@ -179,7 +179,7 @@ static const struct file_operations pool_stats_operations = {
.open = nfsd_pool_stats_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = nfsd_pool_stats_release,
+ .release = seq_release,
};
DEFINE_SHOW_ATTRIBUTE(nfsd_reply_cache_stats);
@@ -707,12 +707,9 @@ static ssize_t __write_ports_addfd(char *buf, struct net *net, const struct cred
serv = nn->nfsd_serv;
err = svc_addsock(serv, net, fd, buf, SIMPLE_TRANSACTION_LIMIT, cred);
- if (err < 0 && !serv->sv_nrthreads && !nn->keep_active)
- nfsd_last_thread(net);
- else if (err >= 0 && !serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
- svc_get(serv);
+ if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks))
+ nfsd_destroy_serv(net);
- svc_put(serv);
return err;
}
@@ -750,10 +747,6 @@ static ssize_t __write_ports_addxprt(char *buf, struct net *net, const struct cr
if (err < 0 && err != -EAFNOSUPPORT)
goto out_close;
- if (!serv->sv_nrthreads && !xchg(&nn->keep_active, 1))
- svc_get(serv);
-
- svc_put(serv);
return 0;
out_close:
xprt = svc_find_xprt(serv, transport, net, PF_INET, port);
@@ -762,10 +755,9 @@ out_close:
svc_xprt_put(xprt);
}
out_err:
- if (!serv->sv_nrthreads && !nn->keep_active)
- nfsd_last_thread(net);
+ if (!serv->sv_nrthreads && list_empty(&nn->nfsd_serv->sv_permsocks))
+ nfsd_destroy_serv(net);
- svc_put(serv);
return err;
}
@@ -1021,6 +1013,7 @@ static ssize_t write_gracetime(struct file *file, char *buf, size_t size)
return nfsd4_write_time(file, buf, size, &nn->nfsd4_grace, nn);
}
+#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size,
struct nfsd_net *nn)
{
@@ -1081,6 +1074,7 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size)
mutex_unlock(&nfsd_mutex);
return rv;
}
+#endif
/*
* write_v4_end_grace - release grace period for nfsd's v4.x lock manager
@@ -1244,63 +1238,34 @@ static inline void _nfsd_symlink(struct dentry *parent, const char *name,
#endif
-static void clear_ncl(struct inode *inode)
+static void clear_ncl(struct dentry *dentry)
{
+ struct inode *inode = d_inode(dentry);
struct nfsdfs_client *ncl = inode->i_private;
+ spin_lock(&inode->i_lock);
inode->i_private = NULL;
+ spin_unlock(&inode->i_lock);
kref_put(&ncl->cl_ref, ncl->cl_release);
}
-static struct nfsdfs_client *__get_nfsdfs_client(struct inode *inode)
-{
- struct nfsdfs_client *nc = inode->i_private;
-
- if (nc)
- kref_get(&nc->cl_ref);
- return nc;
-}
-
struct nfsdfs_client *get_nfsdfs_client(struct inode *inode)
{
struct nfsdfs_client *nc;
- inode_lock_shared(inode);
- nc = __get_nfsdfs_client(inode);
- inode_unlock_shared(inode);
+ spin_lock(&inode->i_lock);
+ nc = inode->i_private;
+ if (nc)
+ kref_get(&nc->cl_ref);
+ spin_unlock(&inode->i_lock);
return nc;
}
-/* from __rpc_unlink */
-static void nfsdfs_remove_file(struct inode *dir, struct dentry *dentry)
-{
- int ret;
-
- clear_ncl(d_inode(dentry));
- dget(dentry);
- ret = simple_unlink(dir, dentry);
- d_drop(dentry);
- fsnotify_unlink(dir, dentry);
- dput(dentry);
- WARN_ON_ONCE(ret);
-}
-
-static void nfsdfs_remove_files(struct dentry *root)
-{
- struct dentry *dentry, *tmp;
-
- list_for_each_entry_safe(dentry, tmp, &root->d_subdirs, d_child) {
- if (!simple_positive(dentry)) {
- WARN_ON_ONCE(1); /* I think this can't happen? */
- continue;
- }
- nfsdfs_remove_file(d_inode(root), dentry);
- }
-}
/* XXX: cut'n'paste from simple_fill_super; figure out if we could share
* code instead. */
static int nfsdfs_create_files(struct dentry *root,
const struct tree_descr *files,
+ struct nfsdfs_client *ncl,
struct dentry **fdentries)
{
struct inode *dir = d_inode(root);
@@ -1319,8 +1284,9 @@ static int nfsdfs_create_files(struct dentry *root,
dput(dentry);
goto out;
}
+ kref_get(&ncl->cl_ref);
inode->i_fop = files->ops;
- inode->i_private = __get_nfsdfs_client(dir);
+ inode->i_private = ncl;
d_add(dentry, inode);
fsnotify_create(dir, dentry);
if (fdentries)
@@ -1329,7 +1295,6 @@ static int nfsdfs_create_files(struct dentry *root,
inode_unlock(dir);
return 0;
out:
- nfsdfs_remove_files(root);
inode_unlock(dir);
return -ENOMEM;
}
@@ -1349,7 +1314,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
dentry = nfsd_mkdir(nn->nfsd_client_dir, ncl, name);
if (IS_ERR(dentry)) /* XXX: tossing errors? */
return NULL;
- ret = nfsdfs_create_files(dentry, files, fdentries);
+ ret = nfsdfs_create_files(dentry, files, ncl, fdentries);
if (ret) {
nfsd_client_rmdir(dentry);
return NULL;
@@ -1360,20 +1325,7 @@ struct dentry *nfsd_client_mkdir(struct nfsd_net *nn,
/* Taken from __rpc_rmdir: */
void nfsd_client_rmdir(struct dentry *dentry)
{
- struct inode *dir = d_inode(dentry->d_parent);
- struct inode *inode = d_inode(dentry);
- int ret;
-
- inode_lock(dir);
- nfsdfs_remove_files(dentry);
- clear_ncl(inode);
- dget(dentry);
- ret = simple_rmdir(dir, dentry);
- WARN_ON_ONCE(ret);
- d_drop(dentry);
- fsnotify_rmdir(dir, dentry);
- dput(dentry);
- inode_unlock(dir);
+ simple_recursive_removal(dentry, clear_ncl);
}
static int nfsd_fill_super(struct super_block *sb, struct fs_context *fc)
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 9ed0e08d16c2..304e9728b929 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -148,7 +148,7 @@ int nfsd_vers(struct nfsd_net *nn, int vers, enum vers_op change);
int nfsd_minorversion(struct nfsd_net *nn, u32 minorversion, enum vers_op change);
void nfsd_reset_versions(struct nfsd_net *nn);
int nfsd_create_serv(struct net *net);
-void nfsd_last_thread(struct net *net);
+void nfsd_destroy_serv(struct net *net);
extern int nfsd_max_blksize;
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 7a2bc8e82a63..a667802e08e7 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -59,15 +59,6 @@ static __be32 nfsd_init_request(struct svc_rqst *,
* nfsd_mutex protects nn->nfsd_serv -- both the pointer itself and some members
* of the svc_serv struct such as ->sv_temp_socks and ->sv_permsocks.
*
- * If (out side the lock) nn->nfsd_serv is non-NULL, then it must point to a
- * properly initialised 'struct svc_serv' with ->sv_nrthreads > 0 (unless
- * nn->keep_active is set). That number of nfsd threads must
- * exist and each must be listed in ->sp_all_threads in some entry of
- * ->sv_pools[].
- *
- * Each active thread holds a counted reference on nn->nfsd_serv, as does
- * the nn->keep_active flag and various transient calls to svc_get().
- *
* Finally, the nfsd_mutex also protects some of the global variables that are
* accessed when nfsd starts and that are settable via the write_* routines in
* nfsctl.c. In particular:
@@ -359,13 +350,12 @@ static bool nfsd_needs_lockd(struct nfsd_net *nn)
*/
void nfsd_copy_write_verifier(__be32 verf[2], struct nfsd_net *nn)
{
- int seq = 0;
+ unsigned int seq;
do {
- read_seqbegin_or_lock(&nn->writeverf_lock, &seq);
+ seq = read_seqbegin(&nn->writeverf_lock);
memcpy(verf, nn->writeverf, sizeof(nn->writeverf));
- } while (need_seqretry(&nn->writeverf_lock, seq));
- done_seqretry(&nn->writeverf_lock, seq);
+ } while (read_seqretry(&nn->writeverf_lock, seq));
}
static void nfsd_reset_write_verifier_locked(struct nfsd_net *nn)
@@ -542,7 +532,11 @@ static struct notifier_block nfsd_inet6addr_notifier = {
/* Only used under nfsd_mutex, so this atomic may be overkill: */
static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0);
-void nfsd_last_thread(struct net *net)
+/**
+ * nfsd_destroy_serv - tear down NFSD's svc_serv for a namespace
+ * @net: network namespace the NFS service is associated with
+ */
+void nfsd_destroy_serv(struct net *net)
{
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
struct svc_serv *serv = nn->nfsd_serv;
@@ -564,7 +558,7 @@ void nfsd_last_thread(struct net *net)
/*
* write_ports can create the server without actually starting
* any threads--if we get shut down before any threads are
- * started, then nfsd_last_thread will be run before any of this
+ * started, then nfsd_destroy_serv will be run before any of this
* other initialization has been done except the rpcb information.
*/
svc_rpcb_cleanup(serv, net);
@@ -573,6 +567,7 @@ void nfsd_last_thread(struct net *net)
nfsd_shutdown_net(net);
nfsd_export_flush(net);
+ svc_destroy(&serv);
}
void nfsd_reset_versions(struct nfsd_net *nn)
@@ -647,11 +642,9 @@ void nfsd_shutdown_threads(struct net *net)
return;
}
- svc_get(serv);
/* Kill outstanding nfsd threads */
svc_set_num_threads(serv, NULL, 0);
- nfsd_last_thread(net);
- svc_put(serv);
+ nfsd_destroy_serv(net);
mutex_unlock(&nfsd_mutex);
}
@@ -667,10 +660,9 @@ int nfsd_create_serv(struct net *net)
struct svc_serv *serv;
WARN_ON(!mutex_is_locked(&nfsd_mutex));
- if (nn->nfsd_serv) {
- svc_get(nn->nfsd_serv);
+ if (nn->nfsd_serv)
return 0;
- }
+
if (nfsd_max_blksize == 0)
nfsd_max_blksize = nfsd_get_default_max_blksize();
nfsd_reset_versions(nn);
@@ -681,10 +673,11 @@ int nfsd_create_serv(struct net *net)
serv->sv_maxconn = nn->max_connections;
error = svc_bind(serv, net);
if (error < 0) {
- svc_put(serv);
+ svc_destroy(&serv);
return error;
}
spin_lock(&nfsd_notifier_lock);
+ nn->nfsd_info.mutex = &nfsd_mutex;
nn->nfsd_serv = serv;
spin_unlock(&nfsd_notifier_lock);
@@ -764,7 +757,6 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
nthreads[0] = 1;
/* apply the new numbers */
- svc_get(nn->nfsd_serv);
for (i = 0; i < n; i++) {
err = svc_set_num_threads(nn->nfsd_serv,
&nn->nfsd_serv->sv_pools[i],
@@ -772,7 +764,6 @@ int nfsd_set_nrthreads(int n, int *nthreads, struct net *net)
if (err)
break;
}
- svc_put(nn->nfsd_serv);
return err;
}
@@ -814,13 +805,8 @@ nfsd_svc(int nrservs, struct net *net, const struct cred *cred)
goto out_put;
error = serv->sv_nrthreads;
out_put:
- /* Threads now hold service active */
- if (xchg(&nn->keep_active, 0))
- svc_put(serv);
-
if (serv->sv_nrthreads == 0)
- nfsd_last_thread(net);
- svc_put(serv);
+ nfsd_destroy_serv(net);
out:
mutex_unlock(&nfsd_mutex);
return error;
@@ -1083,28 +1069,7 @@ bool nfssvc_encode_voidres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
int nfsd_pool_stats_open(struct inode *inode, struct file *file)
{
- int ret;
struct nfsd_net *nn = net_generic(inode->i_sb->s_fs_info, nfsd_net_id);
- mutex_lock(&nfsd_mutex);
- if (nn->nfsd_serv == NULL) {
- mutex_unlock(&nfsd_mutex);
- return -ENODEV;
- }
- svc_get(nn->nfsd_serv);
- ret = svc_pool_stats_open(nn->nfsd_serv, file);
- mutex_unlock(&nfsd_mutex);
- return ret;
-}
-
-int nfsd_pool_stats_release(struct inode *inode, struct file *file)
-{
- struct seq_file *seq = file->private_data;
- struct svc_serv *serv = seq->private;
- int ret = seq_release(inode, file);
-
- mutex_lock(&nfsd_mutex);
- svc_put(serv);
- mutex_unlock(&nfsd_mutex);
- return ret;
+ return svc_pool_stats_open(&nn->nfsd_info, file);
}
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index fbc0ccb40424..d1e8cf079b0f 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -1262,28 +1262,6 @@ TRACE_EVENT(nfsd_drc_mismatch,
__entry->ingress)
);
-TRACE_EVENT_CONDITION(nfsd_drc_gc,
- TP_PROTO(
- const struct nfsd_net *nn,
- unsigned long freed
- ),
- TP_ARGS(nn, freed),
- TP_CONDITION(freed > 0),
- TP_STRUCT__entry(
- __field(unsigned long long, boot_time)
- __field(unsigned long, freed)
- __field(int, total)
- ),
- TP_fast_assign(
- __entry->boot_time = nn->boot_time;
- __entry->freed = freed;
- __entry->total = atomic_read(&nn->num_drc_entries);
- ),
- TP_printk("boot_time=%16llx total=%d freed=%lu",
- __entry->boot_time, __entry->total, __entry->freed
- )
-);
-
TRACE_EVENT(nfsd_cb_args,
TP_PROTO(
const struct nfs4_client *clp,
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index e01e4e2acbd9..b7c7a9273ea0 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1039,7 +1039,10 @@ __be32 nfsd_splice_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
ssize_t host_err;
trace_nfsd_read_splice(rqstp, fhp, offset, *count);
- host_err = splice_direct_to_actor(file, &sd, nfsd_direct_splice_actor);
+ host_err = rw_verify_area(READ, file, &offset, *count);
+ if (!host_err)
+ host_err = splice_direct_to_actor(file, &sd,
+ nfsd_direct_splice_actor);
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
@@ -1176,9 +1179,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
- file_start_write(file);
host_err = vfs_iter_write(file, &iter, &pos, flags);
- file_end_write(file);
if (host_err < 0) {
commit_reset_write_verifier(nn, rqstp, host_err);
goto out_nfserr;
@@ -1210,6 +1211,30 @@ out_nfserr:
}
/**
+ * nfsd_read_splice_ok - check if spliced reading is supported
+ * @rqstp: RPC transaction context
+ *
+ * Return values:
+ * %true: nfsd_splice_read() may be used
+ * %false: nfsd_splice_read() must not be used
+ *
+ * NFS READ normally uses splice to send data in-place. However the
+ * data in cache can change after the reply's MIC is computed but
+ * before the RPC reply is sent. To prevent the client from
+ * rejecting the server-computed MIC in this somewhat rare case, do
+ * not use splice with the GSS integrity and privacy services.
+ */
+bool nfsd_read_splice_ok(struct svc_rqst *rqstp)
+{
+ switch (svc_auth_flavor(rqstp)) {
+ case RPC_AUTH_GSS_KRB5I:
+ case RPC_AUTH_GSS_KRB5P:
+ return false;
+ }
+ return true;
+}
+
+/**
* nfsd_read - Read data from a file
* @rqstp: RPC transaction context
* @fhp: file handle of file to be read
@@ -1238,7 +1263,7 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
return err;
file = nf->nf_file;
- if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
+ if (file->f_op->splice_read && nfsd_read_splice_ok(rqstp))
err = nfsd_splice_read(rqstp, fhp, file, offset, count, eof);
else
err = nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof);
@@ -1806,6 +1831,10 @@ retry:
}
trap = lock_rename(tdentry, fdentry);
+ if (IS_ERR(trap)) {
+ err = (rqstp->rq_vers == 2) ? nfserr_acces : nfserr_xdev;
+ goto out;
+ }
err = fh_fill_pre_attrs(ffhp);
if (err != nfs_ok)
goto out_unlock;
@@ -2102,9 +2131,23 @@ static __be32 nfsd_buffered_readdir(struct file *file, struct svc_fh *fhp,
return cdp->err;
}
-/*
- * Read entries from a directory.
- * The NFSv3/4 verifier we ignore for now.
+/**
+ * nfsd_readdir - Read entries from a directory
+ * @rqstp: RPC transaction context
+ * @fhp: NFS file handle of directory to be read
+ * @offsetp: OUT: seek offset of final entry that was read
+ * @cdp: OUT: an eof error value
+ * @func: entry filler actor
+ *
+ * This implementation ignores the NFSv3/4 verifier cookie.
+ *
+ * NB: normal system calls hold file->f_pos_lock when calling
+ * ->iterate_shared and ->llseek, but nfsd_readdir() does not.
+ * Because the struct file acquired here is not visible to other
+ * threads, it's internal state does not need mutex protection.
+ *
+ * Returns nfs_ok on success, otherwise an nfsstat code is
+ * returned.
*/
__be32
nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp,
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index e3c29596f4df..702fbc4483bf 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -114,6 +114,7 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct file *file, loff_t offset,
unsigned long *count, unsigned int base,
u32 *eof);
+bool nfsd_read_splice_ok(struct svc_rqst *rqstp);
__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
loff_t offset, unsigned long *count,
u32 *eof);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 80e859dc84d8..415516c1b27e 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -840,6 +840,7 @@ struct nfsd4_compoundargs {
u32 minorversion;
u32 client_opcnt;
u32 opcnt;
+ bool splice_ok;
struct nfsd4_op *ops;
struct nfsd4_op iops[8];
};
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 5710833ac1cc..0131d83b912d 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -64,8 +64,8 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
set_buffer_mapped(bh);
set_buffer_uptodate(bh);
- unlock_page(bh->b_page);
- put_page(bh->b_page);
+ folio_unlock(bh->b_folio);
+ folio_put(bh->b_folio);
return bh;
}
@@ -75,7 +75,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
{
struct buffer_head *bh;
struct inode *inode = btnc->host;
- struct page *page;
+ struct folio *folio;
int err;
bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
@@ -83,7 +83,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
return -ENOMEM;
err = -EEXIST; /* internal code */
- page = bh->b_page;
+ folio = bh->b_folio;
if (buffer_uptodate(bh) || buffer_dirty(bh))
goto found;
@@ -130,8 +130,8 @@ found:
*pbh = bh;
out_locked:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
return err;
}
@@ -145,19 +145,19 @@ out_locked:
void nilfs_btnode_delete(struct buffer_head *bh)
{
struct address_space *mapping;
- struct page *page = bh->b_page;
- pgoff_t index = page_index(page);
+ struct folio *folio = bh->b_folio;
+ pgoff_t index = folio->index;
int still_dirty;
- get_page(page);
- lock_page(page);
- wait_on_page_writeback(page);
+ folio_get(folio);
+ folio_lock(folio);
+ folio_wait_writeback(folio);
nilfs_forget_buffer(bh);
- still_dirty = PageDirty(page);
- mapping = page->mapping;
- unlock_page(page);
- put_page(page);
+ still_dirty = folio_test_dirty(folio);
+ mapping = folio->mapping;
+ folio_unlock(folio);
+ folio_put(folio);
if (!still_dirty && mapping)
invalidate_inode_pages2_range(mapping, index, index);
@@ -185,23 +185,23 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc,
ctxt->newbh = NULL;
if (inode->i_blkbits == PAGE_SHIFT) {
- struct page *opage = obh->b_page;
- lock_page(opage);
+ struct folio *ofolio = obh->b_folio;
+ folio_lock(ofolio);
retry:
/* BUG_ON(oldkey != obh->b_folio->index); */
- if (unlikely(oldkey != opage->index))
- NILFS_PAGE_BUG(opage,
+ if (unlikely(oldkey != ofolio->index))
+ NILFS_FOLIO_BUG(ofolio,
"invalid oldkey %lld (newkey=%lld)",
(unsigned long long)oldkey,
(unsigned long long)newkey);
xa_lock_irq(&btnc->i_pages);
- err = __xa_insert(&btnc->i_pages, newkey, opage, GFP_NOFS);
+ err = __xa_insert(&btnc->i_pages, newkey, ofolio, GFP_NOFS);
xa_unlock_irq(&btnc->i_pages);
/*
- * Note: page->index will not change to newkey until
+ * Note: folio->index will not change to newkey until
* nilfs_btnode_commit_change_key() will be called.
- * To protect the page in intermediate state, the page lock
+ * To protect the folio in intermediate state, the folio lock
* is held.
*/
if (!err)
@@ -213,7 +213,7 @@ retry:
if (!err)
goto retry;
/* fallback to copy mode */
- unlock_page(opage);
+ folio_unlock(ofolio);
}
nbh = nilfs_btnode_create_block(btnc, newkey);
@@ -225,7 +225,7 @@ retry:
return 0;
failed_unlock:
- unlock_page(obh->b_page);
+ folio_unlock(obh->b_folio);
return err;
}
@@ -238,15 +238,15 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
{
struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh;
__u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
- struct page *opage;
+ struct folio *ofolio;
if (oldkey == newkey)
return;
if (nbh == NULL) { /* blocksize == pagesize */
- opage = obh->b_page;
- if (unlikely(oldkey != opage->index))
- NILFS_PAGE_BUG(opage,
+ ofolio = obh->b_folio;
+ if (unlikely(oldkey != ofolio->index))
+ NILFS_FOLIO_BUG(ofolio,
"invalid oldkey %lld (newkey=%lld)",
(unsigned long long)oldkey,
(unsigned long long)newkey);
@@ -257,8 +257,8 @@ void nilfs_btnode_commit_change_key(struct address_space *btnc,
__xa_set_mark(&btnc->i_pages, newkey, PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&btnc->i_pages);
- opage->index = obh->b_blocknr = newkey;
- unlock_page(opage);
+ ofolio->index = obh->b_blocknr = newkey;
+ folio_unlock(ofolio);
} else {
nilfs_copy_buffer(nbh, obh);
mark_buffer_dirty(nbh);
@@ -284,7 +284,7 @@ void nilfs_btnode_abort_change_key(struct address_space *btnc,
if (nbh == NULL) { /* blocksize == pagesize */
xa_erase_irq(&btnc->i_pages, newkey);
- unlock_page(ctxt->bh->b_page);
+ folio_unlock(ctxt->bh->b_folio);
} else {
/*
* When canceling a buffer that a prepare operation has
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 9ebefb3acb0e..39136637f715 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -552,11 +552,29 @@ static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
}
/**
- * nilfs_cpfile_get_cpinfo -
- * @cpfile:
- * @cno:
- * @ci:
- * @nci:
+ * nilfs_cpfile_get_cpinfo - get information on checkpoints
+ * @cpfile: checkpoint file inode
+ * @cnop: place to pass a starting checkpoint number and receive a
+ * checkpoint number to continue the search
+ * @mode: mode of checkpoints that the caller wants to retrieve
+ * @buf: buffer for storing checkpoints' information
+ * @cisz: byte size of one checkpoint info item in array
+ * @nci: number of checkpoint info items to retrieve
+ *
+ * nilfs_cpfile_get_cpinfo() searches for checkpoints in @mode state
+ * starting from the checkpoint number stored in @cnop, and stores
+ * information about found checkpoints in @buf.
+ * The buffer pointed to by @buf must be large enough to store information
+ * for @nci checkpoints. If at least one checkpoint information is
+ * successfully retrieved, @cnop is updated to point to the checkpoint
+ * number to continue searching.
+ *
+ * Return: Count of checkpoint info items stored in the output buffer on
+ * success, or the following negative error code on failure.
+ * * %-EINVAL - Invalid checkpoint mode.
+ * * %-ENOMEM - Insufficient memory available.
+ * * %-EIO - I/O error (including metadata corruption).
+ * * %-ENOENT - Invalid checkpoint number specified.
*/
ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index de2073c47651..bc846b904b68 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -64,12 +64,6 @@ static inline unsigned int nilfs_chunk_size(struct inode *inode)
return inode->i_sb->s_blocksize;
}
-static inline void nilfs_put_page(struct page *page)
-{
- kunmap(page);
- put_page(page);
-}
-
/*
* Return the offset into page `page_nr' of the last valid
* byte in that page, plus one.
@@ -84,48 +78,46 @@ static unsigned int nilfs_last_byte(struct inode *inode, unsigned long page_nr)
return last_byte;
}
-static int nilfs_prepare_chunk(struct page *page, unsigned int from,
+static int nilfs_prepare_chunk(struct folio *folio, unsigned int from,
unsigned int to)
{
- loff_t pos = page_offset(page) + from;
+ loff_t pos = folio_pos(folio) + from;
- return __block_write_begin(page, pos, to - from, nilfs_get_block);
+ return __block_write_begin(&folio->page, pos, to - from, nilfs_get_block);
}
-static void nilfs_commit_chunk(struct page *page,
- struct address_space *mapping,
- unsigned int from, unsigned int to)
+static void nilfs_commit_chunk(struct folio *folio,
+ struct address_space *mapping, size_t from, size_t to)
{
struct inode *dir = mapping->host;
- loff_t pos = page_offset(page) + from;
- unsigned int len = to - from;
- unsigned int nr_dirty, copied;
+ loff_t pos = folio_pos(folio) + from;
+ size_t copied, len = to - from;
+ unsigned int nr_dirty;
int err;
- nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
- copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
+ nr_dirty = nilfs_page_count_clean_buffers(&folio->page, from, to);
+ copied = block_write_end(NULL, mapping, pos, len, len, &folio->page, NULL);
if (pos + copied > dir->i_size)
i_size_write(dir, pos + copied);
if (IS_DIRSYNC(dir))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
err = nilfs_set_file_dirty(dir, nr_dirty);
WARN_ON(err); /* do not happen */
- unlock_page(page);
+ folio_unlock(folio);
}
-static bool nilfs_check_page(struct page *page)
+static bool nilfs_check_folio(struct folio *folio, char *kaddr)
{
- struct inode *dir = page->mapping->host;
+ struct inode *dir = folio->mapping->host;
struct super_block *sb = dir->i_sb;
unsigned int chunk_size = nilfs_chunk_size(dir);
- char *kaddr = page_address(page);
- unsigned int offs, rec_len;
- unsigned int limit = PAGE_SIZE;
+ size_t offs, rec_len;
+ size_t limit = folio_size(folio);
struct nilfs_dir_entry *p;
char *error;
- if ((dir->i_size >> PAGE_SHIFT) == page->index) {
- limit = dir->i_size & ~PAGE_MASK;
+ if (dir->i_size < folio_pos(folio) + limit) {
+ limit = dir->i_size - folio_pos(folio);
if (limit & (chunk_size - 1))
goto Ebadsize;
if (!limit)
@@ -147,7 +139,7 @@ static bool nilfs_check_page(struct page *page)
if (offs != limit)
goto Eend;
out:
- SetPageChecked(page);
+ folio_set_checked(folio);
return true;
/* Too bad, we had an error */
@@ -170,8 +162,8 @@ Espan:
error = "directory entry across blocks";
bad_entry:
nilfs_error(sb,
- "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
- dir->i_ino, error, (page->index << PAGE_SHIFT) + offs,
+ "bad entry in directory #%lu: %s - offset=%lu, inode=%lu, rec_len=%zd, name_len=%d",
+ dir->i_ino, error, (folio->index << PAGE_SHIFT) + offs,
(unsigned long)le64_to_cpu(p->inode),
rec_len, p->name_len);
goto fail;
@@ -179,29 +171,34 @@ Eend:
p = (struct nilfs_dir_entry *)(kaddr + offs);
nilfs_error(sb,
"entry in directory #%lu spans the page boundary offset=%lu, inode=%lu",
- dir->i_ino, (page->index << PAGE_SHIFT) + offs,
+ dir->i_ino, (folio->index << PAGE_SHIFT) + offs,
(unsigned long)le64_to_cpu(p->inode));
fail:
- SetPageError(page);
+ folio_set_error(folio);
return false;
}
-static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
+static void *nilfs_get_folio(struct inode *dir, unsigned long n,
+ struct folio **foliop)
{
struct address_space *mapping = dir->i_mapping;
- struct page *page = read_mapping_page(mapping, n, NULL);
+ struct folio *folio = read_mapping_folio(mapping, n, NULL);
+ void *kaddr;
- if (!IS_ERR(page)) {
- kmap(page);
- if (unlikely(!PageChecked(page))) {
- if (!nilfs_check_page(page))
- goto fail;
- }
+ if (IS_ERR(folio))
+ return folio;
+
+ kaddr = kmap_local_folio(folio, 0);
+ if (unlikely(!folio_test_checked(folio))) {
+ if (!nilfs_check_folio(folio, kaddr))
+ goto fail;
}
- return page;
+
+ *foliop = folio;
+ return kaddr;
fail:
- nilfs_put_page(page);
+ folio_release_kmap(folio, kaddr);
return ERR_PTR(-EIO);
}
@@ -275,21 +272,21 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
for ( ; n < npages; n++, offset = 0) {
char *kaddr, *limit;
struct nilfs_dir_entry *de;
- struct page *page = nilfs_get_page(inode, n);
+ struct folio *folio;
- if (IS_ERR(page)) {
+ kaddr = nilfs_get_folio(inode, n, &folio);
+ if (IS_ERR(kaddr)) {
nilfs_error(sb, "bad page in #%lu", inode->i_ino);
ctx->pos += PAGE_SIZE - offset;
return -EIO;
}
- kaddr = page_address(page);
de = (struct nilfs_dir_entry *)(kaddr + offset);
limit = kaddr + nilfs_last_byte(inode, n) -
NILFS_DIR_REC_LEN(1);
for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) {
if (de->rec_len == 0) {
nilfs_error(sb, "zero-length directory entry");
- nilfs_put_page(page);
+ folio_release_kmap(folio, kaddr);
return -EIO;
}
if (de->inode) {
@@ -302,72 +299,67 @@ static int nilfs_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit(ctx, de->name, de->name_len,
le64_to_cpu(de->inode), t)) {
- nilfs_put_page(page);
+ folio_release_kmap(folio, kaddr);
return 0;
}
}
ctx->pos += nilfs_rec_len_from_disk(de->rec_len);
}
- nilfs_put_page(page);
+ folio_release_kmap(folio, kaddr);
}
return 0;
}
/*
- * nilfs_find_entry()
+ * nilfs_find_entry()
+ *
+ * Finds an entry in the specified directory with the wanted name. It
+ * returns the folio in which the entry was found, and the entry itself.
+ * The folio is mapped and unlocked. When the caller is finished with
+ * the entry, it should call folio_release_kmap().
*
- * finds an entry in the specified directory with the wanted name. It
- * returns the page in which the entry was found, and the entry itself
- * (as a parameter - res_dir). Page is returned mapped and unlocked.
- * Entry is guaranteed to be valid.
+ * On failure, returns NULL and the caller should ignore foliop.
*/
-struct nilfs_dir_entry *
-nilfs_find_entry(struct inode *dir, const struct qstr *qstr,
- struct page **res_page)
+struct nilfs_dir_entry *nilfs_find_entry(struct inode *dir,
+ const struct qstr *qstr, struct folio **foliop)
{
const unsigned char *name = qstr->name;
int namelen = qstr->len;
unsigned int reclen = NILFS_DIR_REC_LEN(namelen);
unsigned long start, n;
unsigned long npages = dir_pages(dir);
- struct page *page = NULL;
struct nilfs_inode_info *ei = NILFS_I(dir);
struct nilfs_dir_entry *de;
if (npages == 0)
goto out;
- /* OFFSET_CACHE */
- *res_page = NULL;
-
start = ei->i_dir_start_lookup;
if (start >= npages)
start = 0;
n = start;
do {
- char *kaddr;
+ char *kaddr = nilfs_get_folio(dir, n, foliop);
- page = nilfs_get_page(dir, n);
- if (!IS_ERR(page)) {
- kaddr = page_address(page);
+ if (!IS_ERR(kaddr)) {
de = (struct nilfs_dir_entry *)kaddr;
kaddr += nilfs_last_byte(dir, n) - reclen;
while ((char *) de <= kaddr) {
if (de->rec_len == 0) {
nilfs_error(dir->i_sb,
"zero-length directory entry");
- nilfs_put_page(page);
+ folio_release_kmap(*foliop, kaddr);
goto out;
}
if (nilfs_match(namelen, name, de))
goto found;
de = nilfs_next_entry(de);
}
- nilfs_put_page(page);
+ folio_release_kmap(*foliop, kaddr);
}
if (++n >= npages)
n = 0;
- /* next page is past the blocks we've got */
+ /* next folio is past the blocks we've got */
if (unlikely(n > (dir->i_blocks >> (PAGE_SHIFT - 9)))) {
nilfs_error(dir->i_sb,
"dir %lu size %lld exceeds block count %llu",
@@ -380,55 +372,47 @@ out:
return NULL;
found:
- *res_page = page;
ei->i_dir_start_lookup = n;
return de;
}
-struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
+struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct folio **foliop)
{
- struct page *page = nilfs_get_page(dir, 0);
- struct nilfs_dir_entry *de = NULL;
+ struct nilfs_dir_entry *de = nilfs_get_folio(dir, 0, foliop);
- if (!IS_ERR(page)) {
- de = nilfs_next_entry(
- (struct nilfs_dir_entry *)page_address(page));
- *p = page;
- }
- return de;
+ if (IS_ERR(de))
+ return NULL;
+ return nilfs_next_entry(de);
}
ino_t nilfs_inode_by_name(struct inode *dir, const struct qstr *qstr)
{
ino_t res = 0;
struct nilfs_dir_entry *de;
- struct page *page;
+ struct folio *folio;
- de = nilfs_find_entry(dir, qstr, &page);
+ de = nilfs_find_entry(dir, qstr, &folio);
if (de) {
res = le64_to_cpu(de->inode);
- kunmap(page);
- put_page(page);
+ folio_release_kmap(folio, de);
}
return res;
}
-/* Releases the page */
void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
- struct page *page, struct inode *inode)
+ struct folio *folio, struct inode *inode)
{
- unsigned int from = (char *)de - (char *)page_address(page);
- unsigned int to = from + nilfs_rec_len_from_disk(de->rec_len);
- struct address_space *mapping = page->mapping;
+ size_t from = offset_in_folio(folio, de);
+ size_t to = from + nilfs_rec_len_from_disk(de->rec_len);
+ struct address_space *mapping = folio->mapping;
int err;
- lock_page(page);
- err = nilfs_prepare_chunk(page, from, to);
+ folio_lock(folio);
+ err = nilfs_prepare_chunk(folio, from, to);
BUG_ON(err);
de->inode = cpu_to_le64(inode->i_ino);
nilfs_set_de_type(de, inode);
- nilfs_commit_chunk(page, mapping, from, to);
- nilfs_put_page(page);
+ nilfs_commit_chunk(folio, mapping, from, to);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
}
@@ -443,31 +427,28 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
unsigned int chunk_size = nilfs_chunk_size(dir);
unsigned int reclen = NILFS_DIR_REC_LEN(namelen);
unsigned short rec_len, name_len;
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct nilfs_dir_entry *de;
unsigned long npages = dir_pages(dir);
unsigned long n;
- char *kaddr;
- unsigned int from, to;
+ size_t from, to;
int err;
/*
* We take care of directory expansion in the same loop.
- * This code plays outside i_size, so it locks the page
+ * This code plays outside i_size, so it locks the folio
* to protect that region.
*/
for (n = 0; n <= npages; n++) {
+ char *kaddr = nilfs_get_folio(dir, n, &folio);
char *dir_end;
- page = nilfs_get_page(dir, n);
- err = PTR_ERR(page);
- if (IS_ERR(page))
- goto out;
- lock_page(page);
- kaddr = page_address(page);
+ if (IS_ERR(kaddr))
+ return PTR_ERR(kaddr);
+ folio_lock(folio);
dir_end = kaddr + nilfs_last_byte(dir, n);
de = (struct nilfs_dir_entry *)kaddr;
- kaddr += PAGE_SIZE - reclen;
+ kaddr += folio_size(folio) - reclen;
while ((char *)de <= kaddr) {
if ((char *)de == dir_end) {
/* We hit i_size */
@@ -494,16 +475,16 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode)
goto got_it;
de = (struct nilfs_dir_entry *)((char *)de + rec_len);
}
- unlock_page(page);
- nilfs_put_page(page);
+ folio_unlock(folio);
+ folio_release_kmap(folio, kaddr);
}
BUG();
return -EINVAL;
got_it:
- from = (char *)de - (char *)page_address(page);
+ from = offset_in_folio(folio, de);
to = from + rec_len;
- err = nilfs_prepare_chunk(page, from, to);
+ err = nilfs_prepare_chunk(folio, from, to);
if (err)
goto out_unlock;
if (de->inode) {
@@ -518,29 +499,28 @@ got_it:
memcpy(de->name, name, namelen);
de->inode = cpu_to_le64(inode->i_ino);
nilfs_set_de_type(de, inode);
- nilfs_commit_chunk(page, page->mapping, from, to);
+ nilfs_commit_chunk(folio, folio->mapping, from, to);
inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
nilfs_mark_inode_dirty(dir);
/* OFFSET_CACHE */
out_put:
- nilfs_put_page(page);
-out:
+ folio_release_kmap(folio, de);
return err;
out_unlock:
- unlock_page(page);
+ folio_unlock(folio);
goto out_put;
}
/*
* nilfs_delete_entry deletes a directory entry by merging it with the
- * previous entry. Page is up-to-date. Releases the page.
+ * previous entry. Folio is up-to-date.
*/
-int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
+int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct folio *folio)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
struct inode *inode = mapping->host;
- char *kaddr = page_address(page);
- unsigned int from, to;
+ char *kaddr = (char *)((unsigned long)dir & ~(folio_size(folio) - 1));
+ size_t from, to;
struct nilfs_dir_entry *de, *pde = NULL;
int err;
@@ -559,17 +539,16 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
de = nilfs_next_entry(de);
}
if (pde)
- from = (char *)pde - (char *)page_address(page);
- lock_page(page);
- err = nilfs_prepare_chunk(page, from, to);
+ from = (char *)pde - kaddr;
+ folio_lock(folio);
+ err = nilfs_prepare_chunk(folio, from, to);
BUG_ON(err);
if (pde)
pde->rec_len = nilfs_rec_len_to_disk(to - from);
dir->inode = 0;
- nilfs_commit_chunk(page, mapping, from, to);
+ nilfs_commit_chunk(folio, mapping, from, to);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
out:
- nilfs_put_page(page);
return err;
}
@@ -579,21 +558,21 @@ out:
int nilfs_make_empty(struct inode *inode, struct inode *parent)
{
struct address_space *mapping = inode->i_mapping;
- struct page *page = grab_cache_page(mapping, 0);
+ struct folio *folio = filemap_grab_folio(mapping, 0);
unsigned int chunk_size = nilfs_chunk_size(inode);
struct nilfs_dir_entry *de;
int err;
void *kaddr;
- if (!page)
- return -ENOMEM;
+ if (IS_ERR(folio))
+ return PTR_ERR(folio);
- err = nilfs_prepare_chunk(page, 0, chunk_size);
+ err = nilfs_prepare_chunk(folio, 0, chunk_size);
if (unlikely(err)) {
- unlock_page(page);
+ folio_unlock(folio);
goto fail;
}
- kaddr = kmap_atomic(page);
+ kaddr = kmap_local_folio(folio, 0);
memset(kaddr, 0, chunk_size);
de = (struct nilfs_dir_entry *)kaddr;
de->name_len = 1;
@@ -608,10 +587,10 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent)
de->inode = cpu_to_le64(parent->i_ino);
memcpy(de->name, "..\0", 4);
nilfs_set_de_type(de, inode);
- kunmap_atomic(kaddr);
- nilfs_commit_chunk(page, mapping, 0, chunk_size);
+ kunmap_local(kaddr);
+ nilfs_commit_chunk(folio, mapping, 0, chunk_size);
fail:
- put_page(page);
+ folio_put(folio);
return err;
}
@@ -620,18 +599,17 @@ fail:
*/
int nilfs_empty_dir(struct inode *inode)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
+ char *kaddr;
unsigned long i, npages = dir_pages(inode);
for (i = 0; i < npages; i++) {
- char *kaddr;
struct nilfs_dir_entry *de;
- page = nilfs_get_page(inode, i);
- if (IS_ERR(page))
+ kaddr = nilfs_get_folio(inode, i, &folio);
+ if (IS_ERR(kaddr))
continue;
- kaddr = page_address(page);
de = (struct nilfs_dir_entry *)kaddr;
kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1);
@@ -657,12 +635,12 @@ int nilfs_empty_dir(struct inode *inode)
}
de = nilfs_next_entry(de);
}
- nilfs_put_page(page);
+ folio_release_kmap(folio, kaddr);
}
return 1;
not_empty:
- nilfs_put_page(page);
+ folio_release_kmap(folio, kaddr);
return 0;
}
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 740ce26d1e76..0e3fc5ba33c7 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -45,34 +45,36 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = vmf->page;
+ struct folio *folio = page_folio(vmf->page);
struct inode *inode = file_inode(vma->vm_file);
struct nilfs_transaction_info ti;
+ struct buffer_head *bh, *head;
int ret = 0;
if (unlikely(nilfs_near_disk_full(inode->i_sb->s_fs_info)))
return VM_FAULT_SIGBUS; /* -ENOSPC */
sb_start_pagefault(inode->i_sb);
- lock_page(page);
- if (page->mapping != inode->i_mapping ||
- page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
- unlock_page(page);
+ folio_lock(folio);
+ if (folio->mapping != inode->i_mapping ||
+ folio_pos(folio) >= i_size_read(inode) ||
+ !folio_test_uptodate(folio)) {
+ folio_unlock(folio);
ret = -EFAULT; /* make the VM retry the fault */
goto out;
}
/*
- * check to see if the page is mapped already (no holes)
+ * check to see if the folio is mapped already (no holes)
*/
- if (PageMappedToDisk(page))
+ if (folio_test_mappedtodisk(folio))
goto mapped;
- if (page_has_buffers(page)) {
- struct buffer_head *bh, *head;
+ head = folio_buffers(folio);
+ if (head) {
int fully_mapped = 1;
- bh = head = page_buffers(page);
+ bh = head;
do {
if (!buffer_mapped(bh)) {
fully_mapped = 0;
@@ -81,11 +83,11 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
} while (bh = bh->b_this_page, bh != head);
if (fully_mapped) {
- SetPageMappedToDisk(page);
+ folio_set_mappedtodisk(folio);
goto mapped;
}
}
- unlock_page(page);
+ folio_unlock(folio);
/*
* fill hole blocks
@@ -105,7 +107,13 @@ static vm_fault_t nilfs_page_mkwrite(struct vm_fault *vmf)
nilfs_transaction_commit(inode->i_sb);
mapped:
- wait_for_stable_page(page);
+ /*
+ * Since checksumming including data blocks is performed to determine
+ * the validity of the log to be written and used for recovery, it is
+ * necessary to wait for writeback to finish here, regardless of the
+ * stable write requirement of the backing device.
+ */
+ folio_wait_writeback(folio);
out:
sb_end_pagefault(inode->i_sb);
return vmf_fs_error(ret);
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 8beb2730929d..bf9a11d58817 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -98,8 +98,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
*out_bh = bh;
failed:
- unlock_page(bh->b_page);
- put_page(bh->b_page);
+ folio_unlock(bh->b_folio);
+ folio_put(bh->b_folio);
if (unlikely(err))
brelse(bh);
return err;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index f861f3a0bf5c..9c334c722fc1 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -175,7 +175,8 @@ static int nilfs_writepages(struct address_space *mapping,
static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = folio->mapping->host;
int err;
if (sb_rdonly(inode->i_sb)) {
@@ -185,13 +186,13 @@ static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
* have dirty pages that try to be flushed in background.
* So, here we simply discard this dirty page.
*/
- nilfs_clear_dirty_page(page, false);
- unlock_page(page);
+ nilfs_clear_folio_dirty(folio, false);
+ folio_unlock(folio);
return -EROFS;
}
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
if (wbc->sync_mode == WB_SYNC_ALL) {
err = nilfs_construct_segment(inode->i_sb);
@@ -214,7 +215,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping,
/*
* The page may not be locked, eg if called from try_to_unmap_one()
*/
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
head = folio_buffers(folio);
if (head) {
struct buffer_head *bh = head;
@@ -230,7 +231,7 @@ static bool nilfs_dirty_folio(struct address_space *mapping,
} else if (ret) {
nr_dirty = 1 << (folio_shift(folio) - inode->i_blkbits);
}
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index 40ffade49f38..cfb6aca5ec38 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -872,16 +872,14 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
nsegs = argv[4].v_nmembs;
if (argv[4].v_size != argsz[4])
goto out;
- if (nsegs > UINT_MAX / sizeof(__u64))
- goto out;
/*
* argv[4] points to segment numbers this ioctl cleans. We
- * use kmalloc() for its buffer because memory used for the
- * segment numbers is enough small.
+ * use kmalloc() for its buffer because the memory used for the
+ * segment numbers is small enough.
*/
- kbufs[4] = memdup_user((void __user *)(unsigned long)argv[4].v_base,
- nsegs * sizeof(__u64));
+ kbufs[4] = memdup_array_user((void __user *)(unsigned long)argv[4].v_base,
+ nsegs, sizeof(__u64));
if (IS_ERR(kbufs[4])) {
ret = PTR_ERR(kbufs[4]);
goto out;
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index c97c77a39668..e45c01a559c0 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -97,8 +97,8 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
}
failed_bh:
- unlock_page(bh->b_page);
- put_page(bh->b_page);
+ folio_unlock(bh->b_folio);
+ folio_put(bh->b_folio);
brelse(bh);
failed_unlock:
@@ -158,8 +158,8 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, blk_opf_t opf,
*out_bh = bh;
failed_bh:
- unlock_page(bh->b_page);
- put_page(bh->b_page);
+ folio_unlock(bh->b_folio);
+ folio_put(bh->b_folio);
brelse(bh);
failed:
return ret;
@@ -399,7 +399,8 @@ int nilfs_mdt_fetch_dirty(struct inode *inode)
static int
nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
{
- struct inode *inode = page->mapping->host;
+ struct folio *folio = page_folio(page);
+ struct inode *inode = folio->mapping->host;
struct super_block *sb;
int err = 0;
@@ -407,16 +408,16 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
/*
* It means that filesystem was remounted in read-only
* mode because of error or metadata corruption. But we
- * have dirty pages that try to be flushed in background.
- * So, here we simply discard this dirty page.
+ * have dirty folios that try to be flushed in background.
+ * So, here we simply discard this dirty folio.
*/
- nilfs_clear_dirty_page(page, false);
- unlock_page(page);
+ nilfs_clear_folio_dirty(folio, false);
+ folio_unlock(folio);
return -EROFS;
}
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
+ folio_redirty_for_writepage(wbc, folio);
+ folio_unlock(folio);
if (!inode)
return 0;
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index 2a4e7f4a8102..c950139db6ef 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -260,11 +260,11 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
{
struct inode *inode;
struct nilfs_dir_entry *de;
- struct page *page;
+ struct folio *folio;
int err;
err = -ENOENT;
- de = nilfs_find_entry(dir, &dentry->d_name, &page);
+ de = nilfs_find_entry(dir, &dentry->d_name, &folio);
if (!de)
goto out;
@@ -279,7 +279,8 @@ static int nilfs_do_unlink(struct inode *dir, struct dentry *dentry)
inode->i_ino, inode->i_nlink);
set_nlink(inode, 1);
}
- err = nilfs_delete_entry(de, page);
+ err = nilfs_delete_entry(de, folio);
+ folio_release_kmap(folio, de);
if (err)
goto out;
@@ -347,9 +348,9 @@ static int nilfs_rename(struct mnt_idmap *idmap,
{
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
- struct page *dir_page = NULL;
+ struct folio *dir_folio = NULL;
struct nilfs_dir_entry *dir_de = NULL;
- struct page *old_page;
+ struct folio *old_folio;
struct nilfs_dir_entry *old_de;
struct nilfs_transaction_info ti;
int err;
@@ -362,19 +363,19 @@ static int nilfs_rename(struct mnt_idmap *idmap,
return err;
err = -ENOENT;
- old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_page);
+ old_de = nilfs_find_entry(old_dir, &old_dentry->d_name, &old_folio);
if (!old_de)
goto out;
if (S_ISDIR(old_inode->i_mode)) {
err = -EIO;
- dir_de = nilfs_dotdot(old_inode, &dir_page);
+ dir_de = nilfs_dotdot(old_inode, &dir_folio);
if (!dir_de)
goto out_old;
}
if (new_inode) {
- struct page *new_page;
+ struct folio *new_folio;
struct nilfs_dir_entry *new_de;
err = -ENOTEMPTY;
@@ -382,10 +383,11 @@ static int nilfs_rename(struct mnt_idmap *idmap,
goto out_dir;
err = -ENOENT;
- new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_page);
+ new_de = nilfs_find_entry(new_dir, &new_dentry->d_name, &new_folio);
if (!new_de)
goto out_dir;
- nilfs_set_link(new_dir, new_de, new_page, old_inode);
+ nilfs_set_link(new_dir, new_de, new_folio, old_inode);
+ folio_release_kmap(new_folio, new_de);
nilfs_mark_inode_dirty(new_dir);
inode_set_ctime_current(new_inode);
if (dir_de)
@@ -408,12 +410,15 @@ static int nilfs_rename(struct mnt_idmap *idmap,
*/
inode_set_ctime_current(old_inode);
- nilfs_delete_entry(old_de, old_page);
+ nilfs_delete_entry(old_de, old_folio);
if (dir_de) {
- nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
+ nilfs_set_link(old_inode, dir_de, dir_folio, new_dir);
+ folio_release_kmap(dir_folio, dir_de);
drop_nlink(old_dir);
}
+ folio_release_kmap(old_folio, old_de);
+
nilfs_mark_inode_dirty(old_dir);
nilfs_mark_inode_dirty(old_inode);
@@ -421,13 +426,10 @@ static int nilfs_rename(struct mnt_idmap *idmap,
return err;
out_dir:
- if (dir_de) {
- kunmap(dir_page);
- put_page(dir_page);
- }
+ if (dir_de)
+ folio_release_kmap(dir_folio, dir_de);
out_old:
- kunmap(old_page);
- put_page(old_page);
+ folio_release_kmap(old_folio, old_de);
out:
nilfs_transaction_abort(old_dir->i_sb);
return err;
@@ -439,7 +441,6 @@ out:
static struct dentry *nilfs_get_parent(struct dentry *child)
{
unsigned long ino;
- struct inode *inode;
struct nilfs_root *root;
ino = nilfs_inode_by_name(d_inode(child), &dotdot_name);
@@ -448,11 +449,7 @@ static struct dentry *nilfs_get_parent(struct dentry *child)
root = NILFS_I(d_inode(child))->i_root;
- inode = nilfs_iget(child->d_sb, root, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
-
- return d_obtain_alias(inode);
+ return d_obtain_alias(nilfs_iget(child->d_sb, root, ino));
}
static struct dentry *nilfs_get_dentry(struct super_block *sb, u64 cno,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 8046490cd7fe..98cffaf0ac12 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -226,16 +226,16 @@ static inline __u32 nilfs_mask_flags(umode_t mode, __u32 flags)
}
/* dir.c */
-extern int nilfs_add_link(struct dentry *, struct inode *);
-extern ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
-extern int nilfs_make_empty(struct inode *, struct inode *);
-extern struct nilfs_dir_entry *
-nilfs_find_entry(struct inode *, const struct qstr *, struct page **);
-extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
-extern int nilfs_empty_dir(struct inode *);
-extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
-extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
- struct page *, struct inode *);
+int nilfs_add_link(struct dentry *, struct inode *);
+ino_t nilfs_inode_by_name(struct inode *, const struct qstr *);
+int nilfs_make_empty(struct inode *, struct inode *);
+struct nilfs_dir_entry *nilfs_find_entry(struct inode *, const struct qstr *,
+ struct folio **);
+int nilfs_delete_entry(struct nilfs_dir_entry *, struct folio *);
+int nilfs_empty_dir(struct inode *);
+struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct folio **);
+void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
+ struct folio *, struct inode *);
/* file.c */
extern int nilfs_sync_file(struct file *, loff_t, loff_t, int);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 06b04758f289..5c2eba1987bd 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -73,7 +73,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode,
*/
void nilfs_forget_buffer(struct buffer_head *bh)
{
- struct page *page = bh->b_page;
+ struct folio *folio = bh->b_folio;
const unsigned long clear_bits =
(BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
@@ -81,12 +81,12 @@ void nilfs_forget_buffer(struct buffer_head *bh)
lock_buffer(bh);
set_mask_bits(&bh->b_state, clear_bits, 0);
- if (nilfs_page_buffers_clean(page))
- __nilfs_clear_page_dirty(page);
+ if (nilfs_folio_buffers_clean(folio))
+ __nilfs_clear_folio_dirty(folio);
bh->b_blocknr = -1;
- ClearPageUptodate(page);
- ClearPageMappedToDisk(page);
+ folio_clear_uptodate(folio);
+ folio_clear_mappedtodisk(folio);
unlock_buffer(bh);
brelse(bh);
}
@@ -131,48 +131,49 @@ void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
}
/**
- * nilfs_page_buffers_clean - check if a page has dirty buffers or not.
- * @page: page to be checked
+ * nilfs_folio_buffers_clean - Check if a folio has dirty buffers or not.
+ * @folio: Folio to be checked.
*
- * nilfs_page_buffers_clean() returns zero if the page has dirty buffers.
- * Otherwise, it returns non-zero value.
+ * nilfs_folio_buffers_clean() returns false if the folio has dirty buffers.
+ * Otherwise, it returns true.
*/
-int nilfs_page_buffers_clean(struct page *page)
+bool nilfs_folio_buffers_clean(struct folio *folio)
{
struct buffer_head *bh, *head;
- bh = head = page_buffers(page);
+ bh = head = folio_buffers(folio);
do {
if (buffer_dirty(bh))
- return 0;
+ return false;
bh = bh->b_this_page;
} while (bh != head);
- return 1;
+ return true;
}
-void nilfs_page_bug(struct page *page)
+void nilfs_folio_bug(struct folio *folio)
{
+ struct buffer_head *bh, *head;
struct address_space *m;
unsigned long ino;
- if (unlikely(!page)) {
- printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
+ if (unlikely(!folio)) {
+ printk(KERN_CRIT "NILFS_FOLIO_BUG(NULL)\n");
return;
}
- m = page->mapping;
+ m = folio->mapping;
ino = m ? m->host->i_ino : 0;
- printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
+ printk(KERN_CRIT "NILFS_FOLIO_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
"mapping=%p ino=%lu\n",
- page, page_ref_count(page),
- (unsigned long long)page->index, page->flags, m, ino);
+ folio, folio_ref_count(folio),
+ (unsigned long long)folio->index, folio->flags, m, ino);
- if (page_has_buffers(page)) {
- struct buffer_head *bh, *head;
+ head = folio_buffers(folio);
+ if (head) {
int i = 0;
- bh = head = page_buffers(page);
+ bh = head;
do {
printk(KERN_CRIT
" BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n",
@@ -258,7 +259,7 @@ repeat:
folio_lock(folio);
if (unlikely(!folio_test_dirty(folio)))
- NILFS_PAGE_BUG(&folio->page, "inconsistent dirty state");
+ NILFS_FOLIO_BUG(folio, "inconsistent dirty state");
dfolio = filemap_grab_folio(dmap, folio->index);
if (unlikely(IS_ERR(dfolio))) {
@@ -268,7 +269,7 @@ repeat:
break;
}
if (unlikely(!folio_buffers(folio)))
- NILFS_PAGE_BUG(&folio->page,
+ NILFS_FOLIO_BUG(folio,
"found empty page in dat page cache");
nilfs_copy_folio(dfolio, folio, true);
@@ -379,7 +380,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
* was acquired. Skip processing in that case.
*/
if (likely(folio->mapping == mapping))
- nilfs_clear_dirty_page(&folio->page, silent);
+ nilfs_clear_folio_dirty(folio, silent);
folio_unlock(folio);
}
@@ -389,32 +390,33 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent)
}
/**
- * nilfs_clear_dirty_page - discard dirty page
- * @page: dirty page that will be discarded
+ * nilfs_clear_folio_dirty - discard dirty folio
+ * @folio: dirty folio that will be discarded
* @silent: suppress [true] or print [false] warning messages
*/
-void nilfs_clear_dirty_page(struct page *page, bool silent)
+void nilfs_clear_folio_dirty(struct folio *folio, bool silent)
{
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
struct super_block *sb = inode->i_sb;
+ struct buffer_head *bh, *head;
- BUG_ON(!PageLocked(page));
+ BUG_ON(!folio_test_locked(folio));
if (!silent)
nilfs_warn(sb, "discard dirty page: offset=%lld, ino=%lu",
- page_offset(page), inode->i_ino);
+ folio_pos(folio), inode->i_ino);
- ClearPageUptodate(page);
- ClearPageMappedToDisk(page);
+ folio_clear_uptodate(folio);
+ folio_clear_mappedtodisk(folio);
- if (page_has_buffers(page)) {
- struct buffer_head *bh, *head;
+ head = folio_buffers(folio);
+ if (head) {
const unsigned long clear_bits =
(BIT(BH_Uptodate) | BIT(BH_Dirty) | BIT(BH_Mapped) |
BIT(BH_Async_Write) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Checked) | BIT(BH_NILFS_Redirected));
- bh = head = page_buffers(page);
+ bh = head;
do {
lock_buffer(bh);
if (!silent)
@@ -427,7 +429,7 @@ void nilfs_clear_dirty_page(struct page *page, bool silent)
} while (bh = bh->b_this_page, bh != head);
}
- __nilfs_clear_page_dirty(page);
+ __nilfs_clear_folio_dirty(folio);
}
unsigned int nilfs_page_count_clean_buffers(struct page *page,
@@ -457,22 +459,23 @@ unsigned int nilfs_page_count_clean_buffers(struct page *page,
* 2) Some B-tree operations like insertion or deletion may dispose buffers
* in dirty state, and this needs to cancel the dirty state of their pages.
*/
-int __nilfs_clear_page_dirty(struct page *page)
+void __nilfs_clear_folio_dirty(struct folio *folio)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
if (mapping) {
xa_lock_irq(&mapping->i_pages);
- if (test_bit(PG_dirty, &page->flags)) {
- __xa_clear_mark(&mapping->i_pages, page_index(page),
+ if (folio_test_dirty(folio)) {
+ __xa_clear_mark(&mapping->i_pages, folio->index,
PAGECACHE_TAG_DIRTY);
xa_unlock_irq(&mapping->i_pages);
- return clear_page_dirty_for_io(page);
+ folio_clear_dirty_for_io(folio);
+ return;
}
xa_unlock_irq(&mapping->i_pages);
- return 0;
+ return;
}
- return TestClearPageDirty(page);
+ folio_clear_dirty(folio);
}
/**
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index d249ea1cefff..7e1a2c455a10 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -30,18 +30,18 @@ BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */
BUFFER_FNS(NILFS_Redirected, nilfs_redirected) /* redirected to a copy */
-int __nilfs_clear_page_dirty(struct page *);
+void __nilfs_clear_folio_dirty(struct folio *);
struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *,
unsigned long, unsigned long);
void nilfs_forget_buffer(struct buffer_head *);
void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *);
-int nilfs_page_buffers_clean(struct page *);
-void nilfs_page_bug(struct page *);
+bool nilfs_folio_buffers_clean(struct folio *);
+void nilfs_folio_bug(struct folio *);
int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
void nilfs_copy_back_pages(struct address_space *, struct address_space *);
-void nilfs_clear_dirty_page(struct page *, bool);
+void nilfs_clear_folio_dirty(struct folio *, bool);
void nilfs_clear_dirty_pages(struct address_space *, bool);
unsigned int nilfs_page_count_clean_buffers(struct page *, unsigned int,
unsigned int);
@@ -49,7 +49,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
sector_t start_blk,
sector_t *blkoff);
-#define NILFS_PAGE_BUG(page, m, a...) \
- do { nilfs_page_bug(page); BUG(); } while (0)
+#define NILFS_FOLIO_BUG(folio, m, a...) \
+ do { nilfs_folio_bug(folio); BUG(); } while (0)
#endif /* _NILFS_PAGE_H */
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
index 0955b657938f..a9b8d77c8c1d 100644
--- a/fs/nilfs2/recovery.c
+++ b/fs/nilfs2/recovery.c
@@ -472,9 +472,10 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
struct nilfs_recovery_block *rb,
- struct page *page)
+ loff_t pos, struct page *page)
{
struct buffer_head *bh_org;
+ size_t from = pos & ~PAGE_MASK;
void *kaddr;
bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize);
@@ -482,7 +483,7 @@ static int nilfs_recovery_copy_block(struct the_nilfs *nilfs,
return -EIO;
kaddr = kmap_atomic(page);
- memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size);
+ memcpy(kaddr + from, bh_org->b_data, bh_org->b_size);
kunmap_atomic(kaddr);
brelse(bh_org);
return 0;
@@ -521,7 +522,7 @@ static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs,
goto failed_inode;
}
- err = nilfs_recovery_copy_block(nilfs, rb, page);
+ err = nilfs_recovery_copy_block(nilfs, rb, pos, page);
if (unlikely(err))
goto failed_page;
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 55e31cc903d1..2bfb08052d39 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -1665,68 +1665,68 @@ static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
return 0;
}
-static void nilfs_begin_page_io(struct page *page)
+static void nilfs_begin_folio_io(struct folio *folio)
{
- if (!page || PageWriteback(page))
+ if (!folio || folio_test_writeback(folio))
/*
* For split b-tree node pages, this function may be called
* twice. We ignore the 2nd or later calls by this check.
*/
return;
- lock_page(page);
- clear_page_dirty_for_io(page);
- set_page_writeback(page);
- unlock_page(page);
+ folio_lock(folio);
+ folio_clear_dirty_for_io(folio);
+ folio_start_writeback(folio);
+ folio_unlock(folio);
}
static void nilfs_segctor_prepare_write(struct nilfs_sc_info *sci)
{
struct nilfs_segment_buffer *segbuf;
- struct page *bd_page = NULL, *fs_page = NULL;
+ struct folio *bd_folio = NULL, *fs_folio = NULL;
list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
struct buffer_head *bh;
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
- if (bh->b_page != bd_page) {
- if (bd_page) {
- lock_page(bd_page);
- clear_page_dirty_for_io(bd_page);
- set_page_writeback(bd_page);
- unlock_page(bd_page);
+ if (bh->b_folio != bd_folio) {
+ if (bd_folio) {
+ folio_lock(bd_folio);
+ folio_clear_dirty_for_io(bd_folio);
+ folio_start_writeback(bd_folio);
+ folio_unlock(bd_folio);
}
- bd_page = bh->b_page;
+ bd_folio = bh->b_folio;
}
}
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- set_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
- if (bh->b_page != bd_page) {
- lock_page(bd_page);
- clear_page_dirty_for_io(bd_page);
- set_page_writeback(bd_page);
- unlock_page(bd_page);
- bd_page = bh->b_page;
+ if (bh->b_folio != bd_folio) {
+ folio_lock(bd_folio);
+ folio_clear_dirty_for_io(bd_folio);
+ folio_start_writeback(bd_folio);
+ folio_unlock(bd_folio);
+ bd_folio = bh->b_folio;
}
break;
}
- if (bh->b_page != fs_page) {
- nilfs_begin_page_io(fs_page);
- fs_page = bh->b_page;
+ set_buffer_async_write(bh);
+ if (bh->b_folio != fs_folio) {
+ nilfs_begin_folio_io(fs_folio);
+ fs_folio = bh->b_folio;
}
}
}
- if (bd_page) {
- lock_page(bd_page);
- clear_page_dirty_for_io(bd_page);
- set_page_writeback(bd_page);
- unlock_page(bd_page);
+ if (bd_folio) {
+ folio_lock(bd_folio);
+ folio_clear_dirty_for_io(bd_folio);
+ folio_start_writeback(bd_folio);
+ folio_unlock(bd_folio);
}
- nilfs_begin_page_io(fs_page);
+ nilfs_begin_folio_io(fs_folio);
}
static int nilfs_segctor_write(struct nilfs_sc_info *sci,
@@ -1739,17 +1739,18 @@ static int nilfs_segctor_write(struct nilfs_sc_info *sci,
return ret;
}
-static void nilfs_end_page_io(struct page *page, int err)
+static void nilfs_end_folio_io(struct folio *folio, int err)
{
- if (!page)
+ if (!folio)
return;
- if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) {
+ if (buffer_nilfs_node(folio_buffers(folio)) &&
+ !folio_test_writeback(folio)) {
/*
* For b-tree node pages, this function may be called twice
* or more because they might be split in a segment.
*/
- if (PageDirty(page)) {
+ if (folio_test_dirty(folio)) {
/*
* For pages holding split b-tree node buffers, dirty
* flag on the buffers may be cleared discretely.
@@ -1757,30 +1758,30 @@ static void nilfs_end_page_io(struct page *page, int err)
* remaining buffers, and it must be cancelled if
* all the buffers get cleaned later.
*/
- lock_page(page);
- if (nilfs_page_buffers_clean(page))
- __nilfs_clear_page_dirty(page);
- unlock_page(page);
+ folio_lock(folio);
+ if (nilfs_folio_buffers_clean(folio))
+ __nilfs_clear_folio_dirty(folio);
+ folio_unlock(folio);
}
return;
}
if (!err) {
- if (!nilfs_page_buffers_clean(page))
- __set_page_dirty_nobuffers(page);
- ClearPageError(page);
+ if (!nilfs_folio_buffers_clean(folio))
+ filemap_dirty_folio(folio->mapping, folio);
+ folio_clear_error(folio);
} else {
- __set_page_dirty_nobuffers(page);
- SetPageError(page);
+ filemap_dirty_folio(folio->mapping, folio);
+ folio_set_error(folio);
}
- end_page_writeback(page);
+ folio_end_writeback(folio);
}
static void nilfs_abort_logs(struct list_head *logs, int err)
{
struct nilfs_segment_buffer *segbuf;
- struct page *bd_page = NULL, *fs_page = NULL;
+ struct folio *bd_folio = NULL, *fs_folio = NULL;
struct buffer_head *bh;
if (list_empty(logs))
@@ -1790,34 +1791,34 @@ static void nilfs_abort_logs(struct list_head *logs, int err)
list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
b_assoc_buffers) {
clear_buffer_uptodate(bh);
- if (bh->b_page != bd_page) {
- if (bd_page)
- end_page_writeback(bd_page);
- bd_page = bh->b_page;
+ if (bh->b_folio != bd_folio) {
+ if (bd_folio)
+ folio_end_writeback(bd_folio);
+ bd_folio = bh->b_folio;
}
}
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
b_assoc_buffers) {
- clear_buffer_async_write(bh);
if (bh == segbuf->sb_super_root) {
clear_buffer_uptodate(bh);
- if (bh->b_page != bd_page) {
- end_page_writeback(bd_page);
- bd_page = bh->b_page;
+ if (bh->b_folio != bd_folio) {
+ folio_end_writeback(bd_folio);
+ bd_folio = bh->b_folio;
}
break;
}
- if (bh->b_page != fs_page) {
- nilfs_end_page_io(fs_page, err);
- fs_page = bh->b_page;
+ clear_buffer_async_write(bh);
+ if (bh->b_folio != fs_folio) {
+ nilfs_end_folio_io(fs_folio, err);
+ fs_folio = bh->b_folio;
}
}
}
- if (bd_page)
- end_page_writeback(bd_page);
+ if (bd_folio)
+ folio_end_writeback(bd_folio);
- nilfs_end_page_io(fs_page, err);
+ nilfs_end_folio_io(fs_folio, err);
}
static void nilfs_segctor_abort_construction(struct nilfs_sc_info *sci,
@@ -1859,7 +1860,7 @@ static void nilfs_set_next_segment(struct the_nilfs *nilfs,
static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
{
struct nilfs_segment_buffer *segbuf;
- struct page *bd_page = NULL, *fs_page = NULL;
+ struct folio *bd_folio = NULL, *fs_folio = NULL;
struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
int update_sr = false;
@@ -1870,21 +1871,21 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
b_assoc_buffers) {
set_buffer_uptodate(bh);
clear_buffer_dirty(bh);
- if (bh->b_page != bd_page) {
- if (bd_page)
- end_page_writeback(bd_page);
- bd_page = bh->b_page;
+ if (bh->b_folio != bd_folio) {
+ if (bd_folio)
+ folio_end_writeback(bd_folio);
+ bd_folio = bh->b_folio;
}
}
/*
- * We assume that the buffers which belong to the same page
+ * We assume that the buffers which belong to the same folio
* continue over the buffer list.
- * Under this assumption, the last BHs of pages is
- * identifiable by the discontinuity of bh->b_page
- * (page != fs_page).
+ * Under this assumption, the last BHs of folios is
+ * identifiable by the discontinuity of bh->b_folio
+ * (folio != fs_folio).
*
* For B-tree node blocks, however, this assumption is not
- * guaranteed. The cleanup code of B-tree node pages needs
+ * guaranteed. The cleanup code of B-tree node folios needs
* special care.
*/
list_for_each_entry(bh, &segbuf->sb_payload_buffers,
@@ -1895,18 +1896,20 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
BIT(BH_NILFS_Redirected));
- set_mask_bits(&bh->b_state, clear_bits, set_bits);
if (bh == segbuf->sb_super_root) {
- if (bh->b_page != bd_page) {
- end_page_writeback(bd_page);
- bd_page = bh->b_page;
+ set_buffer_uptodate(bh);
+ clear_buffer_dirty(bh);
+ if (bh->b_folio != bd_folio) {
+ folio_end_writeback(bd_folio);
+ bd_folio = bh->b_folio;
}
update_sr = true;
break;
}
- if (bh->b_page != fs_page) {
- nilfs_end_page_io(fs_page, 0);
- fs_page = bh->b_page;
+ set_mask_bits(&bh->b_state, clear_bits, set_bits);
+ if (bh->b_folio != fs_folio) {
+ nilfs_end_folio_io(fs_folio, 0);
+ fs_folio = bh->b_folio;
}
}
@@ -1920,13 +1923,13 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
}
}
/*
- * Since pages may continue over multiple segment buffers,
- * end of the last page must be checked outside of the loop.
+ * Since folios may continue over multiple segment buffers,
+ * end of the last folio must be checked outside of the loop.
*/
- if (bd_page)
- end_page_writeback(bd_page);
+ if (bd_folio)
+ folio_end_writeback(bd_folio);
- nilfs_end_page_io(fs_page, 0);
+ nilfs_end_folio_io(fs_folio, 0);
nilfs_drop_collected_inodes(&sci->sc_dirty_files);
@@ -2587,6 +2590,7 @@ static int nilfs_segctor_thread(void *arg)
"segctord starting. Construction interval = %lu seconds, CP frequency < %lu seconds",
sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
+ set_freezable();
spin_lock(&sci->sc_state_lock);
loop:
for (;;) {
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 58ca7c936393..0a8119456c21 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -471,10 +471,15 @@ void nilfs_sufile_do_free(struct inode *sufile, __u64 segnum,
kunmap_atomic(kaddr);
return;
}
- WARN_ON(nilfs_segment_usage_error(su));
- WARN_ON(!nilfs_segment_usage_dirty(su));
+ if (unlikely(nilfs_segment_usage_error(su)))
+ nilfs_warn(sufile->i_sb, "free segment %llu marked in error",
+ (unsigned long long)segnum);
sudirty = nilfs_segment_usage_dirty(su);
+ if (unlikely(!sudirty))
+ nilfs_warn(sufile->i_sb, "free unallocated segment %llu",
+ (unsigned long long)segnum);
+
nilfs_segment_usage_set_clean(su);
kunmap_atomic(kaddr);
mark_buffer_dirty(su_bh);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index a5d1fa4e7552..df8674173b22 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1314,15 +1314,7 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
return ERR_CAST(s);
if (!s->s_root) {
- /*
- * We drop s_umount here because we need to open the bdev and
- * bdev->open_mutex ranks above s_umount (blkdev_put() ->
- * __invalidate_device()). It is safe because we have active sb
- * reference and SB_BORN is not set yet.
- */
- up_write(&s->s_umount);
err = setup_bdev_super(s, flags, NULL);
- down_write(&s->s_umount);
if (!err)
err = nilfs_fill_super(s, data,
flags & SB_SILENT ? 1 : 0);
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 1cb9ad7e884e..3464fa7e8538 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -29,7 +29,6 @@ static struct ctl_table dnotify_sysctls[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {}
};
static void __init dnotify_sysctl_init(void)
{
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 9dac7f6e72d2..1e4def21811e 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -29,12 +29,6 @@ static unsigned int fanotify_hash_path(const struct path *path)
hash_ptr(path->mnt, FANOTIFY_EVENT_HASH_BITS);
}
-static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1,
- __kernel_fsid_t *fsid2)
-{
- return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1];
-}
-
static unsigned int fanotify_hash_fsid(__kernel_fsid_t *fsid)
{
return hash_32(fsid->val[0], FANOTIFY_EVENT_HASH_BITS) ^
@@ -838,9 +832,8 @@ out:
}
/*
- * Get cached fsid of the filesystem containing the object from any connector.
- * All connectors are supposed to have the same fsid, but we do not verify that
- * here.
+ * Get cached fsid of the filesystem containing the object from any mark.
+ * All marks are supposed to have the same fsid, but we do not verify that here.
*/
static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
{
@@ -849,18 +842,11 @@ static __kernel_fsid_t fanotify_get_fsid(struct fsnotify_iter_info *iter_info)
__kernel_fsid_t fsid = {};
fsnotify_foreach_iter_mark_type(iter_info, mark, type) {
- struct fsnotify_mark_connector *conn;
-
- conn = READ_ONCE(mark->connector);
- /* Mark is just getting destroyed or created? */
- if (!conn)
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_HAS_FSID))
continue;
- if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID))
- continue;
- /* Pairs with smp_wmb() in fsnotify_add_mark_list() */
- smp_rmb();
- fsid = conn->fsid;
- if (WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
+ fsid = FANOTIFY_MARK(mark)->fsid;
+ if (!(mark->flags & FSNOTIFY_MARK_FLAG_WEAK_FSID) &&
+ WARN_ON_ONCE(!fsid.val[0] && !fsid.val[1]))
continue;
return fsid;
}
@@ -942,12 +928,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask,
return 0;
}
- if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS)) {
+ if (FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS))
fsid = fanotify_get_fsid(iter_info);
- /* Racing with mark destruction or creation? */
- if (!fsid.val[0] && !fsid.val[1])
- return 0;
- }
event = fanotify_alloc_event(group, mask, data, data_type, dir,
file_name, &fsid, match_mask);
@@ -1068,7 +1050,7 @@ static void fanotify_freeing_mark(struct fsnotify_mark *mark,
static void fanotify_free_mark(struct fsnotify_mark *fsn_mark)
{
- kmem_cache_free(fanotify_mark_cache, fsn_mark);
+ kmem_cache_free(fanotify_mark_cache, FANOTIFY_MARK(fsn_mark));
}
const struct fsnotify_ops fanotify_fsnotify_ops = {
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index 6936671e148d..e5ab33cae6a7 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -489,6 +489,22 @@ static inline unsigned int fanotify_event_hash_bucket(
return event->hash & FANOTIFY_HTABLE_MASK;
}
+struct fanotify_mark {
+ struct fsnotify_mark fsn_mark;
+ __kernel_fsid_t fsid;
+};
+
+static inline struct fanotify_mark *FANOTIFY_MARK(struct fsnotify_mark *mark)
+{
+ return container_of(mark, struct fanotify_mark, fsn_mark);
+}
+
+static inline bool fanotify_fsid_equal(__kernel_fsid_t *fsid1,
+ __kernel_fsid_t *fsid2)
+{
+ return fsid1->val[0] == fsid2->val[0] && fsid1->val[1] == fsid2->val[1];
+}
+
static inline unsigned int fanotify_mark_user_flags(struct fsnotify_mark *mark)
{
unsigned int mflags = 0;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 4d765c72496f..fbdc63cc10d9 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -23,7 +23,7 @@
#include <asm/ioctls.h>
-#include "../../mount.h"
+#include "../fsnotify.h"
#include "../fdinfo.h"
#include "fanotify.h"
@@ -86,7 +86,6 @@ static struct ctl_table fanotify_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO
},
- { }
};
static void __init fanotify_sysctls_init(void)
@@ -1192,13 +1191,71 @@ static bool fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
return recalc;
}
+struct fan_fsid {
+ struct super_block *sb;
+ __kernel_fsid_t id;
+ bool weak;
+};
+
+static int fanotify_set_mark_fsid(struct fsnotify_group *group,
+ struct fsnotify_mark *mark,
+ struct fan_fsid *fsid)
+{
+ struct fsnotify_mark_connector *conn;
+ struct fsnotify_mark *old;
+ struct super_block *old_sb = NULL;
+
+ FANOTIFY_MARK(mark)->fsid = fsid->id;
+ mark->flags |= FSNOTIFY_MARK_FLAG_HAS_FSID;
+ if (fsid->weak)
+ mark->flags |= FSNOTIFY_MARK_FLAG_WEAK_FSID;
+
+ /* First mark added will determine if group is single or multi fsid */
+ if (list_empty(&group->marks_list))
+ return 0;
+
+ /* Find sb of an existing mark */
+ list_for_each_entry(old, &group->marks_list, g_list) {
+ conn = READ_ONCE(old->connector);
+ if (!conn)
+ continue;
+ old_sb = fsnotify_connector_sb(conn);
+ if (old_sb)
+ break;
+ }
+
+ /* Only detached marks left? */
+ if (!old_sb)
+ return 0;
+
+ /* Do not allow mixing of marks with weak and strong fsid */
+ if ((mark->flags ^ old->flags) & FSNOTIFY_MARK_FLAG_WEAK_FSID)
+ return -EXDEV;
+
+ /* Allow mixing of marks with strong fsid from different fs */
+ if (!fsid->weak)
+ return 0;
+
+ /* Do not allow mixing marks with weak fsid from different fs */
+ if (old_sb != fsid->sb)
+ return -EXDEV;
+
+ /* Do not allow mixing marks from different btrfs sub-volumes */
+ if (!fanotify_fsid_equal(&FANOTIFY_MARK(old)->fsid,
+ &FANOTIFY_MARK(mark)->fsid))
+ return -EXDEV;
+
+ return 0;
+}
+
static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp,
unsigned int obj_type,
unsigned int fan_flags,
- __kernel_fsid_t *fsid)
+ struct fan_fsid *fsid)
{
struct ucounts *ucounts = group->fanotify_data.ucounts;
+ struct fanotify_mark *fan_mark;
struct fsnotify_mark *mark;
int ret;
@@ -1211,24 +1268,34 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
!inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
return ERR_PTR(-ENOSPC);
- mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
- if (!mark) {
+ fan_mark = kmem_cache_alloc(fanotify_mark_cache, GFP_KERNEL);
+ if (!fan_mark) {
ret = -ENOMEM;
goto out_dec_ucounts;
}
+ mark = &fan_mark->fsn_mark;
fsnotify_init_mark(mark, group);
if (fan_flags & FAN_MARK_EVICTABLE)
mark->flags |= FSNOTIFY_MARK_FLAG_NO_IREF;
- ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0, fsid);
- if (ret) {
- fsnotify_put_mark(mark);
- goto out_dec_ucounts;
+ /* Cache fsid of filesystem containing the marked object */
+ if (fsid) {
+ ret = fanotify_set_mark_fsid(group, mark, fsid);
+ if (ret)
+ goto out_put_mark;
+ } else {
+ fan_mark->fsid.val[0] = fan_mark->fsid.val[1] = 0;
}
+ ret = fsnotify_add_mark_locked(mark, connp, obj_type, 0);
+ if (ret)
+ goto out_put_mark;
+
return mark;
+out_put_mark:
+ fsnotify_put_mark(mark);
out_dec_ucounts:
if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS))
dec_ucount(ucounts, UCOUNT_FANOTIFY_MARKS);
@@ -1279,7 +1346,7 @@ static int fanotify_may_update_existing_mark(struct fsnotify_mark *fsn_mark,
static int fanotify_add_mark(struct fsnotify_group *group,
fsnotify_connp_t *connp, unsigned int obj_type,
__u32 mask, unsigned int fan_flags,
- __kernel_fsid_t *fsid)
+ struct fan_fsid *fsid)
{
struct fsnotify_mark *fsn_mark;
bool recalc;
@@ -1327,7 +1394,7 @@ out:
static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
struct vfsmount *mnt, __u32 mask,
- unsigned int flags, __kernel_fsid_t *fsid)
+ unsigned int flags, struct fan_fsid *fsid)
{
return fanotify_add_mark(group, &real_mount(mnt)->mnt_fsnotify_marks,
FSNOTIFY_OBJ_TYPE_VFSMOUNT, mask, flags, fsid);
@@ -1335,7 +1402,7 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group,
static int fanotify_add_sb_mark(struct fsnotify_group *group,
struct super_block *sb, __u32 mask,
- unsigned int flags, __kernel_fsid_t *fsid)
+ unsigned int flags, struct fan_fsid *fsid)
{
return fanotify_add_mark(group, &sb->s_fsnotify_marks,
FSNOTIFY_OBJ_TYPE_SB, mask, flags, fsid);
@@ -1343,7 +1410,7 @@ static int fanotify_add_sb_mark(struct fsnotify_group *group,
static int fanotify_add_inode_mark(struct fsnotify_group *group,
struct inode *inode, __u32 mask,
- unsigned int flags, __kernel_fsid_t *fsid)
+ unsigned int flags, struct fan_fsid *fsid)
{
pr_debug("%s: group=%p inode=%p\n", __func__, group, inode);
@@ -1554,20 +1621,25 @@ out_destroy_group:
return fd;
}
-static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
+static int fanotify_test_fsid(struct dentry *dentry, unsigned int flags,
+ struct fan_fsid *fsid)
{
+ unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
__kernel_fsid_t root_fsid;
int err;
/*
* Make sure dentry is not of a filesystem with zero fsid (e.g. fuse).
*/
- err = vfs_get_fsid(dentry, fsid);
+ err = vfs_get_fsid(dentry, &fsid->id);
if (err)
return err;
- if (!fsid->val[0] && !fsid->val[1])
- return -ENODEV;
+ fsid->sb = dentry->d_sb;
+ if (!fsid->id.val[0] && !fsid->id.val[1]) {
+ err = -ENODEV;
+ goto weak;
+ }
/*
* Make sure dentry is not of a filesystem subvolume (e.g. btrfs)
@@ -1577,11 +1649,18 @@ static int fanotify_test_fsid(struct dentry *dentry, __kernel_fsid_t *fsid)
if (err)
return err;
- if (root_fsid.val[0] != fsid->val[0] ||
- root_fsid.val[1] != fsid->val[1])
- return -EXDEV;
+ if (!fanotify_fsid_equal(&root_fsid, &fsid->id)) {
+ err = -EXDEV;
+ goto weak;
+ }
+ fsid->weak = false;
return 0;
+
+weak:
+ /* Allow weak fsid when marking inodes */
+ fsid->weak = true;
+ return (mark_type == FAN_MARK_INODE) ? 0 : err;
}
/* Check if filesystem can encode a unique fid */
@@ -1665,7 +1744,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
struct fsnotify_group *group;
struct fd f;
struct path path;
- __kernel_fsid_t __fsid, *fsid = NULL;
+ struct fan_fsid __fsid, *fsid = NULL;
u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
@@ -1817,7 +1896,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
if (fid_mode) {
- ret = fanotify_test_fsid(path.dentry, &__fsid);
+ ret = fanotify_test_fsid(path.dentry, flags, &__fsid);
if (ret)
goto path_put_and_out;
@@ -1935,7 +2014,7 @@ static int __init fanotify_user_setup(void)
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 12);
BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11);
- fanotify_mark_cache = KMEM_CACHE(fsnotify_mark,
+ fanotify_mark_cache = KMEM_CACHE(fanotify_mark,
SLAB_PANIC|SLAB_ACCOUNT);
fanotify_fid_event_cachep = KMEM_CACHE(fanotify_fid_event,
SLAB_PANIC);
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 7974e91ffe13..8bfd690e9f10 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -124,7 +124,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
* d_flags to indicate parental interest (their parent is the
* original inode) */
spin_lock(&alias->d_lock);
- list_for_each_entry(child, &alias->d_subdirs, d_child) {
+ hlist_for_each_entry(child, &alias->d_children, d_sib) {
if (!child->d_inode)
continue;
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index a3809ae92170..85d8fdd55329 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -85,7 +85,6 @@ static struct ctl_table inotify_table[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO
},
- { }
};
static void __init inotify_sysctls_init(void)
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index c74ef947447d..d6944ff86ffa 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -537,8 +537,7 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b)
}
static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
- unsigned int obj_type,
- __kernel_fsid_t *fsid)
+ unsigned int obj_type)
{
struct fsnotify_mark_connector *conn;
@@ -550,14 +549,7 @@ static int fsnotify_attach_connector_to_object(fsnotify_connp_t *connp,
conn->flags = 0;
conn->type = obj_type;
conn->obj = connp;
- /* Cache fsid of filesystem containing the object */
- if (fsid) {
- conn->fsid = *fsid;
- conn->flags = FSNOTIFY_CONN_FLAG_HAS_FSID;
- } else {
- conn->fsid.val[0] = conn->fsid.val[1] = 0;
- conn->flags = 0;
- }
+ conn->flags = 0;
fsnotify_get_sb_connectors(conn);
/*
@@ -608,8 +600,7 @@ out:
*/
static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
fsnotify_connp_t *connp,
- unsigned int obj_type,
- int add_flags, __kernel_fsid_t *fsid)
+ unsigned int obj_type, int add_flags)
{
struct fsnotify_mark *lmark, *last = NULL;
struct fsnotify_mark_connector *conn;
@@ -619,41 +610,15 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark,
if (WARN_ON(!fsnotify_valid_obj_type(obj_type)))
return -EINVAL;
- /* Backend is expected to check for zero fsid (e.g. tmpfs) */
- if (fsid && WARN_ON_ONCE(!fsid->val[0] && !fsid->val[1]))
- return -ENODEV;
-
restart:
spin_lock(&mark->lock);
conn = fsnotify_grab_connector(connp);
if (!conn) {
spin_unlock(&mark->lock);
- err = fsnotify_attach_connector_to_object(connp, obj_type,
- fsid);
+ err = fsnotify_attach_connector_to_object(connp, obj_type);
if (err)
return err;
goto restart;
- } else if (fsid && !(conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID)) {
- conn->fsid = *fsid;
- /* Pairs with smp_rmb() in fanotify_get_fsid() */
- smp_wmb();
- conn->flags |= FSNOTIFY_CONN_FLAG_HAS_FSID;
- } else if (fsid && (conn->flags & FSNOTIFY_CONN_FLAG_HAS_FSID) &&
- (fsid->val[0] != conn->fsid.val[0] ||
- fsid->val[1] != conn->fsid.val[1])) {
- /*
- * Backend is expected to check for non uniform fsid
- * (e.g. btrfs), but maybe we missed something?
- * Only allow setting conn->fsid once to non zero fsid.
- * inotify and non-fid fanotify groups do not set nor test
- * conn->fsid.
- */
- pr_warn_ratelimited("%s: fsid mismatch on object of type %u: "
- "%x.%x != %x.%x\n", __func__, conn->type,
- fsid->val[0], fsid->val[1],
- conn->fsid.val[0], conn->fsid.val[1]);
- err = -EXDEV;
- goto out_err;
}
/* is mark the first mark? */
@@ -703,7 +668,7 @@ out_err:
*/
int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_connp_t *connp, unsigned int obj_type,
- int add_flags, __kernel_fsid_t *fsid)
+ int add_flags)
{
struct fsnotify_group *group = mark->group;
int ret = 0;
@@ -723,7 +688,7 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,
fsnotify_get_mark(mark); /* for g_list */
spin_unlock(&mark->lock);
- ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags, fsid);
+ ret = fsnotify_add_mark_list(mark, connp, obj_type, add_flags);
if (ret)
goto err;
@@ -742,14 +707,13 @@ err:
}
int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp,
- unsigned int obj_type, int add_flags,
- __kernel_fsid_t *fsid)
+ unsigned int obj_type, int add_flags)
{
int ret;
struct fsnotify_group *group = mark->group;
fsnotify_group_lock(group);
- ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags, fsid);
+ ret = fsnotify_add_mark_locked(mark, connp, obj_type, add_flags);
fsnotify_group_unlock(group);
return ret;
}
diff --git a/fs/nsfs.c b/fs/nsfs.c
index 9a4b228d42fa..34e1e3e36733 100644
--- a/fs/nsfs.c
+++ b/fs/nsfs.c
@@ -90,12 +90,9 @@ slow:
inode->i_fop = &ns_file_operations;
inode->i_private = ns;
- dentry = d_alloc_anon(mnt->mnt_sb);
- if (!dentry) {
- iput(inode);
+ dentry = d_make_root(inode); /* not the normal use, but... */
+ if (!dentry)
return -ENOMEM;
- }
- d_instantiate(dentry, inode);
dentry->d_fsdata = (void *)ns->ops;
d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry);
if (d) {
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index 71e31e789b29..2d01517a2d59 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -1304,7 +1304,7 @@ done:
* page cleaned. The VM has already locked the page and marked it clean.
*
* For non-resident attributes, ntfs_writepage() writes the @page by calling
- * the ntfs version of the generic block_write_full_page() function,
+ * the ntfs version of the generic block_write_full_folio() function,
* ntfs_write_block(), which in turn if necessary creates and writes the
* buffers associated with the page asynchronously.
*
@@ -1314,7 +1314,7 @@ done:
* vfs inode dirty code path for the inode the mft record belongs to or via the
* vm page dirty code path for the page the mft record is in.
*
- * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_page().
+ * Based on ntfs_read_folio() and fs/buffer.c::block_write_full_folio().
*
* Return 0 on success and -errno on error.
*/
@@ -1644,7 +1644,7 @@ const struct address_space_operations ntfs_normal_aops = {
.bmap = ntfs_bmap,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
/*
@@ -1658,7 +1658,7 @@ const struct address_space_operations ntfs_compressed_aops = {
#endif /* NTFS_RW */
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
/*
@@ -1673,7 +1673,7 @@ const struct address_space_operations ntfs_mst_aops = {
#endif /* NTFS_RW */
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
#ifdef NTFS_RW
@@ -1690,7 +1690,7 @@ const struct address_space_operations ntfs_mst_aops = {
*
* If the page does not have buffers, we create them and set them uptodate.
* The page may not be locked which is why we need to handle the buffers under
- * the mapping->private_lock. Once the buffers are marked dirty we no longer
+ * the mapping->i_private_lock. Once the buffers are marked dirty we no longer
* need the lock since try_to_free_buffers() does not free dirty buffers.
*/
void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
@@ -1702,11 +1702,11 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
BUG_ON(!PageUptodate(page));
end = ofs + ni->itype.index.block_size;
bh_size = VFS_I(ni)->i_sb->s_blocksize;
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
if (unlikely(!page_has_buffers(page))) {
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
bh = head = alloc_page_buffers(page, bh_size, true);
- spin_lock(&mapping->private_lock);
+ spin_lock(&mapping->i_private_lock);
if (likely(!page_has_buffers(page))) {
struct buffer_head *tail;
@@ -1730,7 +1730,7 @@ void mark_ntfs_record_dirty(struct page *page, const unsigned int ofs) {
break;
set_buffer_dirty(bh);
} while ((bh = bh->b_this_page) != head);
- spin_unlock(&mapping->private_lock);
+ spin_unlock(&mapping->i_private_lock);
filemap_dirty_folio(mapping, page_folio(page));
if (unlikely(buffers_to_free)) {
do {
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 4596c90e7b7c..629723a8d712 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1462,7 +1462,8 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
/**
* ntfs_dir_fsync - sync a directory to disk
* @filp: directory to be synced
- * @dentry: dentry describing the directory to sync
+ * @start: offset in bytes of the beginning of data range to sync
+ * @end: offset in bytes of the end of data range (inclusive)
* @datasync: if non-zero only flush user data and not metadata
*
* Data integrity sync of a directory to disk. Used for fsync, fdatasync, and
diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c
index 174fe536a1c0..4e980170d86a 100644
--- a/fs/ntfs/sysctl.c
+++ b/fs/ntfs/sysctl.c
@@ -28,7 +28,6 @@ static struct ctl_table ntfs_sysctls[] = {
.mode = 0644, /* Mode, proc handler. */
.proc_handler = proc_dointvec
},
- {}
};
/* Storage for the sysctls header. */
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index 63f70259edc0..7aadf5010999 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -886,7 +886,7 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
struct runs_tree *run = &ni->file.run;
struct ntfs_sb_info *sbi;
u8 cluster_bits;
- struct ATTRIB *attr = NULL, *attr_b;
+ struct ATTRIB *attr, *attr_b;
struct ATTR_LIST_ENTRY *le, *le_b;
struct mft_inode *mi, *mi_b;
CLST hint, svcn, to_alloc, evcn1, next_svcn, asize, end, vcn0, alen;
@@ -904,12 +904,8 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
*len = 0;
up_read(&ni->file.run_lock);
- if (*len) {
- if (*lcn != SPARSE_LCN || !new)
- return 0; /* Fast normal way without allocation. */
- else if (clen > *len)
- clen = *len;
- }
+ if (*len && (*lcn != SPARSE_LCN || !new))
+ return 0; /* Fast normal way without allocation. */
/* No cluster in cache or we need to allocate cluster in hole. */
sbi = ni->mi.sbi;
@@ -918,6 +914,17 @@ int attr_data_get_block(struct ntfs_inode *ni, CLST vcn, CLST clen, CLST *lcn,
ni_lock(ni);
down_write(&ni->file.run_lock);
+ /* Repeat the code above (under write lock). */
+ if (!run_lookup_entry(run, vcn, lcn, len, NULL))
+ *len = 0;
+
+ if (*len) {
+ if (*lcn != SPARSE_LCN || !new)
+ goto out; /* normal way without allocation. */
+ if (clen > *len)
+ clen = *len;
+ }
+
le_b = NULL;
attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL, 0, NULL, &mi_b);
if (!attr_b) {
@@ -1736,8 +1743,10 @@ repack:
le_b = NULL;
attr_b = ni_find_attr(ni, NULL, &le_b, ATTR_DATA, NULL,
0, NULL, &mi_b);
- if (!attr_b)
- return -ENOENT;
+ if (!attr_b) {
+ err = -ENOENT;
+ goto out;
+ }
attr = attr_b;
le = le_b;
@@ -1818,13 +1827,15 @@ ins_ext:
ok:
run_truncate_around(run, vcn);
out:
- if (new_valid > data_size)
- new_valid = data_size;
+ if (attr_b) {
+ if (new_valid > data_size)
+ new_valid = data_size;
- valid_size = le64_to_cpu(attr_b->nres.valid_size);
- if (new_valid != valid_size) {
- attr_b->nres.valid_size = cpu_to_le64(valid_size);
- mi_b->dirty = true;
+ valid_size = le64_to_cpu(attr_b->nres.valid_size);
+ if (new_valid != valid_size) {
+ attr_b->nres.valid_size = cpu_to_le64(valid_size);
+ mi_b->dirty = true;
+ }
}
return err;
@@ -2073,7 +2084,7 @@ next_attr:
/* Update inode size. */
ni->i_valid = valid_size;
- ni->vfs_inode.i_size = data_size;
+ i_size_write(&ni->vfs_inode, data_size);
inode_set_bytes(&ni->vfs_inode, total_size);
ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
mark_inode_dirty(&ni->vfs_inode);
@@ -2488,7 +2499,7 @@ int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes)
mi_b->dirty = true;
done:
- ni->vfs_inode.i_size += bytes;
+ i_size_write(&ni->vfs_inode, ni->vfs_inode.i_size + bytes);
ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
mark_inode_dirty(&ni->vfs_inode);
diff --git a/fs/ntfs3/attrlist.c b/fs/ntfs3/attrlist.c
index 7c01735d1219..9f4bd8d26090 100644
--- a/fs/ntfs3/attrlist.c
+++ b/fs/ntfs3/attrlist.c
@@ -29,7 +29,7 @@ static inline bool al_is_valid_le(const struct ntfs_inode *ni,
void al_destroy(struct ntfs_inode *ni)
{
run_close(&ni->attr_list.run);
- kfree(ni->attr_list.le);
+ kvfree(ni->attr_list.le);
ni->attr_list.le = NULL;
ni->attr_list.size = 0;
ni->attr_list.dirty = false;
@@ -127,12 +127,13 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni,
{
size_t off;
u16 sz;
+ const unsigned le_min_size = le_size(0);
if (!le) {
le = ni->attr_list.le;
} else {
sz = le16_to_cpu(le->size);
- if (sz < sizeof(struct ATTR_LIST_ENTRY)) {
+ if (sz < le_min_size) {
/* Impossible 'cause we should not return such le. */
return NULL;
}
@@ -141,7 +142,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni,
/* Check boundary. */
off = PtrOffset(ni->attr_list.le, le);
- if (off + sizeof(struct ATTR_LIST_ENTRY) > ni->attr_list.size) {
+ if (off + le_min_size > ni->attr_list.size) {
/* The regular end of list. */
return NULL;
}
@@ -149,8 +150,7 @@ struct ATTR_LIST_ENTRY *al_enumerate(struct ntfs_inode *ni,
sz = le16_to_cpu(le->size);
/* Check le for errors. */
- if (sz < sizeof(struct ATTR_LIST_ENTRY) ||
- off + sz > ni->attr_list.size ||
+ if (sz < le_min_size || off + sz > ni->attr_list.size ||
sz < le->name_off + le->name_len * sizeof(short)) {
return NULL;
}
@@ -318,7 +318,7 @@ int al_add_le(struct ntfs_inode *ni, enum ATTR_TYPE type, const __le16 *name,
memcpy(ptr, al->le, off);
memcpy(Add2Ptr(ptr, off + sz), le, old_size - off);
le = Add2Ptr(ptr, off);
- kfree(al->le);
+ kvfree(al->le);
al->le = ptr;
} else {
memmove(Add2Ptr(le, sz), le, old_size - off);
diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c
index 63f14a0232f6..845f9b22deef 100644
--- a/fs/ntfs3/bitmap.c
+++ b/fs/ntfs3/bitmap.c
@@ -124,7 +124,7 @@ void wnd_close(struct wnd_bitmap *wnd)
{
struct rb_node *node, *next;
- kfree(wnd->free_bits);
+ kvfree(wnd->free_bits);
wnd->free_bits = NULL;
run_close(&wnd->run);
@@ -1360,7 +1360,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits)
memcpy(new_free, wnd->free_bits, wnd->nwnd * sizeof(short));
memset(new_free + wnd->nwnd, 0,
(new_wnd - wnd->nwnd) * sizeof(short));
- kfree(wnd->free_bits);
+ kvfree(wnd->free_bits);
wnd->free_bits = new_free;
}
diff --git a/fs/ntfs3/dir.c b/fs/ntfs3/dir.c
index ec0566b322d5..5cf3d9decf64 100644
--- a/fs/ntfs3/dir.c
+++ b/fs/ntfs3/dir.c
@@ -309,11 +309,31 @@ static inline int ntfs_filldir(struct ntfs_sb_info *sbi, struct ntfs_inode *ni,
return 0;
}
- /* NTFS: symlinks are "dir + reparse" or "file + reparse" */
- if (fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT)
- dt_type = DT_LNK;
- else
- dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
+ /*
+ * NTFS: symlinks are "dir + reparse" or "file + reparse"
+ * Unfortunately reparse attribute is used for many purposes (several dozens).
+ * It is not possible here to know is this name symlink or not.
+ * To get exactly the type of name we should to open inode (read mft).
+ * getattr for opened file (fstat) correctly returns symlink.
+ */
+ dt_type = (fname->dup.fa & FILE_ATTRIBUTE_DIRECTORY) ? DT_DIR : DT_REG;
+
+ /*
+ * It is not reliable to detect the type of name using duplicated information
+ * stored in parent directory.
+ * The only correct way to get the type of name - read MFT record and find ATTR_STD.
+ * The code below is not good idea.
+ * It does additional locks/reads just to get the type of name.
+ * Should we use additional mount option to enable branch below?
+ */
+ if ((fname->dup.fa & FILE_ATTRIBUTE_REPARSE_POINT) &&
+ ino != ni->mi.rno) {
+ struct inode *inode = ntfs_iget5(sbi->sb, &e->ref, NULL);
+ if (!IS_ERR_OR_NULL(inode)) {
+ dt_type = fs_umode_to_dtype(inode->i_mode);
+ iput(inode);
+ }
+ }
return !dir_emit(ctx, (s8 *)name, name_len, ino, dt_type);
}
@@ -495,11 +515,9 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
struct INDEX_HDR *hdr;
const struct ATTR_FILE_NAME *fname;
u32 e_size, off, end;
- u64 vbo = 0;
size_t drs = 0, fles = 0, bit = 0;
- loff_t i_size = ni->vfs_inode.i_size;
struct indx_node *node = NULL;
- u8 index_bits = ni->dir.index_bits;
+ size_t max_indx = i_size_read(&ni->vfs_inode) >> ni->dir.index_bits;
if (is_empty)
*is_empty = true;
@@ -518,8 +536,10 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
e = Add2Ptr(hdr, off);
e_size = le16_to_cpu(e->size);
if (e_size < sizeof(struct NTFS_DE) ||
- off + e_size > end)
+ off + e_size > end) {
+ /* Looks like corruption. */
break;
+ }
if (de_is_last(e))
break;
@@ -543,7 +563,7 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
fles += 1;
}
- if (vbo >= i_size)
+ if (bit >= max_indx)
goto out;
err = indx_used_bit(&ni->dir, ni, &bit);
@@ -553,8 +573,7 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
if (bit == MINUS_ONE_T)
goto out;
- vbo = (u64)bit << index_bits;
- if (vbo >= i_size)
+ if (bit >= max_indx)
goto out;
err = indx_read(&ni->dir, ni, bit << ni->dir.idx2vbn_bits,
@@ -564,7 +583,6 @@ static int ntfs_dir_count(struct inode *dir, bool *is_empty, size_t *dirs,
hdr = &node->index->ihdr;
bit += 1;
- vbo = (u64)bit << ni->dir.idx2vbn_bits;
}
out:
@@ -593,5 +611,9 @@ const struct file_operations ntfs_dir_operations = {
.iterate_shared = ntfs_readdir,
.fsync = generic_file_fsync,
.open = ntfs_file_open,
+ .unlocked_ioctl = ntfs_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ntfs_compat_ioctl,
+#endif
};
// clang-format on
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index a5a30a24ce5d..5418662c80d8 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -48,7 +48,7 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
return 0;
}
-static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
+long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
{
struct inode *inode = file_inode(filp);
struct ntfs_sb_info *sbi = inode->i_sb->s_fs_info;
@@ -61,7 +61,7 @@ static long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
}
#ifdef CONFIG_COMPAT
-static long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg)
+long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg)
{
return ntfs_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
@@ -188,6 +188,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
u32 bh_next, bh_off, to;
sector_t iblock;
struct folio *folio;
+ bool dirty = false;
for (; idx < idx_end; idx += 1, from = 0) {
page_off = (loff_t)idx << PAGE_SHIFT;
@@ -223,29 +224,27 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to)
/* Ok, it's mapped. Make sure it's up-to-date. */
if (folio_test_uptodate(folio))
set_buffer_uptodate(bh);
-
- if (!buffer_uptodate(bh)) {
- err = bh_read(bh, 0);
- if (err < 0) {
- folio_unlock(folio);
- folio_put(folio);
- goto out;
- }
+ else if (bh_read(bh, 0) < 0) {
+ err = -EIO;
+ folio_unlock(folio);
+ folio_put(folio);
+ goto out;
}
mark_buffer_dirty(bh);
-
} while (bh_off = bh_next, iblock += 1,
head != (bh = bh->b_this_page));
folio_zero_segment(folio, from, to);
+ dirty = true;
folio_unlock(folio);
folio_put(folio);
cond_resched();
}
out:
- mark_inode_dirty(inode);
+ if (dirty)
+ mark_inode_dirty(inode);
return err;
}
@@ -261,6 +260,9 @@ static int ntfs_file_mmap(struct file *file, struct vm_area_struct *vma)
bool rw = vma->vm_flags & VM_WRITE;
int err;
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
if (is_encrypted(ni)) {
ntfs_inode_warn(inode, "mmap encrypted not supported");
return -EOPNOTSUPP;
@@ -499,10 +501,14 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
ni_lock(ni);
err = attr_punch_hole(ni, vbo, len, &frame_size);
ni_unlock(ni);
+ if (!err)
+ goto ok;
+
if (err != E_NTFS_NOTALIGNED)
goto out;
/* Process not aligned punch. */
+ err = 0;
mask = frame_size - 1;
vbo_a = (vbo + mask) & ~mask;
end_a = end & ~mask;
@@ -525,6 +531,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
ni_lock(ni);
err = attr_punch_hole(ni, vbo_a, end_a - vbo_a, NULL);
ni_unlock(ni);
+ if (err)
+ goto out;
}
} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
/*
@@ -564,6 +572,8 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
ni_lock(ni);
err = attr_insert_range(ni, vbo, len);
ni_unlock(ni);
+ if (err)
+ goto out;
} else {
/* Check new size. */
u8 cluster_bits = sbi->cluster_bits;
@@ -633,11 +643,18 @@ static long ntfs_fallocate(struct file *file, int mode, loff_t vbo, loff_t len)
&ni->file.run, i_size, &ni->i_valid,
true, NULL);
ni_unlock(ni);
+ if (err)
+ goto out;
} else if (new_size > i_size) {
- inode->i_size = new_size;
+ i_size_write(inode, new_size);
}
}
+ok:
+ err = file_modified(file);
+ if (err)
+ goto out;
+
out:
if (map_locked)
filemap_invalidate_unlock(mapping);
@@ -663,6 +680,9 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
umode_t mode = inode->i_mode;
int err;
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
err = setattr_prepare(idmap, dentry, attr);
if (err)
goto out;
@@ -676,7 +696,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
goto out;
}
inode_dio_wait(inode);
- oldsize = inode->i_size;
+ oldsize = i_size_read(inode);
newsize = attr->ia_size;
if (newsize <= oldsize)
@@ -688,7 +708,7 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
goto out;
ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
- inode->i_size = newsize;
+ i_size_write(inode, newsize);
}
setattr_copy(idmap, inode, attr);
@@ -718,6 +738,9 @@ static ssize_t ntfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
struct inode *inode = file->f_mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
if (is_encrypted(ni)) {
ntfs_inode_warn(inode, "encrypted i/o not supported");
return -EOPNOTSUPP;
@@ -752,6 +775,9 @@ static ssize_t ntfs_file_splice_read(struct file *in, loff_t *ppos,
struct inode *inode = in->f_mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
if (is_encrypted(ni)) {
ntfs_inode_warn(inode, "encrypted i/o not supported");
return -EOPNOTSUPP;
@@ -821,7 +847,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from)
size_t count = iov_iter_count(from);
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(file);
- loff_t i_size = inode->i_size;
+ loff_t i_size = i_size_read(inode);
struct address_space *mapping = inode->i_mapping;
struct ntfs_inode *ni = ntfs_i(inode);
u64 valid = ni->i_valid;
@@ -1028,6 +1054,8 @@ out:
iocb->ki_pos += written;
if (iocb->ki_pos > ni->i_valid)
ni->i_valid = iocb->ki_pos;
+ if (iocb->ki_pos > i_size)
+ i_size_write(inode, iocb->ki_pos);
return written;
}
@@ -1041,8 +1069,12 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
ssize_t ret;
+ int err;
struct ntfs_inode *ni = ntfs_i(inode);
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
if (is_encrypted(ni)) {
ntfs_inode_warn(inode, "encrypted i/o not supported");
return -EOPNOTSUPP;
@@ -1068,6 +1100,12 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ret <= 0)
goto out;
+ err = file_modified(iocb->ki_filp);
+ if (err) {
+ ret = err;
+ goto out;
+ }
+
if (WARN_ON(ni->ni_flags & NI_FLAG_COMPRESSED_MASK)) {
/* Should never be here, see ntfs_file_open(). */
ret = -EOPNOTSUPP;
@@ -1097,6 +1135,9 @@ int ntfs_file_open(struct inode *inode, struct file *file)
{
struct ntfs_inode *ni = ntfs_i(inode);
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
if (unlikely((is_compressed(ni) || is_encrypted(ni)) &&
(file->f_flags & O_DIRECT))) {
return -EOPNOTSUPP;
@@ -1138,7 +1179,8 @@ static int ntfs_file_release(struct inode *inode, struct file *file)
down_write(&ni->file.run_lock);
err = attr_set_size(ni, ATTR_DATA, NULL, 0, &ni->file.run,
- inode->i_size, &ni->i_valid, false, NULL);
+ i_size_read(inode), &ni->i_valid, false,
+ NULL);
up_write(&ni->file.run_lock);
ni_unlock(ni);
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index 3df2d9e34b91..7f27382e0ce2 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -778,7 +778,7 @@ static int ni_try_remove_attr_list(struct ntfs_inode *ni)
run_deallocate(sbi, &ni->attr_list.run, true);
run_close(&ni->attr_list.run);
ni->attr_list.size = 0;
- kfree(ni->attr_list.le);
+ kvfree(ni->attr_list.le);
ni->attr_list.le = NULL;
ni->attr_list.dirty = false;
@@ -927,7 +927,7 @@ int ni_create_attr_list(struct ntfs_inode *ni)
return 0;
out:
- kfree(ni->attr_list.le);
+ kvfree(ni->attr_list.le);
ni->attr_list.le = NULL;
ni->attr_list.size = 0;
return err;
@@ -2099,7 +2099,7 @@ int ni_readpage_cmpr(struct ntfs_inode *ni, struct page *page)
gfp_t gfp_mask;
struct page *pg;
- if (vbo >= ni->vfs_inode.i_size) {
+ if (vbo >= i_size_read(&ni->vfs_inode)) {
SetPageUptodate(page);
err = 0;
goto out;
@@ -2173,7 +2173,7 @@ int ni_decompress_file(struct ntfs_inode *ni)
{
struct ntfs_sb_info *sbi = ni->mi.sbi;
struct inode *inode = &ni->vfs_inode;
- loff_t i_size = inode->i_size;
+ loff_t i_size = i_size_read(inode);
struct address_space *mapping = inode->i_mapping;
gfp_t gfp_mask = mapping_gfp_mask(mapping);
struct page **pages = NULL;
@@ -2508,6 +2508,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
err = -EOPNOTSUPP;
goto out1;
#else
+ loff_t i_size = i_size_read(&ni->vfs_inode);
u32 frame_bits = ni_ext_compress_bits(ni);
u64 frame64 = frame_vbo >> frame_bits;
u64 frames, vbo_data;
@@ -2548,7 +2549,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
}
}
- frames = (ni->vfs_inode.i_size - 1) >> frame_bits;
+ frames = (i_size - 1) >> frame_bits;
err = attr_wof_frame_info(ni, attr, run, frame64, frames,
frame_bits, &ondisk_size, &vbo_data);
@@ -2556,8 +2557,7 @@ int ni_read_frame(struct ntfs_inode *ni, u64 frame_vbo, struct page **pages,
goto out2;
if (frame64 == frames) {
- unc_size = 1 + ((ni->vfs_inode.i_size - 1) &
- (frame_size - 1));
+ unc_size = 1 + ((i_size - 1) & (frame_size - 1));
ondisk_size = attr_size(attr) - vbo_data;
} else {
unc_size = frame_size;
@@ -3259,6 +3259,9 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
if (is_bad_inode(inode) || sb_rdonly(sb))
return 0;
+ if (unlikely(ntfs3_forced_shutdown(sb)))
+ return -EIO;
+
if (!ni_trylock(ni)) {
/* 'ni' is under modification, skip for now. */
mark_inode_dirty_sync(inode);
@@ -3288,7 +3291,7 @@ int ni_write_inode(struct inode *inode, int sync, const char *hint)
modified = true;
}
- ts = inode_get_mtime(inode);
+ ts = inode_get_ctime(inode);
dup.c_time = kernel2nt(&ts);
if (std->c_time != dup.c_time) {
std->c_time = dup.c_time;
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index 98ccb6650858..855519713bf7 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -465,7 +465,7 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr)
{
const struct RESTART_AREA *ra;
u16 cl, fl, ul;
- u32 off, l_size, file_dat_bits, file_size_round;
+ u32 off, l_size, seq_bits;
u16 ro = le16_to_cpu(rhdr->ra_off);
u32 sys_page = le32_to_cpu(rhdr->sys_page_size);
@@ -511,13 +511,15 @@ static inline bool is_rst_area_valid(const struct RESTART_HDR *rhdr)
/* Make sure the sequence number bits match the log file size. */
l_size = le64_to_cpu(ra->l_size);
- file_dat_bits = sizeof(u64) * 8 - le32_to_cpu(ra->seq_num_bits);
- file_size_round = 1u << (file_dat_bits + 3);
- if (file_size_round != l_size &&
- (file_size_round < l_size || (file_size_round / 2) > l_size)) {
- return false;
+ seq_bits = sizeof(u64) * 8 + 3;
+ while (l_size) {
+ l_size >>= 1;
+ seq_bits -= 1;
}
+ if (seq_bits != ra->seq_num_bits)
+ return false;
+
/* The log page data offset and record header length must be quad-aligned. */
if (!IS_ALIGNED(le16_to_cpu(ra->data_off), 8) ||
!IS_ALIGNED(le16_to_cpu(ra->rec_hdr_len), 8))
@@ -974,6 +976,16 @@ skip_looking:
return e;
}
+struct restart_info {
+ u64 last_lsn;
+ struct RESTART_HDR *r_page;
+ u32 vbo;
+ bool chkdsk_was_run;
+ bool valid_page;
+ bool initialized;
+ bool restart;
+};
+
#define RESTART_SINGLE_PAGE_IO cpu_to_le16(0x0001)
#define NTFSLOG_WRAPPED 0x00000001
@@ -987,6 +999,7 @@ struct ntfs_log {
struct ntfs_inode *ni;
u32 l_size;
+ u32 orig_file_size;
u32 sys_page_size;
u32 sys_page_mask;
u32 page_size;
@@ -1040,6 +1053,8 @@ struct ntfs_log {
struct CLIENT_ID client_id;
u32 client_undo_commit;
+
+ struct restart_info rst_info, rst_info2;
};
static inline u32 lsn_to_vbo(struct ntfs_log *log, const u64 lsn)
@@ -1105,16 +1120,6 @@ static inline bool verify_client_lsn(struct ntfs_log *log,
lsn <= le64_to_cpu(log->ra->current_lsn) && lsn;
}
-struct restart_info {
- u64 last_lsn;
- struct RESTART_HDR *r_page;
- u32 vbo;
- bool chkdsk_was_run;
- bool valid_page;
- bool initialized;
- bool restart;
-};
-
static int read_log_page(struct ntfs_log *log, u32 vbo,
struct RECORD_PAGE_HDR **buffer, bool *usa_error)
{
@@ -1176,7 +1181,7 @@ out:
* restart page header. It will stop the first time we find a
* valid page header.
*/
-static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
+static int log_read_rst(struct ntfs_log *log, bool first,
struct restart_info *info)
{
u32 skip, vbo;
@@ -1192,7 +1197,7 @@ static int log_read_rst(struct ntfs_log *log, u32 l_size, bool first,
}
/* Loop continuously until we succeed. */
- for (; vbo < l_size; vbo = 2 * vbo + skip, skip = 0) {
+ for (; vbo < log->l_size; vbo = 2 * vbo + skip, skip = 0) {
bool usa_error;
bool brst, bchk;
struct RESTART_AREA *ra;
@@ -1285,22 +1290,17 @@ check_result:
/*
* Ilog_init_pg_hdr - Init @log from restart page header.
*/
-static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size,
- u32 page_size, u16 major_ver, u16 minor_ver)
+static void log_init_pg_hdr(struct ntfs_log *log, u16 major_ver, u16 minor_ver)
{
- log->sys_page_size = sys_page_size;
- log->sys_page_mask = sys_page_size - 1;
- log->page_size = page_size;
- log->page_mask = page_size - 1;
- log->page_bits = blksize_bits(page_size);
+ log->sys_page_size = log->page_size;
+ log->sys_page_mask = log->page_mask;
log->clst_per_page = log->page_size >> log->ni->mi.sbi->cluster_bits;
if (!log->clst_per_page)
log->clst_per_page = 1;
- log->first_page = major_ver >= 2 ?
- 0x22 * page_size :
- ((sys_page_size << 1) + (page_size << 1));
+ log->first_page = major_ver >= 2 ? 0x22 * log->page_size :
+ 4 * log->page_size;
log->major_ver = major_ver;
log->minor_ver = minor_ver;
}
@@ -1308,12 +1308,11 @@ static void log_init_pg_hdr(struct ntfs_log *log, u32 sys_page_size,
/*
* log_create - Init @log in cases when we don't have a restart area to use.
*/
-static void log_create(struct ntfs_log *log, u32 l_size, const u64 last_lsn,
+static void log_create(struct ntfs_log *log, const u64 last_lsn,
u32 open_log_count, bool wrapped, bool use_multi_page)
{
- log->l_size = l_size;
/* All file offsets must be quadword aligned. */
- log->file_data_bits = blksize_bits(l_size) - 3;
+ log->file_data_bits = blksize_bits(log->l_size) - 3;
log->seq_num_mask = (8 << log->file_data_bits) - 1;
log->seq_num_bits = sizeof(u64) * 8 - log->file_data_bits;
log->seq_num = (last_lsn >> log->file_data_bits) + 2;
@@ -3720,10 +3719,8 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
struct ntfs_sb_info *sbi = ni->mi.sbi;
struct ntfs_log *log;
- struct restart_info rst_info, rst_info2;
- u64 rec_lsn, ra_lsn, checkpt_lsn = 0, rlsn = 0;
+ u64 rec_lsn, checkpt_lsn = 0, rlsn = 0;
struct ATTR_NAME_ENTRY *attr_names = NULL;
- struct ATTR_NAME_ENTRY *ane;
struct RESTART_TABLE *dptbl = NULL;
struct RESTART_TABLE *trtbl = NULL;
const struct RESTART_TABLE *rt;
@@ -3741,9 +3738,7 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
struct TRANSACTION_ENTRY *tr;
struct DIR_PAGE_ENTRY *dp;
u32 i, bytes_per_attr_entry;
- u32 l_size = ni->vfs_inode.i_size;
- u32 orig_file_size = l_size;
- u32 page_size, vbo, tail, off, dlen;
+ u32 vbo, tail, off, dlen;
u32 saved_len, rec_len, transact_id;
bool use_second_page;
struct RESTART_AREA *ra2, *ra = NULL;
@@ -3758,52 +3753,50 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
u16 t16;
u32 t32;
- /* Get the size of page. NOTE: To replay we can use default page. */
-#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2
- page_size = norm_file_page(PAGE_SIZE, &l_size, true);
-#else
- page_size = norm_file_page(PAGE_SIZE, &l_size, false);
-#endif
- if (!page_size)
- return -EINVAL;
-
log = kzalloc(sizeof(struct ntfs_log), GFP_NOFS);
if (!log)
return -ENOMEM;
log->ni = ni;
- log->l_size = l_size;
- log->one_page_buf = kmalloc(page_size, GFP_NOFS);
+ log->l_size = log->orig_file_size = ni->vfs_inode.i_size;
+ /* Get the size of page. NOTE: To replay we can use default page. */
+#if PAGE_SIZE >= DefaultLogPageSize && PAGE_SIZE <= DefaultLogPageSize * 2
+ log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, true);
+#else
+ log->page_size = norm_file_page(PAGE_SIZE, &log->l_size, false);
+#endif
+ if (!log->page_size) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ log->one_page_buf = kmalloc(log->page_size, GFP_NOFS);
if (!log->one_page_buf) {
err = -ENOMEM;
goto out;
}
- log->page_size = page_size;
- log->page_mask = page_size - 1;
- log->page_bits = blksize_bits(page_size);
+ log->page_mask = log->page_size - 1;
+ log->page_bits = blksize_bits(log->page_size);
/* Look for a restart area on the disk. */
- memset(&rst_info, 0, sizeof(struct restart_info));
- err = log_read_rst(log, l_size, true, &rst_info);
+ err = log_read_rst(log, true, &log->rst_info);
if (err)
goto out;
/* remember 'initialized' */
- *initialized = rst_info.initialized;
+ *initialized = log->rst_info.initialized;
- if (!rst_info.restart) {
- if (rst_info.initialized) {
+ if (!log->rst_info.restart) {
+ if (log->rst_info.initialized) {
/* No restart area but the file is not initialized. */
err = -EINVAL;
goto out;
}
- log_init_pg_hdr(log, page_size, page_size, 1, 1);
- log_create(log, l_size, 0, get_random_u32(), false, false);
-
- log->ra = ra;
+ log_init_pg_hdr(log, 1, 1);
+ log_create(log, 0, get_random_u32(), false, false);
ra = log_create_ra(log);
if (!ra) {
@@ -3820,25 +3813,26 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
* If the restart offset above wasn't zero then we won't
* look for a second restart.
*/
- if (rst_info.vbo)
+ if (log->rst_info.vbo)
goto check_restart_area;
- memset(&rst_info2, 0, sizeof(struct restart_info));
- err = log_read_rst(log, l_size, false, &rst_info2);
+ err = log_read_rst(log, false, &log->rst_info2);
if (err)
goto out;
/* Determine which restart area to use. */
- if (!rst_info2.restart || rst_info2.last_lsn <= rst_info.last_lsn)
+ if (!log->rst_info2.restart ||
+ log->rst_info2.last_lsn <= log->rst_info.last_lsn)
goto use_first_page;
use_second_page = true;
- if (rst_info.chkdsk_was_run && page_size != rst_info.vbo) {
+ if (log->rst_info.chkdsk_was_run &&
+ log->page_size != log->rst_info.vbo) {
struct RECORD_PAGE_HDR *sp = NULL;
bool usa_error;
- if (!read_log_page(log, page_size, &sp, &usa_error) &&
+ if (!read_log_page(log, log->page_size, &sp, &usa_error) &&
sp->rhdr.sign == NTFS_CHKD_SIGNATURE) {
use_second_page = false;
}
@@ -3846,52 +3840,43 @@ int log_replay(struct ntfs_inode *ni, bool *initialized)
}
if (use_second_page) {
- kfree(rst_info.r_page);
- memcpy(&rst_info, &rst_info2, sizeof(struct restart_info));
- rst_info2.r_page = NULL;
+ kfree(log->rst_info.r_page);
+ memcpy(&log->rst_info, &log->rst_info2,
+ sizeof(struct restart_info));
+ log->rst_info2.r_page = NULL;
}
use_first_page:
- kfree(rst_info2.r_page);
+ kfree(log->rst_info2.r_page);
check_restart_area:
/*
* If the restart area is at offset 0, we want
* to write the second restart area first.
*/
- log->init_ra = !!rst_info.vbo;
+ log->init_ra = !!log->rst_info.vbo;
/* If we have a valid page then grab a pointer to the restart area. */
- ra2 = rst_info.valid_page ?
- Add2Ptr(rst_info.r_page,
- le16_to_cpu(rst_info.r_page->ra_off)) :
+ ra2 = log->rst_info.valid_page ?
+ Add2Ptr(log->rst_info.r_page,
+ le16_to_cpu(log->rst_info.r_page->ra_off)) :
NULL;
- if (rst_info.chkdsk_was_run ||
+ if (log->rst_info.chkdsk_was_run ||
(ra2 && ra2->client_idx[1] == LFS_NO_CLIENT_LE)) {
bool wrapped = false;
bool use_multi_page = false;
u32 open_log_count;
/* Do some checks based on whether we have a valid log page. */
- if (!rst_info.valid_page) {
- open_log_count = get_random_u32();
- goto init_log_instance;
- }
- open_log_count = le32_to_cpu(ra2->open_log_count);
-
- /*
- * If the restart page size isn't changing then we want to
- * check how much work we need to do.
- */
- if (page_size != le32_to_cpu(rst_info.r_page->sys_page_size))
- goto init_log_instance;
+ open_log_count = log->rst_info.valid_page ?
+ le32_to_cpu(ra2->open_log_count) :
+ get_random_u32();
-init_log_instance:
- log_init_pg_hdr(log, page_size, page_size, 1, 1);
+ log_init_pg_hdr(log, 1, 1);
- log_create(log, l_size, rst_info.last_lsn, open_log_count,
- wrapped, use_multi_page);
+ log_create(log, log->rst_info.last_lsn, open_log_count, wrapped,
+ use_multi_page);
ra = log_create_ra(log);
if (!ra) {
@@ -3916,28 +3901,27 @@ init_log_instance:
* use the log file. We must use the system page size instead of the
* default size if there is not a clean shutdown.
*/
- t32 = le32_to_cpu(rst_info.r_page->sys_page_size);
- if (page_size != t32) {
- l_size = orig_file_size;
- page_size =
- norm_file_page(t32, &l_size, t32 == DefaultLogPageSize);
+ t32 = le32_to_cpu(log->rst_info.r_page->sys_page_size);
+ if (log->page_size != t32) {
+ log->l_size = log->orig_file_size;
+ log->page_size = norm_file_page(t32, &log->l_size,
+ t32 == DefaultLogPageSize);
}
- if (page_size != t32 ||
- page_size != le32_to_cpu(rst_info.r_page->page_size)) {
+ if (log->page_size != t32 ||
+ log->page_size != le32_to_cpu(log->rst_info.r_page->page_size)) {
err = -EINVAL;
goto out;
}
/* If the file size has shrunk then we won't mount it. */
- if (l_size < le64_to_cpu(ra2->l_size)) {
+ if (log->l_size < le64_to_cpu(ra2->l_size)) {
err = -EINVAL;
goto out;
}
- log_init_pg_hdr(log, page_size, page_size,
- le16_to_cpu(rst_info.r_page->major_ver),
- le16_to_cpu(rst_info.r_page->minor_ver));
+ log_init_pg_hdr(log, le16_to_cpu(log->rst_info.r_page->major_ver),
+ le16_to_cpu(log->rst_info.r_page->minor_ver));
log->l_size = le64_to_cpu(ra2->l_size);
log->seq_num_bits = le32_to_cpu(ra2->seq_num_bits);
@@ -3945,7 +3929,7 @@ init_log_instance:
log->seq_num_mask = (8 << log->file_data_bits) - 1;
log->last_lsn = le64_to_cpu(ra2->current_lsn);
log->seq_num = log->last_lsn >> log->file_data_bits;
- log->ra_off = le16_to_cpu(rst_info.r_page->ra_off);
+ log->ra_off = le16_to_cpu(log->rst_info.r_page->ra_off);
log->restart_size = log->sys_page_size - log->ra_off;
log->record_header_len = le16_to_cpu(ra2->rec_hdr_len);
log->ra_size = le16_to_cpu(ra2->ra_len);
@@ -4045,7 +4029,7 @@ find_oldest:
log->current_avail = current_log_avail(log);
/* Remember which restart area to write first. */
- log->init_ra = rst_info.vbo;
+ log->init_ra = log->rst_info.vbo;
process_log:
/* 1.0, 1.1, 2.0 log->major_ver/minor_ver - short values. */
@@ -4105,7 +4089,7 @@ process_log:
log->client_id.seq_num = cr->seq_num;
log->client_id.client_idx = client;
- err = read_rst_area(log, &rst, &ra_lsn);
+ err = read_rst_area(log, &rst, &checkpt_lsn);
if (err)
goto out;
@@ -4114,9 +4098,8 @@ process_log:
bytes_per_attr_entry = !rst->major_ver ? 0x2C : 0x28;
- checkpt_lsn = le64_to_cpu(rst->check_point_start);
- if (!checkpt_lsn)
- checkpt_lsn = ra_lsn;
+ if (rst->check_point_start)
+ checkpt_lsn = le64_to_cpu(rst->check_point_start);
/* Allocate and Read the Transaction Table. */
if (!rst->transact_table_len)
@@ -4330,23 +4313,20 @@ check_attr_table:
lcb = NULL;
check_attribute_names2:
- if (!rst->attr_names_len)
- goto trace_attribute_table;
-
- ane = attr_names;
- if (!oatbl)
- goto trace_attribute_table;
- while (ane->off) {
- /* TODO: Clear table on exit! */
- oe = Add2Ptr(oatbl, le16_to_cpu(ane->off));
- t16 = le16_to_cpu(ane->name_bytes);
- oe->name_len = t16 / sizeof(short);
- oe->ptr = ane->name;
- oe->is_attr_name = 2;
- ane = Add2Ptr(ane, sizeof(struct ATTR_NAME_ENTRY) + t16);
- }
-
-trace_attribute_table:
+ if (rst->attr_names_len && oatbl) {
+ struct ATTR_NAME_ENTRY *ane = attr_names;
+ while (ane->off) {
+ /* TODO: Clear table on exit! */
+ oe = Add2Ptr(oatbl, le16_to_cpu(ane->off));
+ t16 = le16_to_cpu(ane->name_bytes);
+ oe->name_len = t16 / sizeof(short);
+ oe->ptr = ane->name;
+ oe->is_attr_name = 2;
+ ane = Add2Ptr(ane,
+ sizeof(struct ATTR_NAME_ENTRY) + t16);
+ }
+ }
+
/*
* If the checkpt_lsn is zero, then this is a freshly
* formatted disk and we have no work to do.
@@ -5189,7 +5169,7 @@ out:
kfree(oatbl);
kfree(dptbl);
kfree(attr_names);
- kfree(rst_info.r_page);
+ kfree(log->rst_info.r_page);
kfree(ra);
kfree(log->one_page_buf);
diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c
index fbfe21dbb425..ae2ef5c11868 100644
--- a/fs/ntfs3/fsntfs.c
+++ b/fs/ntfs3/fsntfs.c
@@ -853,7 +853,8 @@ void ntfs_update_mftmirr(struct ntfs_sb_info *sbi, int wait)
/*
* sb can be NULL here. In this case sbi->flags should be 0 too.
*/
- if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR))
+ if (!sb || !(sbi->flags & NTFS_FLAGS_MFTMIRR) ||
+ unlikely(ntfs3_forced_shutdown(sb)))
return;
blocksize = sb->s_blocksize;
@@ -1006,6 +1007,30 @@ static inline __le32 security_hash(const void *sd, size_t bytes)
return cpu_to_le32(hash);
}
+/*
+ * simple wrapper for sb_bread_unmovable.
+ */
+struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block)
+{
+ struct ntfs_sb_info *sbi = sb->s_fs_info;
+ struct buffer_head *bh;
+
+ if (unlikely(block >= sbi->volume.blocks)) {
+ /* prevent generic message "attempt to access beyond end of device" */
+ ntfs_err(sb, "try to read out of volume at offset 0x%llx",
+ (u64)block << sb->s_blocksize_bits);
+ return NULL;
+ }
+
+ bh = sb_bread_unmovable(sb, block);
+ if (bh)
+ return bh;
+
+ ntfs_err(sb, "failed to read volume at offset 0x%llx",
+ (u64)block << sb->s_blocksize_bits);
+ return NULL;
+}
+
int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer)
{
struct block_device *bdev = sb->s_bdev;
@@ -2128,8 +2153,8 @@ int ntfs_insert_security(struct ntfs_sb_info *sbi,
if (le32_to_cpu(d_security->size) == new_sec_size &&
d_security->key.hash == hash_key.hash &&
!memcmp(d_security + 1, sd, size_sd)) {
- *security_id = d_security->key.sec_id;
/* Such security already exists. */
+ *security_id = d_security->key.sec_id;
err = 0;
goto out;
}
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index cf92b2433f7a..daabaad63aaf 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -1462,7 +1462,7 @@ static int indx_create_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
goto out2;
if (in->name == I30_NAME) {
- ni->vfs_inode.i_size = data_size;
+ i_size_write(&ni->vfs_inode, data_size);
inode_set_bytes(&ni->vfs_inode, alloc_size);
}
@@ -1544,7 +1544,7 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni,
}
if (in->name == I30_NAME)
- ni->vfs_inode.i_size = data_size;
+ i_size_write(&ni->vfs_inode, data_size);
*vbn = bit << indx->idx2vbn_bits;
@@ -2090,7 +2090,7 @@ static int indx_shrink(struct ntfs_index *indx, struct ntfs_inode *ni,
return err;
if (in->name == I30_NAME)
- ni->vfs_inode.i_size = new_data;
+ i_size_write(&ni->vfs_inode, new_data);
bpb = bitmap_size(bit);
if (bpb * 8 == nbits)
@@ -2576,7 +2576,7 @@ int indx_delete_entry(struct ntfs_index *indx, struct ntfs_inode *ni,
err = attr_set_size(ni, ATTR_ALLOC, in->name, in->name_len,
&indx->alloc_run, 0, NULL, false, NULL);
if (in->name == I30_NAME)
- ni->vfs_inode.i_size = 0;
+ i_size_write(&ni->vfs_inode, 0);
err = ni_remove_attr(ni, ATTR_ALLOC, in->name, in->name_len,
false, NULL);
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 5e3d71374918..eb7a8c9fba01 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -345,9 +345,7 @@ next_attr:
inode->i_size = le16_to_cpu(rp.SymbolicLinkReparseBuffer
.PrintNameLength) /
sizeof(u16);
-
ni->i_valid = inode->i_size;
-
/* Clear directory bit. */
if (ni->ni_flags & NI_FLAG_DIR) {
indx_clear(&ni->dir);
@@ -412,7 +410,6 @@ end_enum:
goto out;
if (!is_match && name) {
- /* Reuse rec as buffer for ascii name. */
err = -ENOENT;
goto out;
}
@@ -427,6 +424,7 @@ end_enum:
if (names != le16_to_cpu(rec->hard_links)) {
/* Correct minor error on the fly. Do not mark inode as dirty. */
+ ntfs_inode_warn(inode, "Correct links count -> %u.", names);
rec->hard_links = cpu_to_le16(names);
ni->mi.dirty = true;
}
@@ -653,9 +651,10 @@ static noinline int ntfs_get_block_vbo(struct inode *inode, u64 vbo,
off = vbo & (PAGE_SIZE - 1);
folio_set_bh(bh, folio, off);
- err = bh_read(bh, 0);
- if (err < 0)
+ if (bh_read(bh, 0) < 0) {
+ err = -EIO;
goto out;
+ }
folio_zero_segment(folio, off + voff, off + block_size);
}
}
@@ -853,9 +852,13 @@ static int ntfs_resident_writepage(struct folio *folio,
struct writeback_control *wbc, void *data)
{
struct address_space *mapping = data;
- struct ntfs_inode *ni = ntfs_i(mapping->host);
+ struct inode *inode = mapping->host;
+ struct ntfs_inode *ni = ntfs_i(inode);
int ret;
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
ni_lock(ni);
ret = attr_data_write_resident(ni, &folio->page);
ni_unlock(ni);
@@ -869,7 +872,12 @@ static int ntfs_resident_writepage(struct folio *folio,
static int ntfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
- if (is_resident(ntfs_i(mapping->host)))
+ struct inode *inode = mapping->host;
+
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
+ if (is_resident(ntfs_i(inode)))
return write_cache_pages(mapping, wbc, ntfs_resident_writepage,
mapping);
return mpage_writepages(mapping, wbc, ntfs_get_block);
@@ -889,6 +897,9 @@ int ntfs_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
struct ntfs_inode *ni = ntfs_i(inode);
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
*pagep = NULL;
if (is_resident(ni)) {
struct page *page =
@@ -974,7 +985,7 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos,
}
if (pos + err > inode->i_size) {
- inode->i_size = pos + err;
+ i_size_write(inode, pos + err);
dirty = true;
}
@@ -1306,6 +1317,11 @@ struct inode *ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir,
goto out1;
}
+ if (unlikely(ntfs3_forced_shutdown(sb))) {
+ err = -EIO;
+ goto out2;
+ }
+
/* Mark rw ntfs as dirty. it will be cleared at umount. */
ntfs_set_state(sbi, NTFS_DIRTY_DIRTY);
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index ee3093be5170..cae41db0aaa7 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -181,6 +181,9 @@ static int ntfs_unlink(struct inode *dir, struct dentry *dentry)
struct ntfs_inode *ni = ntfs_i(dir);
int err;
+ if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
+ return -EIO;
+
ni_lock_dir(ni);
err = ntfs_unlink_inode(dir, dentry);
@@ -199,6 +202,9 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
u32 size = strlen(symname);
struct inode *inode;
+ if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
+ return -EIO;
+
inode = ntfs_create_inode(idmap, dir, dentry, NULL, S_IFLNK | 0777, 0,
symname, size, NULL);
@@ -227,6 +233,9 @@ static int ntfs_rmdir(struct inode *dir, struct dentry *dentry)
struct ntfs_inode *ni = ntfs_i(dir);
int err;
+ if (unlikely(ntfs3_forced_shutdown(dir->i_sb)))
+ return -EIO;
+
ni_lock_dir(ni);
err = ntfs_unlink_inode(dir, dentry);
@@ -264,6 +273,9 @@ static int ntfs_rename(struct mnt_idmap *idmap, struct inode *dir,
1024);
static_assert(PATH_MAX >= 4 * 1024);
+ if (unlikely(ntfs3_forced_shutdown(sb)))
+ return -EIO;
+
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h
index 86aecbb01a92..9c7478150a03 100644
--- a/fs/ntfs3/ntfs.h
+++ b/fs/ntfs3/ntfs.h
@@ -523,12 +523,10 @@ struct ATTR_LIST_ENTRY {
__le64 vcn; // 0x08: Starting VCN of this attribute.
struct MFT_REF ref; // 0x10: MFT record number with attribute.
__le16 id; // 0x18: struct ATTRIB ID.
- __le16 name[3]; // 0x1A: Just to align. To get real name can use bNameOffset.
+ __le16 name[]; // 0x1A: To get real name use name_off.
}; // sizeof(0x20)
-static_assert(sizeof(struct ATTR_LIST_ENTRY) == 0x20);
-
static inline u32 le_size(u8 name_len)
{
return ALIGN(offsetof(struct ATTR_LIST_ENTRY, name) +
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index f6706143d14b..79356fd29a14 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -61,6 +61,8 @@ enum utf16_endian;
/* sbi->flags */
#define NTFS_FLAGS_NODISCARD 0x00000001
+/* ntfs in shutdown state. */
+#define NTFS_FLAGS_SHUTDOWN_BIT 0x00000002 /* == 4*/
/* Set when LogFile is replaying. */
#define NTFS_FLAGS_LOG_REPLAYING 0x00000008
/* Set when we changed first MFT's which copy must be updated in $MftMirr. */
@@ -226,7 +228,7 @@ struct ntfs_sb_info {
u64 maxbytes; // Maximum size for normal files.
u64 maxbytes_sparse; // Maximum size for sparse file.
- u32 flags; // See NTFS_FLAGS_XXX.
+ unsigned long flags; // See NTFS_FLAGS_
CLST zone_max; // Maximum MFT zone length in clusters
CLST bad_clusters; // The count of marked bad clusters.
@@ -473,7 +475,7 @@ bool al_delete_le(struct ntfs_inode *ni, enum ATTR_TYPE type, CLST vcn,
int al_update(struct ntfs_inode *ni, int sync);
static inline size_t al_aligned(size_t size)
{
- return (size + 1023) & ~(size_t)1023;
+ return size_add(size, 1023) & ~(size_t)1023;
}
/* Globals from bitfunc.c */
@@ -500,6 +502,8 @@ int ntfs3_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
int ntfs_file_open(struct inode *inode, struct file *file);
int ntfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len);
+long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg);
+long ntfs_compat_ioctl(struct file *filp, u32 cmd, unsigned long arg);
extern const struct inode_operations ntfs_special_inode_operations;
extern const struct inode_operations ntfs_file_inode_operations;
extern const struct file_operations ntfs_file_operations;
@@ -584,6 +588,7 @@ bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes);
int log_replay(struct ntfs_inode *ni, bool *initialized);
/* Globals from fsntfs.c */
+struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block);
bool ntfs_fix_pre_write(struct NTFS_RECORD_HEADER *rhdr, size_t bytes);
int ntfs_fix_post_read(struct NTFS_RECORD_HEADER *rhdr, size_t bytes,
bool simple);
@@ -872,7 +877,7 @@ int ntfs_init_acl(struct mnt_idmap *idmap, struct inode *inode,
int ntfs_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry);
ssize_t ntfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
-extern const struct xattr_handler * const ntfs_xattr_handlers[];
+extern const struct xattr_handler *const ntfs_xattr_handlers[];
int ntfs_save_wsl_perm(struct inode *inode, __le16 *ea_size);
void ntfs_get_wsl_perm(struct inode *inode);
@@ -999,6 +1004,11 @@ static inline struct ntfs_sb_info *ntfs_sb(struct super_block *sb)
return sb->s_fs_info;
}
+static inline int ntfs3_forced_shutdown(struct super_block *sb)
+{
+ return test_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags);
+}
+
/*
* ntfs_up_cluster - Align up on cluster boundary.
*/
@@ -1025,19 +1035,6 @@ static inline u64 bytes_to_block(const struct super_block *sb, u64 size)
return (size + sb->s_blocksize - 1) >> sb->s_blocksize_bits;
}
-static inline struct buffer_head *ntfs_bread(struct super_block *sb,
- sector_t block)
-{
- struct buffer_head *bh = sb_bread(sb, block);
-
- if (bh)
- return bh;
-
- ntfs_err(sb, "failed to read volume at offset 0x%llx",
- (u64)block << sb->s_blocksize_bits);
- return NULL;
-}
-
static inline struct ntfs_inode *ntfs_i(struct inode *inode)
{
return container_of(inode, struct ntfs_inode, vfs_inode);
diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c
index 53629b1f65e9..6aa3a9d44df1 100644
--- a/fs/ntfs3/record.c
+++ b/fs/ntfs3/record.c
@@ -279,7 +279,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr)
if (t16 > asize)
return NULL;
- if (t16 + le32_to_cpu(attr->res.data_size) > asize)
+ if (le32_to_cpu(attr->res.data_size) > asize - t16)
return NULL;
t32 = sizeof(short) * attr->name_len;
@@ -535,8 +535,20 @@ bool mi_remove_attr(struct ntfs_inode *ni, struct mft_inode *mi,
return false;
if (ni && is_attr_indexed(attr)) {
- le16_add_cpu(&ni->mi.mrec->hard_links, -1);
- ni->mi.dirty = true;
+ u16 links = le16_to_cpu(ni->mi.mrec->hard_links);
+ struct ATTR_FILE_NAME *fname =
+ attr->type != ATTR_NAME ?
+ NULL :
+ resident_data_ex(attr,
+ SIZEOF_ATTRIBUTE_FILENAME);
+ if (fname && fname->type == FILE_NAME_DOS) {
+ /* Do not decrease links count deleting DOS name. */
+ } else if (!links) {
+ /* minor error. Not critical. */
+ } else {
+ ni->mi.mrec->hard_links = cpu_to_le16(links - 1);
+ ni->mi.dirty = true;
+ }
}
used -= asize;
diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c
index 9153dffde950..cef5467fd928 100644
--- a/fs/ntfs3/super.c
+++ b/fs/ntfs3/super.c
@@ -122,13 +122,12 @@ void ntfs_inode_printk(struct inode *inode, const char *fmt, ...)
if (name) {
struct dentry *de = d_find_alias(inode);
- const u32 name_len = ARRAY_SIZE(s_name_buf) - 1;
if (de) {
spin_lock(&de->d_lock);
- snprintf(name, name_len, " \"%s\"", de->d_name.name);
+ snprintf(name, sizeof(s_name_buf), " \"%s\"",
+ de->d_name.name);
spin_unlock(&de->d_lock);
- name[name_len] = 0; /* To be sure. */
} else {
name[0] = 0;
}
@@ -625,7 +624,7 @@ static void ntfs3_free_sbi(struct ntfs_sb_info *sbi)
{
kfree(sbi->new_rec);
kvfree(ntfs_put_shared(sbi->upcase));
- kfree(sbi->def_table);
+ kvfree(sbi->def_table);
kfree(sbi->compress.lznt);
#ifdef CONFIG_NTFS3_LZX_XPRESS
xpress_free_decompressor(sbi->compress.xpress);
@@ -715,6 +714,14 @@ static int ntfs_show_options(struct seq_file *m, struct dentry *root)
}
/*
+ * ntfs_shutdown - super_operations::shutdown
+ */
+static void ntfs_shutdown(struct super_block *sb)
+{
+ set_bit(NTFS_FLAGS_SHUTDOWN_BIT, &ntfs_sb(sb)->flags);
+}
+
+/*
* ntfs_sync_fs - super_operations::sync_fs
*/
static int ntfs_sync_fs(struct super_block *sb, int wait)
@@ -724,6 +731,9 @@ static int ntfs_sync_fs(struct super_block *sb, int wait)
struct ntfs_inode *ni;
struct inode *inode;
+ if (unlikely(ntfs3_forced_shutdown(sb)))
+ return -EIO;
+
ni = sbi->security.ni;
if (ni) {
inode = &ni->vfs_inode;
@@ -763,6 +773,7 @@ static const struct super_operations ntfs_sops = {
.put_super = ntfs_put_super,
.statfs = ntfs_statfs,
.show_options = ntfs_show_options,
+ .shutdown = ntfs_shutdown,
.sync_fs = ntfs_sync_fs,
.write_inode = ntfs3_write_inode,
};
@@ -866,6 +877,7 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
u16 fn, ao;
u8 cluster_bits;
u32 boot_off = 0;
+ sector_t boot_block = 0;
const char *hint = "Primary boot";
/* Save original dev_size. Used with alternative boot. */
@@ -873,11 +885,11 @@ static int ntfs_init_from_boot(struct super_block *sb, u32 sector_size,
sbi->volume.blocks = dev_size >> PAGE_SHIFT;
- bh = ntfs_bread(sb, 0);
+read_boot:
+ bh = ntfs_bread(sb, boot_block);
if (!bh)
- return -EIO;
+ return boot_block ? -EINVAL : -EIO;
-check_boot:
err = -EINVAL;
/* Corrupted image; do not read OOB */
@@ -1108,26 +1120,24 @@ check_boot:
}
out:
- if (err == -EINVAL && !bh->b_blocknr && dev_size0 > PAGE_SHIFT) {
+ brelse(bh);
+
+ if (err == -EINVAL && !boot_block && dev_size0 > PAGE_SHIFT) {
u32 block_size = min_t(u32, sector_size, PAGE_SIZE);
u64 lbo = dev_size0 - sizeof(*boot);
- /*
- * Try alternative boot (last sector)
- */
- brelse(bh);
-
- sb_set_blocksize(sb, block_size);
- bh = ntfs_bread(sb, lbo >> blksize_bits(block_size));
- if (!bh)
- return -EINVAL;
-
+ boot_block = lbo >> blksize_bits(block_size);
boot_off = lbo & (block_size - 1);
- hint = "Alternative boot";
- dev_size = dev_size0; /* restore original size. */
- goto check_boot;
+ if (boot_block && block_size >= boot_off + sizeof(*boot)) {
+ /*
+ * Try alternative boot (last sector)
+ */
+ sb_set_blocksize(sb, block_size);
+ hint = "Alternative boot";
+ dev_size = dev_size0; /* restore original size. */
+ goto read_boot;
+ }
}
- brelse(bh);
return err;
}
diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c
index 4274b6f31cfa..53e7d1fa036a 100644
--- a/fs/ntfs3/xattr.c
+++ b/fs/ntfs3/xattr.c
@@ -219,6 +219,9 @@ static ssize_t ntfs_list_ea(struct ntfs_inode *ni, char *buffer,
if (!ea->name_len)
break;
+ if (ea->name_len > ea_size)
+ break;
+
if (buffer) {
/* Check if we can use field ea->name */
if (off + ea_size > size)
@@ -744,6 +747,9 @@ static int ntfs_getxattr(const struct xattr_handler *handler, struct dentry *de,
int err;
struct ntfs_inode *ni = ntfs_i(inode);
+ if (unlikely(ntfs3_forced_shutdown(inode->i_sb)))
+ return -EIO;
+
/* Dispatch request. */
if (!strcmp(name, SYSTEM_DOS_ATTRIB)) {
/* system.dos_attrib */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 91b32b2377ac..ea9127ba3208 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6934,7 +6934,7 @@ static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
* nonzero data on subsequent file extends.
*
* We need to call this before i_size is updated on the inode because
- * otherwise block_write_full_page() will skip writeout of pages past
+ * otherwise block_write_full_folio() will skip writeout of pages past
* i_size.
*/
int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ba790219d528..b82185075de7 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -389,21 +389,18 @@ out_unlock:
/* Note: Because we don't support holes, our allocation has
* already happened (allocation writes zeros to the file data)
* so we don't have to worry about ordered writes in
- * ocfs2_writepage.
+ * ocfs2_writepages.
*
- * ->writepage is called during the process of invalidating the page cache
+ * ->writepages is called during the process of invalidating the page cache
* during blocked lock processing. It can't block on any cluster locks
* to during block mapping. It's relying on the fact that the block
* mapping can't have disappeared under the dirty pages that it is
* being asked to write back.
*/
-static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
+static int ocfs2_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- trace_ocfs2_writepage(
- (unsigned long long)OCFS2_I(page->mapping->host)->ip_blkno,
- page->index);
-
- return block_write_full_page(page, ocfs2_get_block, wbc);
+ return mpage_writepages(mapping, wbc, ocfs2_get_block);
}
/* Taken from ext3. We don't necessarily need the full blown
@@ -2471,7 +2468,7 @@ const struct address_space_operations ocfs2_aops = {
.dirty_folio = block_dirty_folio,
.read_folio = ocfs2_read_folio,
.readahead = ocfs2_readahead,
- .writepage = ocfs2_writepage,
+ .writepages = ocfs2_writepages,
.write_begin = ocfs2_write_begin,
.write_end = ocfs2_write_end,
.bmap = ocfs2_bmap,
@@ -2480,5 +2477,5 @@ const struct address_space_operations ocfs2_aops = {
.release_folio = ocfs2_release_folio,
.migrate_folio = buffer_migrate_folio,
.is_partially_uptodate = block_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
};
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 04fc8344063a..a9b8688aaf30 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -124,17 +124,10 @@ static int ocfs2_match_dentry(struct dentry *dentry,
if (!dentry->d_fsdata)
return 0;
- if (!dentry->d_parent)
- return 0;
-
if (skip_unhashed && d_unhashed(dentry))
return 0;
parent = d_inode(dentry->d_parent);
- /* Negative parent dentry? */
- if (!parent)
- return 0;
-
/* Name is in a different directory. */
if (OCFS2_I(parent)->ip_blkno != parent_blkno)
return 0;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index a14c8fee6ee5..d620d4c53c6f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -1593,9 +1593,6 @@ int __ocfs2_add_entry(handle_t *handle,
struct buffer_head *insert_bh = lookup->dl_leaf_bh;
char *data_start = insert_bh->b_data;
- if (!namelen)
- return -EINVAL;
-
if (ocfs2_dir_indexed(dir)) {
struct buffer_head *bh;
@@ -4245,12 +4242,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb,
trace_ocfs2_prepare_dir_for_insert(
(unsigned long long)OCFS2_I(dir)->ip_blkno, namelen);
- if (!namelen) {
- ret = -EINVAL;
- mlog_errno(ret);
- goto out;
- }
-
/*
* Do this up front to reduce confusion.
*
diff --git a/fs/ocfs2/export.c b/fs/ocfs2/export.c
index eaa8c80ace3c..b8b6a191b5cb 100644
--- a/fs/ocfs2/export.c
+++ b/fs/ocfs2/export.c
@@ -280,4 +280,5 @@ const struct export_operations ocfs2_export_ops = {
.fh_to_dentry = ocfs2_fh_to_dentry,
.fh_to_parent = ocfs2_fh_to_parent,
.get_parent = ocfs2_get_parent,
+ .flags = EXPORT_OP_ASYNC_LOCK,
};
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 94e2a1244442..8b6d15010703 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -818,7 +818,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from,
/*
* fs-writeback will release the dirty pages without page lock
* whose offset are over inode size, the release happens at
- * block_write_full_page().
+ * block_write_full_folio().
*/
i_size_write(inode, abs_to);
inode->i_blocks = ocfs2_inode_sector_count(inode);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 814733ba2f4b..9221a33f917b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1336,7 +1336,7 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
goto bail;
}
- if (S_ISDIR(old_inode->i_mode)) {
+ if (S_ISDIR(old_inode->i_mode) && new_dir != old_dir) {
u64 old_inode_parent;
update_dot_dot = 1;
@@ -1353,8 +1353,7 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
goto bail;
}
- if (!new_inode && new_dir != old_dir &&
- new_dir->i_nlink >= ocfs2_link_max(osb)) {
+ if (!new_inode && new_dir->i_nlink >= ocfs2_link_max(osb)) {
status = -EMLINK;
goto bail;
}
@@ -1601,6 +1600,9 @@ static int ocfs2_rename(struct mnt_idmap *idmap,
mlog_errno(status);
goto bail;
}
+ }
+
+ if (S_ISDIR(old_inode->i_mode)) {
drop_nlink(old_dir);
if (new_inode) {
drop_nlink(new_inode);
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index ac4fd1d5b128..9898c11bdfa1 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -1157,8 +1157,6 @@ DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_get_block_end);
DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_readpage);
-DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_writepage);
-
DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_bmap);
TRACE_EVENT(ocfs2_try_to_write_inline_data,
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
index a8d5ca98fa57..20aa37b67cfb 100644
--- a/fs/ocfs2/stackglue.c
+++ b/fs/ocfs2/stackglue.c
@@ -658,7 +658,6 @@ static struct ctl_table ocfs2_nm_table[] = {
.mode = 0644,
.proc_handler = proc_dostring,
},
- { }
};
static struct ctl_table_header *ocfs2_table_header;
diff --git a/fs/open.c b/fs/open.c
index 3494a9cd8046..a84d21e55c39 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -304,6 +304,10 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
if (ret)
return ret;
+ ret = fsnotify_file_area_perm(file, MAY_WRITE, &offset, len);
+ if (ret)
+ return ret;
+
if (S_ISFIFO(inode->i_mode))
return -ESPIPE;
@@ -442,7 +446,8 @@ static const struct cred *access_override_creds(void)
* 'get_current_cred()' function), that will clear the
* non_rcu field, because now that other user may be
* expecting RCU freeing. But normal thread-synchronous
- * cred accesses will keep things non-RCY.
+ * cred accesses will keep things non-racy to avoid RCU
+ * freeing.
*/
override_cred->non_rcu = 1;
@@ -1177,44 +1182,6 @@ struct file *kernel_file_open(const struct path *path, int flags,
}
EXPORT_SYMBOL_GPL(kernel_file_open);
-/**
- * backing_file_open - open a backing file for kernel internal use
- * @user_path: path that the user reuqested to open
- * @flags: open flags
- * @real_path: path of the backing file
- * @cred: credentials for open
- *
- * Open a backing file for a stackable filesystem (e.g., overlayfs).
- * @user_path may be on the stackable filesystem and @real_path on the
- * underlying filesystem. In this case, we want to be able to return the
- * @user_path of the stackable filesystem. This is done by embedding the
- * returned file into a container structure that also stores the stacked
- * file's path, which can be retrieved using backing_file_user_path().
- */
-struct file *backing_file_open(const struct path *user_path, int flags,
- const struct path *real_path,
- const struct cred *cred)
-{
- struct file *f;
- int error;
-
- f = alloc_empty_backing_file(flags, cred);
- if (IS_ERR(f))
- return f;
-
- path_get(user_path);
- *backing_file_user_path(f) = *user_path;
- f->f_path = *real_path;
- error = do_dentry_open(f, d_inode(real_path->dentry), NULL);
- if (error) {
- fput(f);
- f = ERR_PTR(error);
- }
-
- return f;
-}
-EXPORT_SYMBOL_GPL(backing_file_open);
-
#define WILL_CREATE(flags) (flags & (O_CREAT | __O_TMPFILE))
#define O_PATH_FLAGS (O_DIRECTORY | O_NOFOLLOW | O_PATH | O_CLOEXEC)
@@ -1574,7 +1541,7 @@ SYSCALL_DEFINE1(close, unsigned int, fd)
int retval;
struct file *file;
- file = close_fd_get_file(fd);
+ file = file_close_fd(fd);
if (!file)
return -EBADF;
diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index 9cacce5d55c1..6d1fbeca9d81 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -58,10 +58,10 @@ struct orangefs_dir {
* first part of the part list.
*/
-static int do_readdir(struct orangefs_inode_s *oi,
- struct orangefs_dir *od, struct dentry *dentry,
+static int do_readdir(struct orangefs_dir *od, struct inode *inode,
struct orangefs_kernel_op_s *op)
{
+ struct orangefs_inode_s *oi = ORANGEFS_I(inode);
struct orangefs_readdir_response_s *resp;
int bufi, r;
@@ -87,7 +87,7 @@ again:
op->upcall.req.readdir.buf_index = bufi;
r = service_operation(op, "orangefs_readdir",
- get_interruptible_flag(dentry->d_inode));
+ get_interruptible_flag(inode));
orangefs_readdir_index_put(bufi);
@@ -158,8 +158,7 @@ static int parse_readdir(struct orangefs_dir *od,
return 0;
}
-static int orangefs_dir_more(struct orangefs_inode_s *oi,
- struct orangefs_dir *od, struct dentry *dentry)
+static int orangefs_dir_more(struct orangefs_dir *od, struct inode *inode)
{
struct orangefs_kernel_op_s *op;
int r;
@@ -169,7 +168,7 @@ static int orangefs_dir_more(struct orangefs_inode_s *oi,
od->error = -ENOMEM;
return -ENOMEM;
}
- r = do_readdir(oi, od, dentry, op);
+ r = do_readdir(od, inode, op);
if (r) {
od->error = r;
goto out;
@@ -238,9 +237,7 @@ next:
return 1;
}
-static int orangefs_dir_fill(struct orangefs_inode_s *oi,
- struct orangefs_dir *od, struct dentry *dentry,
- struct dir_context *ctx)
+static int orangefs_dir_fill(struct orangefs_dir *od, struct dir_context *ctx)
{
struct orangefs_dir_part *part;
size_t count;
@@ -304,15 +301,10 @@ static loff_t orangefs_dir_llseek(struct file *file, loff_t offset,
static int orangefs_dir_iterate(struct file *file,
struct dir_context *ctx)
{
- struct orangefs_inode_s *oi;
- struct orangefs_dir *od;
- struct dentry *dentry;
+ struct orangefs_dir *od = file->private_data;
+ struct inode *inode = file_inode(file);
int r;
- dentry = file->f_path.dentry;
- oi = ORANGEFS_I(dentry->d_inode);
- od = file->private_data;
-
if (od->error)
return od->error;
@@ -342,7 +334,7 @@ static int orangefs_dir_iterate(struct file *file,
*/
while (od->token != ORANGEFS_ITERATE_END &&
ctx->pos > od->end) {
- r = orangefs_dir_more(oi, od, dentry);
+ r = orangefs_dir_more(od, inode);
if (r)
return r;
}
@@ -351,17 +343,17 @@ static int orangefs_dir_iterate(struct file *file,
/* Then try to fill if there's any left in the buffer. */
if (ctx->pos < od->end) {
- r = orangefs_dir_fill(oi, od, dentry, ctx);
+ r = orangefs_dir_fill(od, ctx);
if (r)
return r;
}
/* Finally get some more and try to fill. */
if (od->token != ORANGEFS_ITERATE_END) {
- r = orangefs_dir_more(oi, od, dentry);
+ r = orangefs_dir_more(od, inode);
if (r)
return r;
- r = orangefs_dir_fill(oi, od, dentry, ctx);
+ r = orangefs_dir_fill(od, ctx);
}
return r;
diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig
index fec5020c3495..2ac67e04a6fb 100644
--- a/fs/overlayfs/Kconfig
+++ b/fs/overlayfs/Kconfig
@@ -1,6 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config OVERLAY_FS
tristate "Overlay filesystem support"
+ select FS_STACK
select EXPORTFS
help
An overlay filesystem combines two filesystems - an 'upper' filesystem
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 8bea66c97316..8586e2f5d243 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -230,6 +230,19 @@ static int ovl_copy_fileattr(struct inode *inode, const struct path *old,
return ovl_real_fileattr_set(new, &newfa);
}
+static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen)
+{
+ loff_t tmp;
+
+ if (WARN_ON_ONCE(pos != pos2))
+ return -EIO;
+ if (WARN_ON_ONCE(pos < 0 || len < 0 || totlen < 0))
+ return -EIO;
+ if (WARN_ON_ONCE(check_add_overflow(pos, len, &tmp)))
+ return -EIO;
+ return 0;
+}
+
static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
struct file *new_file, loff_t len)
{
@@ -244,7 +257,8 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
int error = 0;
ovl_path_lowerdata(dentry, &datapath);
- if (WARN_ON(datapath.dentry == NULL))
+ if (WARN_ON_ONCE(datapath.dentry == NULL) ||
+ WARN_ON_ONCE(len < 0))
return -EIO;
old_file = ovl_path_open(&datapath, O_LARGEFILE | O_RDONLY);
@@ -252,12 +266,16 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
return PTR_ERR(old_file);
/* Try to use clone_file_range to clone up within the same fs */
- ovl_start_write(dentry);
- cloned = do_clone_file_range(old_file, 0, new_file, 0, len, 0);
- ovl_end_write(dentry);
+ cloned = vfs_clone_file_range(old_file, 0, new_file, 0, len, 0);
if (cloned == len)
goto out_fput;
+
/* Couldn't clone, so now we try to copy the data */
+ error = rw_verify_area(READ, old_file, &old_pos, len);
+ if (!error)
+ error = rw_verify_area(WRITE, new_file, &new_pos, len);
+ if (error)
+ goto out_fput;
/* Check if lower fs supports seek operation */
if (old_file->f_mode & FMODE_LSEEK)
@@ -265,7 +283,7 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
while (len) {
size_t this_len = OVL_COPY_UP_CHUNK_SIZE;
- long bytes;
+ ssize_t bytes;
if (len < this_len)
this_len = len;
@@ -309,11 +327,13 @@ static int ovl_copy_up_file(struct ovl_fs *ofs, struct dentry *dentry,
}
}
- ovl_start_write(dentry);
+ error = ovl_verify_area(old_pos, new_pos, this_len, len);
+ if (error)
+ break;
+
bytes = do_splice_direct(old_file, &old_pos,
new_file, &new_pos,
this_len, SPLICE_F_MOVE);
- ovl_end_write(dentry);
if (bytes <= 0) {
error = bytes;
break;
@@ -722,7 +742,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
struct inode *inode;
struct inode *udir = d_inode(c->destdir), *wdir = d_inode(c->workdir);
struct path path = { .mnt = ovl_upper_mnt(ofs) };
- struct dentry *temp, *upper;
+ struct dentry *temp, *upper, *trap;
struct ovl_cu_creds cc;
int err;
struct ovl_cattr cattr = {
@@ -759,11 +779,13 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
* temp wasn't moved before copy up completion or cleanup.
*/
ovl_start_write(c->dentry);
- if (lock_rename(c->workdir, c->destdir) != NULL ||
- temp->d_parent != c->workdir) {
+ trap = lock_rename(c->workdir, c->destdir);
+ if (trap || temp->d_parent != c->workdir) {
/* temp or workdir moved underneath us? abort without cleanup */
dput(temp);
err = -EIO;
+ if (IS_ERR(trap))
+ goto out;
goto unlock;
} else if (err) {
goto cleanup;
@@ -804,6 +826,7 @@ static int ovl_copy_up_workdir(struct ovl_copy_up_ctx *c)
ovl_set_flag(OVL_WHITEOUTS, inode);
unlock:
unlock_rename(c->workdir, c->destdir);
+out:
ovl_end_write(c->dentry);
return err;
@@ -931,6 +954,13 @@ static int ovl_do_copy_up(struct ovl_copy_up_ctx *c)
goto out_free_fh;
} else {
/*
+ * c->dentry->d_name is stabilzed by ovl_copy_up_start(),
+ * because if we got here, it means that c->dentry has no upper
+ * alias and changing ->d_name means going through ovl_rename()
+ * that will call ovl_copy_up() on source and target dentry.
+ */
+ c->destname = c->dentry->d_name;
+ /*
* Mark parent "impure" because it may now contain non-pure
* upper
*/
@@ -1110,7 +1140,6 @@ static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry,
if (parent) {
ovl_path_upper(parent, &parentpath);
ctx.destdir = parentpath.dentry;
- ctx.destname = dentry->d_name;
err = vfs_getattr(&parentpath, &ctx.pstat,
STATX_ATIME | STATX_MTIME,
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index aab3f5d93556..0f8b4a719237 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -1180,6 +1180,10 @@ static int ovl_rename(struct mnt_idmap *idmap, struct inode *olddir,
}
trap = lock_rename(new_upperdir, old_upperdir);
+ if (IS_ERR(trap)) {
+ err = PTR_ERR(trap);
+ goto out_revert_creds;
+ }
olddentry = ovl_lookup_upper(ofs, old->d_name.name, old_upperdir,
old->d_name.len);
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index 7e16bbcad95e..063409069f56 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -289,7 +289,6 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb,
{
struct dentry *lower = lowerpath ? lowerpath->dentry : NULL;
struct dentry *upper = upper_alias ?: index;
- struct dentry *dentry;
struct inode *inode = NULL;
struct ovl_entry *oe;
struct ovl_inode_params oip = {
@@ -320,27 +319,7 @@ static struct dentry *ovl_obtain_alias(struct super_block *sb,
if (upper)
ovl_set_flag(OVL_UPPERDATA, inode);
- dentry = d_find_any_alias(inode);
- if (dentry)
- goto out_iput;
-
- dentry = d_alloc_anon(inode->i_sb);
- if (unlikely(!dentry))
- goto nomem;
-
- if (upper_alias)
- ovl_dentry_set_upper_alias(dentry);
-
- ovl_dentry_init_reval(dentry, upper, OVL_I_E(inode));
-
- return d_instantiate_anon(dentry, inode);
-
-nomem:
- dput(dentry);
- dentry = ERR_PTR(-ENOMEM);
-out_iput:
- iput(inode);
- return dentry;
+ return d_obtain_alias(inode);
}
/* Get the upper or lower dentry in stack whose on layer @idx */
@@ -460,7 +439,7 @@ static struct dentry *ovl_lookup_real_inode(struct super_block *sb,
* For decoded lower dir file handle, lookup index by origin to check
* if lower dir was copied up and and/or removed.
*/
- if (!this && layer->idx && ofs->indexdir && !WARN_ON(!d_is_dir(real))) {
+ if (!this && layer->idx && ovl_indexdir(sb) && !WARN_ON(!d_is_dir(real))) {
index = ovl_lookup_index(ofs, NULL, real, false);
if (IS_ERR(index))
return index;
@@ -733,7 +712,7 @@ static struct dentry *ovl_lower_fh_to_d(struct super_block *sb,
}
/* Then lookup indexed upper/whiteout by origin fh */
- if (ofs->indexdir) {
+ if (ovl_indexdir(sb)) {
index = ovl_get_index_fh(ofs, fh);
err = PTR_ERR(index);
if (IS_ERR(index)) {
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 131621daeb13..05536964d37f 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -9,25 +9,11 @@
#include <linux/xattr.h>
#include <linux/uio.h>
#include <linux/uaccess.h>
-#include <linux/splice.h>
#include <linux/security.h>
-#include <linux/mm.h>
#include <linux/fs.h>
+#include <linux/backing-file.h>
#include "overlayfs.h"
-#include "../internal.h" /* for sb_init_dio_done_wq */
-
-struct ovl_aio_req {
- struct kiocb iocb;
- refcount_t ref;
- struct kiocb *orig_iocb;
- /* used for aio completion */
- struct work_struct work;
- long res;
-};
-
-static struct kmem_cache *ovl_aio_request_cachep;
-
static char ovl_whatisit(struct inode *inode, struct inode *realinode)
{
if (realinode != ovl_inode_upper(inode))
@@ -274,83 +260,16 @@ static void ovl_file_accessed(struct file *file)
touch_atime(&file->f_path);
}
-#define OVL_IOCB_MASK \
- (IOCB_NOWAIT | IOCB_HIPRI | IOCB_DSYNC | IOCB_SYNC | IOCB_APPEND)
-
-static rwf_t iocb_to_rw_flags(int flags)
-{
- return (__force rwf_t)(flags & OVL_IOCB_MASK);
-}
-
-static inline void ovl_aio_put(struct ovl_aio_req *aio_req)
-{
- if (refcount_dec_and_test(&aio_req->ref)) {
- fput(aio_req->iocb.ki_filp);
- kmem_cache_free(ovl_aio_request_cachep, aio_req);
- }
-}
-
-static void ovl_aio_cleanup_handler(struct ovl_aio_req *aio_req)
-{
- struct kiocb *iocb = &aio_req->iocb;
- struct kiocb *orig_iocb = aio_req->orig_iocb;
-
- if (iocb->ki_flags & IOCB_WRITE) {
- kiocb_end_write(iocb);
- ovl_file_modified(orig_iocb->ki_filp);
- }
-
- orig_iocb->ki_pos = iocb->ki_pos;
- ovl_aio_put(aio_req);
-}
-
-static void ovl_aio_rw_complete(struct kiocb *iocb, long res)
-{
- struct ovl_aio_req *aio_req = container_of(iocb,
- struct ovl_aio_req, iocb);
- struct kiocb *orig_iocb = aio_req->orig_iocb;
-
- ovl_aio_cleanup_handler(aio_req);
- orig_iocb->ki_complete(orig_iocb, res);
-}
-
-static void ovl_aio_complete_work(struct work_struct *work)
-{
- struct ovl_aio_req *aio_req = container_of(work,
- struct ovl_aio_req, work);
-
- ovl_aio_rw_complete(&aio_req->iocb, aio_req->res);
-}
-
-static void ovl_aio_queue_completion(struct kiocb *iocb, long res)
-{
- struct ovl_aio_req *aio_req = container_of(iocb,
- struct ovl_aio_req, iocb);
- struct kiocb *orig_iocb = aio_req->orig_iocb;
-
- /*
- * Punt to a work queue to serialize updates of mtime/size.
- */
- aio_req->res = res;
- INIT_WORK(&aio_req->work, ovl_aio_complete_work);
- queue_work(file_inode(orig_iocb->ki_filp)->i_sb->s_dio_done_wq,
- &aio_req->work);
-}
-
-static int ovl_init_aio_done_wq(struct super_block *sb)
-{
- if (sb->s_dio_done_wq)
- return 0;
-
- return sb_init_dio_done_wq(sb);
-}
-
static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
{
struct file *file = iocb->ki_filp;
struct fd real;
- const struct cred *old_cred;
ssize_t ret;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(file_inode(file)->i_sb),
+ .user_file = file,
+ .accessed = ovl_file_accessed,
+ };
if (!iov_iter_count(iter))
return 0;
@@ -359,37 +278,8 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
if (ret)
return ret;
- ret = -EINVAL;
- if (iocb->ki_flags & IOCB_DIRECT &&
- !(real.file->f_mode & FMODE_CAN_ODIRECT))
- goto out_fdput;
-
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- if (is_sync_kiocb(iocb)) {
- rwf_t rwf = iocb_to_rw_flags(iocb->ki_flags);
-
- ret = vfs_iter_read(real.file, iter, &iocb->ki_pos, rwf);
- } else {
- struct ovl_aio_req *aio_req;
-
- ret = -ENOMEM;
- aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
- if (!aio_req)
- goto out;
-
- aio_req->orig_iocb = iocb;
- kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
- aio_req->iocb.ki_complete = ovl_aio_rw_complete;
- refcount_set(&aio_req->ref, 2);
- ret = vfs_iocb_iter_read(real.file, &aio_req->iocb, iter);
- ovl_aio_put(aio_req);
- if (ret != -EIOCBQUEUED)
- ovl_aio_cleanup_handler(aio_req);
- }
-out:
- revert_creds(old_cred);
- ovl_file_accessed(file);
-out_fdput:
+ ret = backing_file_read_iter(real.file, iter, iocb, iocb->ki_flags,
+ &ctx);
fdput(real);
return ret;
@@ -400,9 +290,13 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
struct fd real;
- const struct cred *old_cred;
ssize_t ret;
int ifl = iocb->ki_flags;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(inode->i_sb),
+ .user_file = file,
+ .end_write = ovl_file_modified,
+ };
if (!iov_iter_count(iter))
return 0;
@@ -410,19 +304,11 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
inode_lock(inode);
/* Update mode */
ovl_copyattr(inode);
- ret = file_remove_privs(file);
- if (ret)
- goto out_unlock;
ret = ovl_real_fdget(file, &real);
if (ret)
goto out_unlock;
- ret = -EINVAL;
- if (iocb->ki_flags & IOCB_DIRECT &&
- !(real.file->f_mode & FMODE_CAN_ODIRECT))
- goto out_fdput;
-
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
ifl &= ~(IOCB_DSYNC | IOCB_SYNC);
@@ -431,42 +317,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
* this property in case it is set by the issuer.
*/
ifl &= ~IOCB_DIO_CALLER_COMP;
-
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- if (is_sync_kiocb(iocb)) {
- rwf_t rwf = iocb_to_rw_flags(ifl);
-
- file_start_write(real.file);
- ret = vfs_iter_write(real.file, iter, &iocb->ki_pos, rwf);
- file_end_write(real.file);
- /* Update size */
- ovl_file_modified(file);
- } else {
- struct ovl_aio_req *aio_req;
-
- ret = ovl_init_aio_done_wq(inode->i_sb);
- if (ret)
- goto out;
-
- ret = -ENOMEM;
- aio_req = kmem_cache_zalloc(ovl_aio_request_cachep, GFP_KERNEL);
- if (!aio_req)
- goto out;
-
- aio_req->orig_iocb = iocb;
- kiocb_clone(&aio_req->iocb, iocb, get_file(real.file));
- aio_req->iocb.ki_flags = ifl;
- aio_req->iocb.ki_complete = ovl_aio_queue_completion;
- refcount_set(&aio_req->ref, 2);
- kiocb_start_write(&aio_req->iocb);
- ret = vfs_iocb_iter_write(real.file, &aio_req->iocb, iter);
- ovl_aio_put(aio_req);
- if (ret != -EIOCBQUEUED)
- ovl_aio_cleanup_handler(aio_req);
- }
-out:
- revert_creds(old_cred);
-out_fdput:
+ ret = backing_file_write_iter(real.file, iter, iocb, ifl, &ctx);
fdput(real);
out_unlock:
@@ -479,20 +330,21 @@ static ssize_t ovl_splice_read(struct file *in, loff_t *ppos,
struct pipe_inode_info *pipe, size_t len,
unsigned int flags)
{
- const struct cred *old_cred;
struct fd real;
ssize_t ret;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(file_inode(in)->i_sb),
+ .user_file = in,
+ .accessed = ovl_file_accessed,
+ };
ret = ovl_real_fdget(in, &real);
if (ret)
return ret;
- old_cred = ovl_override_creds(file_inode(in)->i_sb);
- ret = vfs_splice_read(real.file, ppos, pipe, len, flags);
- revert_creds(old_cred);
- ovl_file_accessed(in);
-
+ ret = backing_file_splice_read(real.file, ppos, pipe, len, flags, &ctx);
fdput(real);
+
return ret;
}
@@ -508,30 +360,23 @@ static ssize_t ovl_splice_write(struct pipe_inode_info *pipe, struct file *out,
loff_t *ppos, size_t len, unsigned int flags)
{
struct fd real;
- const struct cred *old_cred;
struct inode *inode = file_inode(out);
ssize_t ret;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(inode->i_sb),
+ .user_file = out,
+ .end_write = ovl_file_modified,
+ };
inode_lock(inode);
/* Update mode */
ovl_copyattr(inode);
- ret = file_remove_privs(out);
- if (ret)
- goto out_unlock;
ret = ovl_real_fdget(out, &real);
if (ret)
goto out_unlock;
- old_cred = ovl_override_creds(inode->i_sb);
- file_start_write(real.file);
-
- ret = iter_file_splice_write(pipe, real.file, ppos, len, flags);
-
- file_end_write(real.file);
- /* Update size */
- ovl_file_modified(out);
- revert_creds(old_cred);
+ ret = backing_file_splice_write(pipe, real.file, ppos, len, flags, &ctx);
fdput(real);
out_unlock:
@@ -569,23 +414,13 @@ static int ovl_fsync(struct file *file, loff_t start, loff_t end, int datasync)
static int ovl_mmap(struct file *file, struct vm_area_struct *vma)
{
struct file *realfile = file->private_data;
- const struct cred *old_cred;
- int ret;
-
- if (!realfile->f_op->mmap)
- return -ENODEV;
+ struct backing_file_ctx ctx = {
+ .cred = ovl_creds(file_inode(file)->i_sb),
+ .user_file = file,
+ .accessed = ovl_file_accessed,
+ };
- if (WARN_ON(file != vma->vm_file))
- return -EIO;
-
- vma_set_file(vma, realfile);
-
- old_cred = ovl_override_creds(file_inode(file)->i_sb);
- ret = call_mmap(vma->vm_file, vma);
- revert_creds(old_cred);
- ovl_file_accessed(file);
-
- return ret;
+ return backing_file_mmap(realfile, vma, &ctx);
}
static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
@@ -778,19 +613,3 @@ const struct file_operations ovl_file_operations = {
.copy_file_range = ovl_copy_file_range,
.remap_file_range = ovl_remap_file_range,
};
-
-int __init ovl_aio_request_cache_init(void)
-{
- ovl_aio_request_cachep = kmem_cache_create("ovl_aio_req",
- sizeof(struct ovl_aio_req),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!ovl_aio_request_cachep)
- return -ENOMEM;
-
- return 0;
-}
-
-void ovl_aio_request_cache_destroy(void)
-{
- kmem_cache_destroy(ovl_aio_request_cachep);
-}
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index 03bc8d5dfa31..5764f91d283e 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -18,10 +18,11 @@
struct ovl_lookup_data {
struct super_block *sb;
- struct vfsmount *mnt;
+ const struct ovl_layer *layer;
struct qstr name;
bool is_dir;
bool opaque;
+ bool xwhiteouts;
bool stop;
bool last;
char *redirect;
@@ -201,17 +202,13 @@ struct dentry *ovl_decode_real_fh(struct ovl_fs *ofs, struct ovl_fh *fh,
return real;
}
-static bool ovl_is_opaquedir(struct ovl_fs *ofs, const struct path *path)
-{
- return ovl_path_check_dir_xattr(ofs, path, OVL_XATTR_OPAQUE);
-}
-
static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d,
const char *name,
struct dentry *base, int len,
bool drop_negative)
{
- struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->mnt), name, base, len);
+ struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name,
+ base, len);
if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) {
if (drop_negative && ret->d_lockref.count == 1) {
@@ -232,10 +229,13 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
size_t prelen, const char *post,
struct dentry **ret, bool drop_negative)
{
+ struct ovl_fs *ofs = OVL_FS(d->sb);
struct dentry *this;
struct path path;
int err;
bool last_element = !post[0];
+ bool is_upper = d->layer->idx == 0;
+ char val;
this = ovl_lookup_positive_unlocked(d, name, base, namelen, drop_negative);
if (IS_ERR(this)) {
@@ -253,8 +253,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
}
path.dentry = this;
- path.mnt = d->mnt;
- if (ovl_path_is_whiteout(OVL_FS(d->sb), &path)) {
+ path.mnt = d->layer->mnt;
+ if (ovl_path_is_whiteout(ofs, &path)) {
d->stop = d->opaque = true;
goto put_and_out;
}
@@ -272,7 +272,7 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
d->stop = true;
goto put_and_out;
}
- err = ovl_check_metacopy_xattr(OVL_FS(d->sb), &path, NULL);
+ err = ovl_check_metacopy_xattr(ofs, &path, NULL);
if (err < 0)
goto out_err;
@@ -292,7 +292,12 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d,
if (d->last)
goto out;
- if (ovl_is_opaquedir(OVL_FS(d->sb), &path)) {
+ /* overlay.opaque=x means xwhiteouts directory */
+ val = ovl_get_opaquedir_val(ofs, &path);
+ if (last_element && !is_upper && val == 'x') {
+ d->xwhiteouts = true;
+ ovl_layer_set_xwhiteouts(ofs, d->layer);
+ } else if (val == 'y') {
d->stop = true;
if (last_element)
d->opaque = true;
@@ -754,7 +759,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh)
if (err)
return ERR_PTR(err);
- index = lookup_positive_unlocked(name.name, ofs->indexdir, name.len);
+ index = lookup_positive_unlocked(name.name, ofs->workdir, name.len);
kfree(name.name);
if (IS_ERR(index)) {
if (PTR_ERR(index) == -ENOENT)
@@ -787,7 +792,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
return ERR_PTR(err);
index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), name.name,
- ofs->indexdir, name.len);
+ ofs->workdir, name.len);
if (IS_ERR(index)) {
err = PTR_ERR(index);
if (err == -ENOENT) {
@@ -863,7 +868,8 @@ fail:
* Returns next layer in stack starting from top.
* Returns -1 if this is the last layer.
*/
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path,
+ const struct ovl_layer **layer)
{
struct ovl_entry *oe = OVL_E(dentry);
struct ovl_path *lowerstack = ovl_lowerstack(oe);
@@ -871,13 +877,16 @@ int ovl_path_next(int idx, struct dentry *dentry, struct path *path)
BUG_ON(idx < 0);
if (idx == 0) {
ovl_path_upper(dentry, path);
- if (path->dentry)
+ if (path->dentry) {
+ *layer = &OVL_FS(dentry->d_sb)->layers[0];
return ovl_numlower(oe) ? 1 : -1;
+ }
idx++;
}
BUG_ON(idx > ovl_numlower(oe));
path->dentry = lowerstack[idx - 1].dentry;
- path->mnt = lowerstack[idx - 1].layer->mnt;
+ *layer = lowerstack[idx - 1].layer;
+ path->mnt = (*layer)->mnt;
return (idx < ovl_numlower(oe)) ? idx + 1 : -1;
}
@@ -1055,7 +1064,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
old_cred = ovl_override_creds(dentry->d_sb);
upperdir = ovl_dentry_upper(dentry->d_parent);
if (upperdir) {
- d.mnt = ovl_upper_mnt(ofs);
+ d.layer = &ofs->layers[0];
err = ovl_lookup_layer(upperdir, &d, &upperdentry, true);
if (err)
goto out;
@@ -1111,7 +1120,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
else if (d.is_dir || !ofs->numdatalayer)
d.last = lower.layer->idx == ovl_numlower(roe);
- d.mnt = lower.layer->mnt;
+ d.layer = lower.layer;
err = ovl_lookup_layer(lower.dentry, &d, &this, false);
if (err)
goto out_put;
@@ -1278,6 +1287,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
if (upperopaque)
ovl_dentry_set_opaque(dentry);
+ if (d.xwhiteouts)
+ ovl_dentry_set_xwhiteouts(dentry);
if (upperdentry)
ovl_dentry_set_upper_alias(dentry);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index 05c3dd597fa8..ee949f3e7c77 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -50,7 +50,6 @@ enum ovl_xattr {
OVL_XATTR_METACOPY,
OVL_XATTR_PROTATTR,
OVL_XATTR_XWHITEOUT,
- OVL_XATTR_XWHITEOUTS,
};
enum ovl_inode_flag {
@@ -70,6 +69,8 @@ enum ovl_entry_flag {
OVL_E_UPPER_ALIAS,
OVL_E_OPAQUE,
OVL_E_CONNECTED,
+ /* Lower stack may contain xwhiteout entries */
+ OVL_E_XWHITEOUTS,
};
enum {
@@ -425,6 +426,12 @@ int ovl_want_write(struct dentry *dentry);
void ovl_drop_write(struct dentry *dentry);
struct dentry *ovl_workdir(struct dentry *dentry);
const struct cred *ovl_override_creds(struct super_block *sb);
+
+static inline const struct cred *ovl_creds(struct super_block *sb)
+{
+ return OVL_FS(sb)->creator_cred;
+}
+
int ovl_can_decode_fh(struct super_block *sb);
struct dentry *ovl_indexdir(struct super_block *sb);
bool ovl_index_all(struct super_block *sb);
@@ -471,6 +478,10 @@ bool ovl_dentry_test_flag(unsigned long flag, struct dentry *dentry);
bool ovl_dentry_is_opaque(struct dentry *dentry);
bool ovl_dentry_is_whiteout(struct dentry *dentry);
void ovl_dentry_set_opaque(struct dentry *dentry);
+bool ovl_dentry_has_xwhiteouts(struct dentry *dentry);
+void ovl_dentry_set_xwhiteouts(struct dentry *dentry);
+void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs,
+ const struct ovl_layer *layer);
bool ovl_dentry_has_upper_alias(struct dentry *dentry);
void ovl_dentry_set_upper_alias(struct dentry *dentry);
bool ovl_dentry_needs_data_copy_up(struct dentry *dentry, int flags);
@@ -488,11 +499,10 @@ struct file *ovl_path_open(const struct path *path, int flags);
int ovl_copy_up_start(struct dentry *dentry, int flags);
void ovl_copy_up_end(struct dentry *dentry);
bool ovl_already_copied_up(struct dentry *dentry, int flags);
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
- enum ovl_xattr ox);
+char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path,
+ enum ovl_xattr ox);
bool ovl_path_check_origin_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path);
-bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path);
bool ovl_init_uuid_xattr(struct super_block *sb, struct ovl_fs *ofs,
const struct path *upperpath);
@@ -567,7 +577,13 @@ static inline bool ovl_is_impuredir(struct super_block *sb,
.mnt = ovl_upper_mnt(ofs),
};
- return ovl_path_check_dir_xattr(ofs, &upperpath, OVL_XATTR_IMPURE);
+ return ovl_get_dir_xattr_val(ofs, &upperpath, OVL_XATTR_IMPURE) == 'y';
+}
+
+static inline char ovl_get_opaquedir_val(struct ovl_fs *ofs,
+ const struct path *path)
+{
+ return ovl_get_dir_xattr_val(ofs, path, OVL_XATTR_OPAQUE);
}
static inline bool ovl_redirect_follow(struct ovl_fs *ofs)
@@ -674,7 +690,8 @@ int ovl_get_index_name(struct ovl_fs *ofs, struct dentry *origin,
struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh);
struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
struct dentry *origin, bool verify);
-int ovl_path_next(int idx, struct dentry *dentry, struct path *path);
+int ovl_path_next(int idx, struct dentry *dentry, struct path *path,
+ const struct ovl_layer **layer);
int ovl_verify_lowerdata(struct dentry *dentry);
struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags);
@@ -837,8 +854,6 @@ struct dentry *ovl_create_temp(struct ovl_fs *ofs, struct dentry *workdir,
/* file.c */
extern const struct file_operations ovl_file_operations;
-int __init ovl_aio_request_cache_init(void);
-void ovl_aio_request_cache_destroy(void);
int ovl_real_fileattr_get(const struct path *realpath, struct fileattr *fa);
int ovl_real_fileattr_set(const struct path *realpath, struct fileattr *fa);
int ovl_fileattr_get(struct dentry *dentry, struct fileattr *fa);
diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h
index d82d2a043da2..cb449ab310a7 100644
--- a/fs/overlayfs/ovl_entry.h
+++ b/fs/overlayfs/ovl_entry.h
@@ -40,6 +40,8 @@ struct ovl_layer {
int idx;
/* One fsid per unique underlying sb (upper fsid == 0) */
int fsid;
+ /* xwhiteouts were found on this layer */
+ bool has_xwhiteouts;
};
struct ovl_path {
@@ -59,14 +61,12 @@ struct ovl_fs {
unsigned int numfs;
/* Number of data-only lower layers */
unsigned int numdatalayer;
- const struct ovl_layer *layers;
+ struct ovl_layer *layers;
struct ovl_sb *fs;
/* workbasedir is the path at workdir= mount option */
struct dentry *workbasedir;
- /* workdir is the 'work' directory under workbasedir */
+ /* workdir is the 'work' or 'index' directory under workbasedir */
struct dentry *workdir;
- /* index directory listing overlay inodes by origin file handle */
- struct dentry *indexdir;
long namelen;
/* pathnames of lower and upper dirs, for show_options */
struct ovl_config config;
@@ -81,7 +81,6 @@ struct ovl_fs {
/* Traps in ovl inode cache */
struct inode *workbasedir_trap;
struct inode *workdir_trap;
- struct inode *indexdir_trap;
/* -1: disabled, 0: same fs, 1..32: number of unused ino bits */
int xino_mode;
/* For allocation of non-persistent inode numbers */
diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c
index 3fe2dde1598f..112b4b12f825 100644
--- a/fs/overlayfs/params.c
+++ b/fs/overlayfs/params.c
@@ -743,10 +743,8 @@ void ovl_free_fs(struct ovl_fs *ofs)
unsigned i;
iput(ofs->workbasedir_trap);
- iput(ofs->indexdir_trap);
iput(ofs->workdir_trap);
dput(ofs->whiteout);
- dput(ofs->indexdir);
dput(ofs->workdir);
if (ofs->workdir_locked)
ovl_inuse_unlock(ofs->workbasedir);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index a490fc47c3e7..0ca8af060b0c 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -305,8 +305,6 @@ static inline int ovl_dir_read(const struct path *realpath,
if (IS_ERR(realfile))
return PTR_ERR(realfile);
- rdd->in_xwhiteouts_dir = rdd->dentry &&
- ovl_path_check_xwhiteouts_xattr(OVL_FS(rdd->dentry->d_sb), realpath);
rdd->first_maybe_whiteout = NULL;
rdd->ctx.pos = 0;
do {
@@ -359,10 +357,13 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list,
.is_lowest = false,
};
int idx, next;
+ const struct ovl_layer *layer;
for (idx = 0; idx != -1; idx = next) {
- next = ovl_path_next(idx, dentry, &realpath);
+ next = ovl_path_next(idx, dentry, &realpath, &layer);
rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry;
+ rdd.in_xwhiteouts_dir = layer->has_xwhiteouts &&
+ ovl_dentry_has_xwhiteouts(dentry);
if (next != -1) {
err = ovl_dir_read(&realpath, &rdd);
@@ -1169,7 +1170,7 @@ int ovl_workdir_cleanup(struct ovl_fs *ofs, struct inode *dir,
int ovl_indexdir_cleanup(struct ovl_fs *ofs)
{
int err;
- struct dentry *indexdir = ofs->indexdir;
+ struct dentry *indexdir = ofs->workdir;
struct dentry *index = NULL;
struct inode *dir = indexdir->d_inode;
struct path path = { .mnt = ovl_upper_mnt(ofs), .dentry = indexdir };
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index a0967bb25003..2eef6c70b2ae 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -439,8 +439,10 @@ static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir)
bool ok = false;
if (workdir != upperdir) {
- ok = (lock_rename(workdir, upperdir) == NULL);
- unlock_rename(workdir, upperdir);
+ struct dentry *trap = lock_rename(workdir, upperdir);
+ if (!IS_ERR(trap))
+ unlock_rename(workdir, upperdir);
+ ok = (trap == NULL);
}
return ok;
}
@@ -853,10 +855,8 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
if (IS_ERR(indexdir)) {
err = PTR_ERR(indexdir);
} else if (indexdir) {
- ofs->indexdir = indexdir;
- ofs->workdir = dget(indexdir);
-
- err = ovl_setup_trap(sb, ofs->indexdir, &ofs->indexdir_trap,
+ ofs->workdir = indexdir;
+ err = ovl_setup_trap(sb, indexdir, &ofs->workdir_trap,
"indexdir");
if (err)
goto out;
@@ -869,16 +869,15 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
* ".overlay.upper" to indicate that index may have
* directory entries.
*/
- if (ovl_check_origin_xattr(ofs, ofs->indexdir)) {
- err = ovl_verify_origin_xattr(ofs, ofs->indexdir,
+ if (ovl_check_origin_xattr(ofs, indexdir)) {
+ err = ovl_verify_origin_xattr(ofs, indexdir,
OVL_XATTR_ORIGIN,
upperpath->dentry, true,
false);
if (err)
pr_err("failed to verify index dir 'origin' xattr\n");
}
- err = ovl_verify_upper(ofs, ofs->indexdir, upperpath->dentry,
- true);
+ err = ovl_verify_upper(ofs, indexdir, upperpath->dentry, true);
if (err)
pr_err("failed to verify index dir 'upper' xattr\n");
@@ -886,7 +885,7 @@ static int ovl_get_indexdir(struct super_block *sb, struct ovl_fs *ofs,
if (!err)
err = ovl_indexdir_cleanup(ofs);
}
- if (err || !ofs->indexdir)
+ if (err || !indexdir)
pr_warn("try deleting index dir or mounting with '-o index=off' to disable inodes index.\n");
out:
@@ -1250,6 +1249,7 @@ static struct dentry *ovl_get_root(struct super_block *sb,
struct ovl_entry *oe)
{
struct dentry *root;
+ struct ovl_fs *ofs = OVL_FS(sb);
struct ovl_path *lowerpath = ovl_lowerstack(oe);
unsigned long ino = d_inode(lowerpath->dentry)->i_ino;
int fsid = lowerpath->layer->fsid;
@@ -1271,6 +1271,20 @@ static struct dentry *ovl_get_root(struct super_block *sb,
ovl_set_flag(OVL_IMPURE, d_inode(root));
}
+ /* Look for xwhiteouts marker except in the lowermost layer */
+ for (int i = 0; i < ovl_numlower(oe) - 1; i++, lowerpath++) {
+ struct path path = {
+ .mnt = lowerpath->layer->mnt,
+ .dentry = lowerpath->dentry,
+ };
+
+ /* overlay.opaque=x means xwhiteouts directory */
+ if (ovl_get_opaquedir_val(ofs, &path) == 'x') {
+ ovl_layer_set_xwhiteouts(ofs, lowerpath->layer);
+ ovl_dentry_set_xwhiteouts(root);
+ }
+ }
+
/* Root is always merge -> can have whiteouts */
ovl_set_flag(OVL_WHITEOUTS, d_inode(root));
ovl_dentry_set_flag(OVL_E_CONNECTED, root);
@@ -1406,7 +1420,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
goto out_free_oe;
/* Force r/o mount with no index dir */
- if (!ofs->indexdir)
+ if (!ofs->workdir)
sb->s_flags |= SB_RDONLY;
}
@@ -1415,7 +1429,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
goto out_free_oe;
/* Show index=off in /proc/mounts for forced r/o mount */
- if (!ofs->indexdir) {
+ if (!ofs->workdir) {
ofs->config.index = false;
if (ovl_upper_mnt(ofs) && ofs->config.nfs_export) {
pr_warn("NFS export requires an index dir, falling back to nfs_export=off.\n");
@@ -1454,6 +1468,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc)
* lead to unexpected results.
*/
sb->s_iflags |= SB_I_NOUMASK;
+ sb->s_iflags |= SB_I_EVM_UNSUPPORTED;
err = -ENOMEM;
root_dentry = ovl_get_root(sb, ctx->upper.dentry, oe);
@@ -1501,14 +1516,10 @@ static int __init ovl_init(void)
if (ovl_inode_cachep == NULL)
return -ENOMEM;
- err = ovl_aio_request_cache_init();
- if (!err) {
- err = register_filesystem(&ovl_fs_type);
- if (!err)
- return 0;
+ err = register_filesystem(&ovl_fs_type);
+ if (!err)
+ return 0;
- ovl_aio_request_cache_destroy();
- }
kmem_cache_destroy(ovl_inode_cachep);
return err;
@@ -1524,7 +1535,6 @@ static void __exit ovl_exit(void)
*/
rcu_barrier();
kmem_cache_destroy(ovl_inode_cachep);
- ovl_aio_request_cache_destroy();
}
module_init(ovl_init);
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index c3f020ca13a8..a8e17f14d7a2 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -91,7 +91,7 @@ struct dentry *ovl_indexdir(struct super_block *sb)
{
struct ovl_fs *ofs = OVL_FS(sb);
- return ofs->indexdir;
+ return ofs->config.index ? ofs->workdir : NULL;
}
/* Index all files on copy up. For now only enabled for NFS export */
@@ -461,6 +461,33 @@ void ovl_dentry_set_opaque(struct dentry *dentry)
ovl_dentry_set_flag(OVL_E_OPAQUE, dentry);
}
+bool ovl_dentry_has_xwhiteouts(struct dentry *dentry)
+{
+ return ovl_dentry_test_flag(OVL_E_XWHITEOUTS, dentry);
+}
+
+void ovl_dentry_set_xwhiteouts(struct dentry *dentry)
+{
+ ovl_dentry_set_flag(OVL_E_XWHITEOUTS, dentry);
+}
+
+/*
+ * ovl_layer_set_xwhiteouts() is called before adding the overlay dir
+ * dentry to dcache, while readdir of that same directory happens after
+ * the overlay dir dentry is in dcache, so if some cpu observes that
+ * ovl_dentry_is_xwhiteouts(), it will also observe layer->has_xwhiteouts
+ * for the layers where xwhiteouts marker was found in that merge dir.
+ */
+void ovl_layer_set_xwhiteouts(struct ovl_fs *ofs,
+ const struct ovl_layer *layer)
+{
+ if (layer->has_xwhiteouts)
+ return;
+
+ /* Write once to read-mostly layer properties */
+ ofs->layers[layer->idx].has_xwhiteouts = true;
+}
+
/*
* For hard links and decoded file handles, it's possible for ovl_dentry_upper()
* to return positive, while there's no actual upper alias for the inode.
@@ -739,19 +766,6 @@ bool ovl_path_check_xwhiteout_xattr(struct ovl_fs *ofs, const struct path *path)
return res >= 0;
}
-bool ovl_path_check_xwhiteouts_xattr(struct ovl_fs *ofs, const struct path *path)
-{
- struct dentry *dentry = path->dentry;
- int res;
-
- /* xattr.whiteouts must be a directory */
- if (!d_is_dir(dentry))
- return false;
-
- res = ovl_path_getxattr(ofs, path, OVL_XATTR_XWHITEOUTS, NULL, 0);
- return res >= 0;
-}
-
/*
* Load persistent uuid from xattr into s_uuid if found, or store a new
* random generated value in s_uuid and in xattr.
@@ -811,20 +825,17 @@ fail:
return false;
}
-bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
- enum ovl_xattr ox)
+char ovl_get_dir_xattr_val(struct ovl_fs *ofs, const struct path *path,
+ enum ovl_xattr ox)
{
int res;
char val;
if (!d_is_dir(path->dentry))
- return false;
+ return 0;
res = ovl_path_getxattr(ofs, path, ox, &val, 1);
- if (res == 1 && val == 'y')
- return true;
-
- return false;
+ return res == 1 ? val : 0;
}
#define OVL_XATTR_OPAQUE_POSTFIX "opaque"
@@ -837,7 +848,6 @@ bool ovl_path_check_dir_xattr(struct ovl_fs *ofs, const struct path *path,
#define OVL_XATTR_METACOPY_POSTFIX "metacopy"
#define OVL_XATTR_PROTATTR_POSTFIX "protattr"
#define OVL_XATTR_XWHITEOUT_POSTFIX "whiteout"
-#define OVL_XATTR_XWHITEOUTS_POSTFIX "whiteouts"
#define OVL_XATTR_TAB_ENTRY(x) \
[x] = { [false] = OVL_XATTR_TRUSTED_PREFIX x ## _POSTFIX, \
@@ -854,7 +864,6 @@ const char *const ovl_xattr_table[][2] = {
OVL_XATTR_TAB_ENTRY(OVL_XATTR_METACOPY),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_PROTATTR),
OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUT),
- OVL_XATTR_TAB_ENTRY(OVL_XATTR_XWHITEOUTS),
};
int ovl_check_setxattr(struct ovl_fs *ofs, struct dentry *upperdentry,
@@ -1198,12 +1207,17 @@ void ovl_nlink_end(struct dentry *dentry)
int ovl_lock_rename_workdir(struct dentry *workdir, struct dentry *upperdir)
{
+ struct dentry *trap;
+
/* Workdir should not be the same as upperdir */
if (workdir == upperdir)
goto err;
/* Workdir should not be subdir of upperdir and vice versa */
- if (lock_rename(workdir, upperdir) != NULL)
+ trap = lock_rename(workdir, upperdir);
+ if (IS_ERR(trap))
+ goto err;
+ if (trap)
goto err_unlock;
return 0;
diff --git a/fs/pipe.c b/fs/pipe.c
index 804a7d789452..f1adbfe743d4 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -446,6 +446,18 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
bool was_empty = false;
bool wake_next_writer = false;
+ /*
+ * Reject writing to watch queue pipes before the point where we lock
+ * the pipe.
+ * Otherwise, lockdep would be unhappy if the caller already has another
+ * pipe locked.
+ * If we had to support locking a normal pipe and a notification pipe at
+ * the same time, we could set up lockdep annotations for that, but
+ * since we don't actually need that, it's simpler to just bail here.
+ */
+ if (pipe_has_watch_queue(pipe))
+ return -EXDEV;
+
/* Null write succeeds. */
if (unlikely(total_len == 0))
return 0;
@@ -458,11 +470,6 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
- if (pipe_has_watch_queue(pipe)) {
- ret = -EXDEV;
- goto out;
- }
-
/*
* If it wasn't empty we try to merge new data into
* the last buffer.
@@ -1317,6 +1324,11 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
pipe->tail = tail;
pipe->head = head;
+ if (!pipe_has_watch_queue(pipe)) {
+ pipe->max_usage = nr_slots;
+ pipe->nr_accounted = nr_slots;
+ }
+
spin_unlock_irq(&pipe->rd_wait.lock);
/* This might have made more room for writers */
@@ -1368,8 +1380,6 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned int arg)
if (ret < 0)
goto out_revert_acct;
- pipe->max_usage = nr_slots;
- pipe->nr_accounted = nr_slots;
return pipe->max_usage * PAGE_SIZE;
out_revert_acct:
@@ -1497,7 +1507,6 @@ static struct ctl_table fs_pipe_sysctls[] = {
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
- { }
};
#endif
diff --git a/fs/pnode.c b/fs/pnode.c
index e4d0340393d5..a799e0315cc9 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -468,7 +468,7 @@ static void umount_one(struct mount *mnt, struct list_head *to_umount)
mnt->mnt.mnt_flags |= MNT_UMOUNT;
list_del_init(&mnt->mnt_child);
list_del_init(&mnt->mnt_umounting);
- list_move_tail(&mnt->mnt_list, to_umount);
+ move_from_ns(mnt, to_umount);
}
/*
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index a05fe94970ce..e1af20893ebe 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -600,7 +600,7 @@ EXPORT_SYMBOL(__posix_acl_chmod);
* the vfsmount must be passed through @idmap. This function will then
* take care to map the inode according to @idmap before checking
* permissions. On non-idmapped mounts or if permission checking is to be
- * performed on the raw inode simply passs @nop_mnt_idmap.
+ * performed on the raw inode simply pass @nop_mnt_idmap.
*/
int
posix_acl_chmod(struct mnt_idmap *idmap, struct dentry *dentry,
@@ -700,7 +700,7 @@ EXPORT_SYMBOL_GPL(posix_acl_create);
* the vfsmount must be passed through @idmap. This function will then
* take care to map the inode according to @idmap before checking
* permissions. On non-idmapped mounts or if permission checking is to be
- * performed on the raw inode simply passs @nop_mnt_idmap.
+ * performed on the raw inode simply pass @nop_mnt_idmap.
*
* Called from set_acl inode operations.
*/
diff --git a/fs/proc/array.c b/fs/proc/array.c
index ff08a8957552..34a47fb0c57f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -477,13 +477,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
int permitted;
struct mm_struct *mm;
unsigned long long start_time;
- unsigned long cmin_flt = 0, cmaj_flt = 0;
- unsigned long min_flt = 0, maj_flt = 0;
- u64 cutime, cstime, utime, stime;
- u64 cgtime, gtime;
+ unsigned long cmin_flt, cmaj_flt, min_flt, maj_flt;
+ u64 cutime, cstime, cgtime, utime, stime, gtime;
unsigned long rsslim = 0;
unsigned long flags;
int exit_code = task->exit_code;
+ struct signal_struct *sig = task->signal;
+ unsigned int seq = 1;
state = *get_task_state(task);
vsize = eip = esp = 0;
@@ -511,12 +511,8 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
sigemptyset(&sigign);
sigemptyset(&sigcatch);
- cutime = cstime = utime = stime = 0;
- cgtime = gtime = 0;
if (lock_task_sighand(task, &flags)) {
- struct signal_struct *sig = task->signal;
-
if (sig->tty) {
struct pid *pgrp = tty_get_pgrp(sig->tty);
tty_pgrp = pid_nr_ns(pgrp, ns);
@@ -527,28 +523,9 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
num_threads = get_nr_threads(task);
collect_sigign_sigcatch(task, &sigign, &sigcatch);
- cmin_flt = sig->cmin_flt;
- cmaj_flt = sig->cmaj_flt;
- cutime = sig->cutime;
- cstime = sig->cstime;
- cgtime = sig->cgtime;
rsslim = READ_ONCE(sig->rlim[RLIMIT_RSS].rlim_cur);
- /* add up live thread stats at the group level */
if (whole) {
- struct task_struct *t;
-
- __for_each_thread(sig, t) {
- min_flt += t->min_flt;
- maj_flt += t->maj_flt;
- gtime += task_gtime(t);
- }
-
- min_flt += sig->min_flt;
- maj_flt += sig->maj_flt;
- thread_group_cputime_adjusted(task, &utime, &stime);
- gtime += sig->gtime;
-
if (sig->flags & (SIGNAL_GROUP_EXIT | SIGNAL_STOP_STOPPED))
exit_code = sig->group_exit_code;
}
@@ -562,10 +539,41 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
if (permitted && (!whole || num_threads < 2))
wchan = !task_is_running(task);
- if (!whole) {
+
+ do {
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
+ flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+
+ cmin_flt = sig->cmin_flt;
+ cmaj_flt = sig->cmaj_flt;
+ cutime = sig->cutime;
+ cstime = sig->cstime;
+ cgtime = sig->cgtime;
+
+ if (whole) {
+ struct task_struct *t;
+
+ min_flt = sig->min_flt;
+ maj_flt = sig->maj_flt;
+ gtime = sig->gtime;
+
+ rcu_read_lock();
+ __for_each_thread(sig, t) {
+ min_flt += t->min_flt;
+ maj_flt += t->maj_flt;
+ gtime += task_gtime(t);
+ }
+ rcu_read_unlock();
+ }
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
+
+ if (whole) {
+ thread_group_cputime_adjusted(task, &utime, &stime);
+ } else {
+ task_cputime_adjusted(task, &utime, &stime);
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- task_cputime_adjusted(task, &utime, &stime);
gtime = task_gtime(task);
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index dd31e3b6bf77..18550c071d71 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -97,6 +97,7 @@
#include <linux/resctrl.h>
#include <linux/cn_proc.h>
#include <linux/ksm.h>
+#include <uapi/linux/lsm.h>
#include <trace/events/oom.h>
#include "internal.h"
#include "fd.h"
@@ -146,10 +147,10 @@ struct pid_entry {
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_single_file_operations, \
{ .proc_show = show } )
-#define ATTR(LSM, NAME, MODE) \
+#define ATTR(LSMID, NAME, MODE) \
NOD(NAME, (S_IFREG|(MODE)), \
NULL, &proc_pid_attr_operations, \
- { .lsm = LSM })
+ { .lsmid = LSMID })
/*
* Count the number of hardlinks for the pid_entry table, excluding the .
@@ -1877,8 +1878,6 @@ void proc_pid_evict_inode(struct proc_inode *ei)
hlist_del_init_rcu(&ei->sibling_inodes);
spin_unlock(&pid->lock);
}
-
- put_pid(pid);
}
struct inode *proc_pid_make_inode(struct super_block *sb,
@@ -2726,7 +2725,7 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
if (!task)
return -ESRCH;
- length = security_getprocattr(task, PROC_I(inode)->op.lsm,
+ length = security_getprocattr(task, PROC_I(inode)->op.lsmid,
file->f_path.dentry->d_name.name,
&p);
put_task_struct(task);
@@ -2784,7 +2783,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
if (rv < 0)
goto out_free;
- rv = security_setprocattr(PROC_I(inode)->op.lsm,
+ rv = security_setprocattr(PROC_I(inode)->op.lsmid,
file->f_path.dentry->d_name.name, page,
count);
mutex_unlock(&current->signal->cred_guard_mutex);
@@ -2833,27 +2832,27 @@ static const struct inode_operations proc_##LSM##_attr_dir_inode_ops = { \
#ifdef CONFIG_SECURITY_SMACK
static const struct pid_entry smack_attr_dir_stuff[] = {
- ATTR("smack", "current", 0666),
+ ATTR(LSM_ID_SMACK, "current", 0666),
};
LSM_DIR_OPS(smack);
#endif
#ifdef CONFIG_SECURITY_APPARMOR
static const struct pid_entry apparmor_attr_dir_stuff[] = {
- ATTR("apparmor", "current", 0666),
- ATTR("apparmor", "prev", 0444),
- ATTR("apparmor", "exec", 0666),
+ ATTR(LSM_ID_APPARMOR, "current", 0666),
+ ATTR(LSM_ID_APPARMOR, "prev", 0444),
+ ATTR(LSM_ID_APPARMOR, "exec", 0666),
};
LSM_DIR_OPS(apparmor);
#endif
static const struct pid_entry attr_dir_stuff[] = {
- ATTR(NULL, "current", 0666),
- ATTR(NULL, "prev", 0444),
- ATTR(NULL, "exec", 0666),
- ATTR(NULL, "fscreate", 0666),
- ATTR(NULL, "keycreate", 0666),
- ATTR(NULL, "sockcreate", 0666),
+ ATTR(LSM_ID_UNDEF, "current", 0666),
+ ATTR(LSM_ID_UNDEF, "prev", 0444),
+ ATTR(LSM_ID_UNDEF, "exec", 0666),
+ ATTR(LSM_ID_UNDEF, "fscreate", 0666),
+ ATTR(LSM_ID_UNDEF, "keycreate", 0666),
+ ATTR(LSM_ID_UNDEF, "sockcreate", 0666),
#ifdef CONFIG_SECURITY_SMACK
DIR("smack", 0555,
proc_smack_attr_dir_inode_ops, proc_smack_attr_dir_ops),
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index b33e490e3fd9..05350f3c2812 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -30,7 +30,6 @@
static void proc_evict_inode(struct inode *inode)
{
- struct proc_dir_entry *de;
struct ctl_table_header *head;
struct proc_inode *ei = PROC_I(inode);
@@ -38,17 +37,8 @@ static void proc_evict_inode(struct inode *inode)
clear_inode(inode);
/* Stop tracking associated processes */
- if (ei->pid) {
+ if (ei->pid)
proc_pid_evict_inode(ei);
- ei->pid = NULL;
- }
-
- /* Let go of any associated proc directory entry */
- de = ei->pde;
- if (de) {
- pde_put(de);
- ei->pde = NULL;
- }
head = ei->sysctl;
if (head) {
@@ -80,6 +70,13 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
static void proc_free_inode(struct inode *inode)
{
+ struct proc_inode *ei = PROC_I(inode);
+
+ if (ei->pid)
+ put_pid(ei->pid);
+ /* Let go of any associated proc directory entry */
+ if (ei->pde)
+ pde_put(ei->pde);
kmem_cache_free(proc_inode_cachep, PROC_I(inode));
}
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 9a8f32f21ff5..a71ac5379584 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -92,7 +92,7 @@ union proc_op {
int (*proc_show)(struct seq_file *m,
struct pid_namespace *ns, struct pid *pid,
struct task_struct *task);
- const char *lsm;
+ int lsmid;
};
struct proc_inode {
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 8064ea76f80b..37cde0efee57 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -44,7 +44,7 @@ static struct ctl_table sysctl_mount_point[] = {
*/
struct ctl_table_header *register_sysctl_mount_point(const char *path)
{
- return register_sysctl_sz(path, sysctl_mount_point, 0);
+ return register_sysctl(path, sysctl_mount_point);
}
EXPORT_SYMBOL(register_sysctl_mount_point);
@@ -71,7 +71,6 @@ static struct ctl_table root_table[] = {
.procname = "",
.mode = S_IFDIR|S_IRUGO|S_IXUGO,
},
- { }
};
static struct ctl_table_root sysctl_table_root = {
.default_set.dir.header = {
@@ -233,7 +232,8 @@ static int insert_header(struct ctl_dir *dir, struct ctl_table_header *header)
return -EROFS;
/* Am I creating a permanently empty directory? */
- if (sysctl_is_perm_empty_ctl_table(header->ctl_table)) {
+ if (header->ctl_table_size > 0 &&
+ sysctl_is_perm_empty_ctl_table(header->ctl_table)) {
if (!RB_EMPTY_ROOT(&dir->root))
return -EINVAL;
sysctl_set_perm_empty_ctl_header(dir_h);
@@ -534,13 +534,8 @@ static struct dentry *proc_sys_lookup(struct inode *dir, struct dentry *dentry,
goto out;
}
- inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
- if (IS_ERR(inode)) {
- err = ERR_CAST(inode);
- goto out;
- }
-
d_set_d_op(dentry, &proc_sys_dentry_operations);
+ inode = proc_sys_make_inode(dir->i_sb, h ? h : head, p);
err = d_splice_alias(inode, dentry);
out:
@@ -698,13 +693,8 @@ static bool proc_sys_fill_cache(struct file *file,
return false;
if (d_in_lookup(child)) {
struct dentry *res;
- inode = proc_sys_make_inode(dir->d_sb, head, table);
- if (IS_ERR(inode)) {
- d_lookup_done(child);
- dput(child);
- return false;
- }
d_set_d_op(child, &proc_sys_dentry_operations);
+ inode = proc_sys_make_inode(dir->d_sb, head, table);
res = d_splice_alias(inode, child);
d_lookup_done(child);
if (unlikely(res)) {
@@ -1213,6 +1203,10 @@ static bool get_links(struct ctl_dir *dir,
struct ctl_table_header *tmp_head;
struct ctl_table *entry, *link;
+ if (header->ctl_table_size == 0 ||
+ sysctl_is_perm_empty_ctl_table(header->ctl_table))
+ return true;
+
/* Are there links available for every entry in table? */
list_for_each_table_entry(entry, header) {
const char *procname = entry->procname;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index b55dbc70287b..06a297a27ba3 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -271,7 +271,7 @@ static void proc_kill_sb(struct super_block *sb)
kill_anon_super(sb);
put_pid_ns(fs_info->pid_ns);
- kfree(fs_info);
+ kfree_rcu(fs_info, rcu);
}
static struct file_system_type proc_fs_type = {
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 435b61054b5b..3f78ebbb795f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -273,7 +273,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
const char *name = NULL;
if (file) {
- struct inode *inode = file_inode(vma->vm_file);
+ const struct inode *inode = file_user_inode(vma->vm_file);
+
dev = inode->i_sb->s_dev;
ino = inode->i_ino;
pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
@@ -865,7 +866,8 @@ static int show_smap(struct seq_file *m, void *v)
__show_smap(m, &mss, false);
seq_printf(m, "THPeligible: %8u\n",
- hugepage_vma_check(vma, vma->vm_flags, true, false, true));
+ !!thp_vma_allowable_orders(vma, vma->vm_flags, true, false,
+ true, THP_ORDERS_ALL));
if (arch_pkeys_enabled())
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
@@ -1761,7 +1763,7 @@ static int pagemap_release(struct inode *inode, struct file *file)
#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \
PAGE_IS_FILE | PAGE_IS_PRESENT | \
PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
- PAGE_IS_HUGE)
+ PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY)
#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
struct pagemap_scan_private {
@@ -1793,6 +1795,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
if (is_zero_pfn(pte_pfn(pte)))
categories |= PAGE_IS_PFNZERO;
+ if (pte_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
swp_entry_t swp;
@@ -1806,6 +1810,8 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
!PageAnon(pfn_swap_entry_to_page(swp)))
categories |= PAGE_IS_FILE;
}
+ if (pte_swp_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
}
return categories;
@@ -1853,12 +1859,16 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
if (is_zero_pfn(pmd_pfn(pmd)))
categories |= PAGE_IS_PFNZERO;
+ if (pmd_soft_dirty(pmd))
+ categories |= PAGE_IS_SOFT_DIRTY;
} else if (is_swap_pmd(pmd)) {
swp_entry_t swp;
categories |= PAGE_IS_SWAPPED;
if (!pmd_swp_uffd_wp(pmd))
categories |= PAGE_IS_WRITTEN;
+ if (pmd_swp_soft_dirty(pmd))
+ categories |= PAGE_IS_SOFT_DIRTY;
if (p->masks_of_interest & PAGE_IS_FILE) {
swp = pmd_to_swp_entry(pmd);
@@ -1905,10 +1915,14 @@ static unsigned long pagemap_hugetlb_category(pte_t pte)
categories |= PAGE_IS_FILE;
if (is_zero_pfn(pte_pfn(pte)))
categories |= PAGE_IS_PFNZERO;
+ if (pte_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
} else if (is_swap_pte(pte)) {
categories |= PAGE_IS_SWAPPED;
if (!pte_swp_uffd_wp_any(pte))
categories |= PAGE_IS_WRITTEN;
+ if (pte_swp_soft_dirty(pte))
+ categories |= PAGE_IS_SOFT_DIRTY;
}
return categories;
@@ -2007,6 +2021,9 @@ static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
if (wp_allowed)
vma_category |= PAGE_IS_WPALLOWED;
+ if (vma->vm_flags & VM_SOFTDIRTY)
+ vma_category |= PAGE_IS_SOFT_DIRTY;
+
if (!pagemap_scan_is_interesting_vma(vma_category, p))
return 1;
@@ -2415,7 +2432,6 @@ static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p)
static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
{
- struct mmu_notifier_range range;
struct pagemap_scan_private p = {0};
unsigned long walk_start;
size_t n_ranges_out = 0;
@@ -2431,15 +2447,9 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
if (ret)
return ret;
- /* Protection change for the range is going to happen. */
- if (p.arg.flags & PM_SCAN_WP_MATCHING) {
- mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
- mm, p.arg.start, p.arg.end);
- mmu_notifier_invalidate_range_start(&range);
- }
-
for (walk_start = p.arg.start; walk_start < p.arg.end;
walk_start = p.arg.walk_end) {
+ struct mmu_notifier_range range;
long n_out;
if (fatal_signal_pending(current)) {
@@ -2450,8 +2460,20 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
ret = mmap_read_lock_killable(mm);
if (ret)
break;
+
+ /* Protection change for the range is going to happen. */
+ if (p.arg.flags & PM_SCAN_WP_MATCHING) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0,
+ mm, walk_start, p.arg.end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
ret = walk_page_range(mm, walk_start, p.arg.end,
&pagemap_scan_ops, &p);
+
+ if (p.arg.flags & PM_SCAN_WP_MATCHING)
+ mmu_notifier_invalidate_range_end(&range);
+
mmap_read_unlock(mm);
n_out = pagemap_scan_flush_buffer(&p);
@@ -2477,9 +2499,6 @@ static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg)
if (pagemap_scan_writeback_args(&p.arg, uarg))
ret = -EFAULT;
- if (p.arg.flags & PM_SCAN_WP_MATCHING)
- mmu_notifier_invalidate_range_end(&range);
-
kfree(p.vec_buf);
return ret;
}
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 250eb5bf7b52..0a808951b7d3 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -142,13 +142,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id,
MAJOR(sb->s_dev), MINOR(sb->s_dev));
- if (sb->s_op->show_path) {
- err = sb->s_op->show_path(m, mnt->mnt_root);
- if (err)
- goto out;
- } else {
- seq_dentry(m, mnt->mnt_root, " \t\n\\");
- }
+ err = show_path(m, mnt->mnt_root);
+ if (err)
+ goto out;
seq_putc(m, ' ');
/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
@@ -283,8 +279,6 @@ static int mounts_open_common(struct inode *inode, struct file *file,
p->ns = ns;
p->root = root;
p->show = show;
- INIT_LIST_HEAD(&p->cursor.mnt_list);
- p->cursor.mnt.mnt_flags = MNT_CURSOR;
return 0;
@@ -301,7 +295,6 @@ static int mounts_release(struct inode *inode, struct file *file)
struct seq_file *m = file->private_data;
struct proc_mounts *p = m->private;
path_put(&p->root);
- mnt_cursor_del(p->ns, &p->cursor);
put_mnt_ns(p->ns);
return seq_release_private(inode, file);
}
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index d41c20d1b5e8..d0d9bfdad30c 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -23,6 +23,7 @@
#include <linux/pstore.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
+#include <linux/cleanup.h>
#include "internal.h"
@@ -34,6 +35,8 @@ static LIST_HEAD(records_list);
static DEFINE_MUTEX(pstore_sb_lock);
static struct super_block *pstore_sb;
+DEFINE_FREE(pstore_iput, struct inode *, if (_T) iput(_T))
+
struct pstore_private {
struct list_head list;
struct dentry *dentry;
@@ -60,11 +63,12 @@ static void free_pstore_private(struct pstore_private *private)
}
kfree(private);
}
+DEFINE_FREE(pstore_private, struct pstore_private *, free_pstore_private(_T));
static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos)
{
struct pstore_private *ps = s->private;
- struct pstore_ftrace_seq_data *data;
+ struct pstore_ftrace_seq_data *data __free(kfree) = NULL;
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
@@ -72,13 +76,10 @@ static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos)
data->off = ps->total_size % REC_SIZE;
data->off += *pos * REC_SIZE;
- if (data->off + REC_SIZE > ps->total_size) {
- kfree(data);
+ if (data->off + REC_SIZE > ps->total_size)
return NULL;
- }
-
- return data;
+ return_ptr(data);
}
static void pstore_ftrace_seq_stop(struct seq_file *s, void *v)
@@ -182,25 +183,21 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
{
struct pstore_private *p = d_inode(dentry)->i_private;
struct pstore_record *record = p->record;
- int rc = 0;
if (!record->psi->erase)
return -EPERM;
/* Make sure we can't race while removing this file. */
- mutex_lock(&records_list_lock);
- if (!list_empty(&p->list))
- list_del_init(&p->list);
- else
- rc = -ENOENT;
- p->dentry = NULL;
- mutex_unlock(&records_list_lock);
- if (rc)
- return rc;
-
- mutex_lock(&record->psi->read_mutex);
- record->psi->erase(record);
- mutex_unlock(&record->psi->read_mutex);
+ scoped_guard(mutex, &records_list_lock) {
+ if (!list_empty(&p->list))
+ list_del_init(&p->list);
+ else
+ return -ENOENT;
+ p->dentry = NULL;
+ }
+
+ scoped_guard(mutex, &record->psi->read_mutex)
+ record->psi->erase(record);
return simple_unlink(dir, dentry);
}
@@ -292,19 +289,16 @@ static struct dentry *psinfo_lock_root(void)
{
struct dentry *root;
- mutex_lock(&pstore_sb_lock);
+ guard(mutex)(&pstore_sb_lock);
/*
* Having no backend is fine -- no records appear.
* Not being mounted is fine -- nothing to do.
*/
- if (!psinfo || !pstore_sb) {
- mutex_unlock(&pstore_sb_lock);
+ if (!psinfo || !pstore_sb)
return NULL;
- }
root = pstore_sb->s_root;
inode_lock(d_inode(root));
- mutex_unlock(&pstore_sb_lock);
return root;
}
@@ -319,19 +313,19 @@ int pstore_put_backend_records(struct pstore_info *psi)
if (!root)
return 0;
- mutex_lock(&records_list_lock);
- list_for_each_entry_safe(pos, tmp, &records_list, list) {
- if (pos->record->psi == psi) {
- list_del_init(&pos->list);
- rc = simple_unlink(d_inode(root), pos->dentry);
- if (WARN_ON(rc))
- break;
- d_drop(pos->dentry);
- dput(pos->dentry);
- pos->dentry = NULL;
+ scoped_guard(mutex, &records_list_lock) {
+ list_for_each_entry_safe(pos, tmp, &records_list, list) {
+ if (pos->record->psi == psi) {
+ list_del_init(&pos->list);
+ rc = simple_unlink(d_inode(root), pos->dentry);
+ if (WARN_ON(rc))
+ break;
+ d_drop(pos->dentry);
+ dput(pos->dentry);
+ pos->dentry = NULL;
+ }
}
}
- mutex_unlock(&records_list_lock);
inode_unlock(d_inode(root));
@@ -346,29 +340,27 @@ int pstore_put_backend_records(struct pstore_info *psi)
int pstore_mkfile(struct dentry *root, struct pstore_record *record)
{
struct dentry *dentry;
- struct inode *inode;
- int rc = 0;
+ struct inode *inode __free(pstore_iput) = NULL;
char name[PSTORE_NAMELEN];
- struct pstore_private *private, *pos;
+ struct pstore_private *private __free(pstore_private) = NULL, *pos;
size_t size = record->size + record->ecc_notice_size;
if (WARN_ON(!inode_is_locked(d_inode(root))))
return -EINVAL;
- rc = -EEXIST;
+ guard(mutex)(&records_list_lock);
+
/* Skip records that are already present in the filesystem. */
- mutex_lock(&records_list_lock);
list_for_each_entry(pos, &records_list, list) {
if (pos->record->type == record->type &&
pos->record->id == record->id &&
pos->record->psi == record->psi)
- goto fail;
+ return -EEXIST;
}
- rc = -ENOMEM;
inode = pstore_get_inode(root->d_sb);
if (!inode)
- goto fail;
+ return -ENOMEM;
inode->i_mode = S_IFREG | 0444;
inode->i_fop = &pstore_file_operations;
scnprintf(name, sizeof(name), "%s-%s-%llu%s",
@@ -378,11 +370,11 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
private = kzalloc(sizeof(*private), GFP_KERNEL);
if (!private)
- goto fail_inode;
+ return -ENOMEM;
dentry = d_alloc_name(root, name);
if (!dentry)
- goto fail_private;
+ return -ENOMEM;
private->dentry = dentry;
private->record = record;
@@ -393,20 +385,11 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record)
inode_set_mtime_to_ts(inode,
inode_set_ctime_to_ts(inode, record->time));
- d_add(dentry, inode);
+ d_add(dentry, no_free_ptr(inode));
- list_add(&private->list, &records_list);
- mutex_unlock(&records_list_lock);
+ list_add(&(no_free_ptr(private))->list, &records_list);
return 0;
-
-fail_private:
- free_pstore_private(private);
-fail_inode:
- iput(inode);
-fail:
- mutex_unlock(&records_list_lock);
- return rc;
}
/*
@@ -451,9 +434,8 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent)
if (!sb->s_root)
return -ENOMEM;
- mutex_lock(&pstore_sb_lock);
- pstore_sb = sb;
- mutex_unlock(&pstore_sb_lock);
+ scoped_guard(mutex, &pstore_sb_lock)
+ pstore_sb = sb;
pstore_get_records(0);
@@ -468,17 +450,14 @@ static struct dentry *pstore_mount(struct file_system_type *fs_type,
static void pstore_kill_sb(struct super_block *sb)
{
- mutex_lock(&pstore_sb_lock);
+ guard(mutex)(&pstore_sb_lock);
WARN_ON(pstore_sb && pstore_sb != sb);
kill_litter_super(sb);
pstore_sb = NULL;
- mutex_lock(&records_list_lock);
+ guard(mutex)(&records_list_lock);
INIT_LIST_HEAD(&records_list);
- mutex_unlock(&records_list_lock);
-
- mutex_unlock(&pstore_sb_lock);
}
static struct file_system_type pstore_fs_type = {
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index d36702c7ab3c..88b34fdbf759 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -529,6 +529,7 @@ static int ramoops_init_przs(const char *name,
}
zone_sz = mem_sz / *cnt;
+ zone_sz = ALIGN_DOWN(zone_sz, 2);
if (!zone_sz) {
dev_err(dev, "%s zone size == 0\n", name);
goto fail;
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 650e437b55e6..f1848cdd6d34 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -190,7 +190,7 @@ static int persistent_ram_init_ecc(struct persistent_ram_zone *prz,
{
int numerr;
struct persistent_ram_buffer *buffer = prz->buffer;
- int ecc_blocks;
+ size_t ecc_blocks;
size_t ecc_total;
if (!ecc_info || !ecc_info->ecc_size)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 66645a5a35f3..42a529e26bd6 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -15,43 +15,6 @@
#include <linux/buffer_head.h>
#include "qnx4.h"
-/*
- * A qnx4 directory entry is an inode entry or link info
- * depending on the status field in the last byte. The
- * first byte is where the name start either way, and a
- * zero means it's empty.
- *
- * Also, due to a bug in gcc, we don't want to use the
- * real (differently sized) name arrays in the inode and
- * link entries, but always the 'de_name[]' one in the
- * fake struct entry.
- *
- * See
- *
- * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99578#c6
- *
- * for details, but basically gcc will take the size of the
- * 'name' array from one of the used union entries randomly.
- *
- * This use of 'de_name[]' (48 bytes) avoids the false positive
- * warnings that would happen if gcc decides to use 'inode.di_name'
- * (16 bytes) even when the pointer and size were to come from
- * 'link.dl_name' (48 bytes).
- *
- * In all cases the actual name pointer itself is the same, it's
- * only the gcc internal 'what is the size of this field' logic
- * that can get confused.
- */
-union qnx4_directory_entry {
- struct {
- const char de_name[48];
- u8 de_pad[15];
- u8 de_status;
- };
- struct qnx4_inode_entry inode;
- struct qnx4_link_info link;
-};
-
static int qnx4_readdir(struct file *file, struct dir_context *ctx)
{
struct inode *inode = file_inode(file);
@@ -74,26 +37,25 @@ static int qnx4_readdir(struct file *file, struct dir_context *ctx)
ix = (ctx->pos >> QNX4_DIR_ENTRY_SIZE_BITS) % QNX4_INODES_PER_BLOCK;
for (; ix < QNX4_INODES_PER_BLOCK; ix++, ctx->pos += QNX4_DIR_ENTRY_SIZE) {
union qnx4_directory_entry *de;
+ const char *fname;
offset = ix * QNX4_DIR_ENTRY_SIZE;
de = (union qnx4_directory_entry *) (bh->b_data + offset);
- if (!de->de_name[0])
- continue;
- if (!(de->de_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
+ fname = get_entry_fname(de, &size);
+ if (!fname)
continue;
+
if (!(de->de_status & QNX4_FILE_LINK)) {
- size = sizeof(de->inode.di_fname);
ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
} else {
- size = sizeof(de->link.dl_fname);
ino = ( le32_to_cpu(de->link.dl_inode_blk) - 1 ) *
QNX4_INODES_PER_BLOCK +
de->link.dl_inode_ndx;
}
- size = strnlen(de->de_name, size);
- QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, name));
- if (!dir_emit(ctx, de->de_name, size, ino, DT_UNKNOWN)) {
+
+ QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, fname));
+ if (!dir_emit(ctx, fname, size, ino, DT_UNKNOWN)) {
brelse(bh);
return 0;
}
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 8d72221735d7..bb8db6550ca5 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -26,31 +26,24 @@
static int qnx4_match(int len, const char *name,
struct buffer_head *bh, unsigned long *offset)
{
- struct qnx4_inode_entry *de;
- int namelen, thislen;
+ union qnx4_directory_entry *de;
+ const char *fname;
+ int fnamelen;
if (bh == NULL) {
printk(KERN_WARNING "qnx4: matching unassigned buffer !\n");
return 0;
}
- de = (struct qnx4_inode_entry *) (bh->b_data + *offset);
+ de = (union qnx4_directory_entry *) (bh->b_data + *offset);
*offset += QNX4_DIR_ENTRY_SIZE;
- if ((de->di_status & QNX4_FILE_LINK) != 0) {
- namelen = QNX4_NAME_MAX;
- } else {
- namelen = QNX4_SHORT_NAME_MAX;
- }
- thislen = strlen( de->di_fname );
- if ( thislen > namelen )
- thislen = namelen;
- if (len != thislen) {
+
+ fname = get_entry_fname(de, &fnamelen);
+ if (!fname || len != fnamelen)
return 0;
- }
- if (strncmp(name, de->di_fname, len) == 0) {
- if ((de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK)) != 0) {
- return 1;
- }
- }
+
+ if (strncmp(name, fname, len) == 0)
+ return 1;
+
return 0;
}
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
index 6283705466a4..5c2b1fb6b952 100644
--- a/fs/qnx4/qnx4.h
+++ b/fs/qnx4/qnx4.h
@@ -44,3 +44,63 @@ static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode)
{
return &qnx4_i(inode)->raw;
}
+
+/*
+ * A qnx4 directory entry is an inode entry or link info
+ * depending on the status field in the last byte. The
+ * first byte is where the name start either way, and a
+ * zero means it's empty.
+ *
+ * Also, due to a bug in gcc, we don't want to use the
+ * real (differently sized) name arrays in the inode and
+ * link entries, but always the 'de_name[]' one in the
+ * fake struct entry.
+ *
+ * See
+ *
+ * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=99578#c6
+ *
+ * for details, but basically gcc will take the size of the
+ * 'name' array from one of the used union entries randomly.
+ *
+ * This use of 'de_name[]' (48 bytes) avoids the false positive
+ * warnings that would happen if gcc decides to use 'inode.di_name'
+ * (16 bytes) even when the pointer and size were to come from
+ * 'link.dl_name' (48 bytes).
+ *
+ * In all cases the actual name pointer itself is the same, it's
+ * only the gcc internal 'what is the size of this field' logic
+ * that can get confused.
+ */
+union qnx4_directory_entry {
+ struct {
+ const char de_name[48];
+ u8 de_pad[15];
+ u8 de_status;
+ };
+ struct qnx4_inode_entry inode;
+ struct qnx4_link_info link;
+};
+
+static inline const char *get_entry_fname(union qnx4_directory_entry *de,
+ int *size)
+{
+ /* Make sure the status byte is in the same place for all structs. */
+ BUILD_BUG_ON(offsetof(struct qnx4_inode_entry, di_status) !=
+ offsetof(struct qnx4_link_info, dl_status));
+ BUILD_BUG_ON(offsetof(struct qnx4_inode_entry, di_status) !=
+ offsetof(union qnx4_directory_entry, de_status));
+
+ if (!de->de_name[0])
+ return NULL;
+ if (!(de->de_status & (QNX4_FILE_USED|QNX4_FILE_LINK)))
+ return NULL;
+ if (!(de->de_status & QNX4_FILE_LINK))
+ *size = sizeof(de->inode.di_fname);
+ else
+ *size = sizeof(de->link.dl_fname);
+
+ *size = strnlen(de->de_name, *size);
+
+ return de->de_name;
+}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 58b5de081b57..1f0c754416b6 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1787,7 +1787,7 @@ EXPORT_SYMBOL(dquot_alloc_inode);
/*
* Convert in-memory reserved quotas to real consumed quotas
*/
-int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
+void dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
{
struct dquot **dquots;
int cnt, index;
@@ -1797,7 +1797,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
*inode_reserved_space(inode) -= number;
__inode_add_bytes(inode, number);
spin_unlock(&inode->i_lock);
- return 0;
+ return;
}
dquots = i_dquot(inode);
@@ -1822,7 +1822,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number)
spin_unlock(&inode->i_lock);
mark_all_dquot_dirty(dquots);
srcu_read_unlock(&dquot_srcu, index);
- return 0;
+ return;
}
EXPORT_SYMBOL(dquot_claim_space_nodirty);
@@ -2969,7 +2969,6 @@ static struct ctl_table fs_dqstats_table[] = {
.proc_handler = proc_dointvec,
},
#endif
- { },
};
static int __init dquot_init(void)
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index efb1b4c1a0a4..7a6d980e614d 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -70,7 +70,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
/* make various checks */
order = get_order(newsize);
- if (unlikely(order > MAX_ORDER))
+ if (unlikely(order > MAX_PAGE_ORDER))
return -EFBIG;
ret = inode_newsize_ok(inode, newsize);
diff --git a/fs/read_write.c b/fs/read_write.c
index 4771701c896b..d4c036e82b6c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -354,6 +354,9 @@ out_putf:
int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
+ int mask = read_write == READ ? MAY_READ : MAY_WRITE;
+ int ret;
+
if (unlikely((ssize_t) count < 0))
return -EINVAL;
@@ -371,8 +374,11 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
}
}
- return security_file_permission(file,
- read_write == READ ? MAY_READ : MAY_WRITE);
+ ret = security_file_permission(file, mask);
+ if (ret)
+ return ret;
+
+ return fsnotify_file_area_perm(file, mask, ppos, count);
}
EXPORT_SYMBOL(rw_verify_area);
@@ -773,12 +779,14 @@ static ssize_t do_loop_readv_writev(struct file *filp, struct iov_iter *iter,
return ret;
}
-static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
- loff_t *pos, rwf_t flags)
+ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
+ struct iov_iter *iter)
{
size_t tot_len;
ssize_t ret = 0;
+ if (!file->f_op->read_iter)
+ return -EINVAL;
if (!(file->f_mode & FMODE_READ))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_READ))
@@ -787,22 +795,20 @@ static ssize_t do_iter_read(struct file *file, struct iov_iter *iter,
tot_len = iov_iter_count(iter);
if (!tot_len)
goto out;
- ret = rw_verify_area(READ, file, pos, tot_len);
+ ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
if (ret < 0)
return ret;
- if (file->f_op->read_iter)
- ret = do_iter_readv_writev(file, iter, pos, READ, flags);
- else
- ret = do_loop_readv_writev(file, iter, pos, READ, flags);
+ ret = call_read_iter(file, iocb, iter);
out:
if (ret >= 0)
fsnotify_access(file);
return ret;
}
+EXPORT_SYMBOL(vfs_iocb_iter_read);
-ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
- struct iov_iter *iter)
+ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
+ rwf_t flags)
{
size_t tot_len;
ssize_t ret = 0;
@@ -817,33 +823,30 @@ ssize_t vfs_iocb_iter_read(struct file *file, struct kiocb *iocb,
tot_len = iov_iter_count(iter);
if (!tot_len)
goto out;
- ret = rw_verify_area(READ, file, &iocb->ki_pos, tot_len);
+ ret = rw_verify_area(READ, file, ppos, tot_len);
if (ret < 0)
return ret;
- ret = call_read_iter(file, iocb, iter);
+ ret = do_iter_readv_writev(file, iter, ppos, READ, flags);
out:
if (ret >= 0)
fsnotify_access(file);
return ret;
}
-EXPORT_SYMBOL(vfs_iocb_iter_read);
-
-ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos,
- rwf_t flags)
-{
- if (!file->f_op->read_iter)
- return -EINVAL;
- return do_iter_read(file, iter, ppos, flags);
-}
EXPORT_SYMBOL(vfs_iter_read);
-static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
- loff_t *pos, rwf_t flags)
+/*
+ * Caller is responsible for calling kiocb_end_write() on completion
+ * if async iocb was queued.
+ */
+ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
+ struct iov_iter *iter)
{
size_t tot_len;
ssize_t ret = 0;
+ if (!file->f_op->write_iter)
+ return -EINVAL;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
@@ -852,88 +855,127 @@ static ssize_t do_iter_write(struct file *file, struct iov_iter *iter,
tot_len = iov_iter_count(iter);
if (!tot_len)
return 0;
- ret = rw_verify_area(WRITE, file, pos, tot_len);
+ ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
if (ret < 0)
return ret;
- if (file->f_op->write_iter)
- ret = do_iter_readv_writev(file, iter, pos, WRITE, flags);
- else
- ret = do_loop_readv_writev(file, iter, pos, WRITE, flags);
+ kiocb_start_write(iocb);
+ ret = call_write_iter(file, iocb, iter);
+ if (ret != -EIOCBQUEUED)
+ kiocb_end_write(iocb);
if (ret > 0)
fsnotify_modify(file);
+
return ret;
}
+EXPORT_SYMBOL(vfs_iocb_iter_write);
-ssize_t vfs_iocb_iter_write(struct file *file, struct kiocb *iocb,
- struct iov_iter *iter)
+ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
+ rwf_t flags)
{
size_t tot_len;
- ssize_t ret = 0;
+ ssize_t ret;
- if (!file->f_op->write_iter)
- return -EINVAL;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
+ if (!file->f_op->write_iter)
+ return -EINVAL;
tot_len = iov_iter_count(iter);
if (!tot_len)
return 0;
- ret = rw_verify_area(WRITE, file, &iocb->ki_pos, tot_len);
+
+ ret = rw_verify_area(WRITE, file, ppos, tot_len);
if (ret < 0)
return ret;
- ret = call_write_iter(file, iocb, iter);
+ file_start_write(file);
+ ret = do_iter_readv_writev(file, iter, ppos, WRITE, flags);
if (ret > 0)
fsnotify_modify(file);
+ file_end_write(file);
return ret;
}
-EXPORT_SYMBOL(vfs_iocb_iter_write);
-
-ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos,
- rwf_t flags)
-{
- if (!file->f_op->write_iter)
- return -EINVAL;
- return do_iter_write(file, iter, ppos, flags);
-}
EXPORT_SYMBOL(vfs_iter_write);
static ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
- unsigned long vlen, loff_t *pos, rwf_t flags)
+ unsigned long vlen, loff_t *pos, rwf_t flags)
{
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
- ssize_t ret;
+ size_t tot_len;
+ ssize_t ret = 0;
- ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
- if (ret >= 0) {
- ret = do_iter_read(file, &iter, pos, flags);
- kfree(iov);
- }
+ if (!(file->f_mode & FMODE_READ))
+ return -EBADF;
+ if (!(file->f_mode & FMODE_CAN_READ))
+ return -EINVAL;
+
+ ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov,
+ &iter);
+ if (ret < 0)
+ return ret;
+
+ tot_len = iov_iter_count(&iter);
+ if (!tot_len)
+ goto out;
+ ret = rw_verify_area(READ, file, pos, tot_len);
+ if (ret < 0)
+ goto out;
+
+ if (file->f_op->read_iter)
+ ret = do_iter_readv_writev(file, &iter, pos, READ, flags);
+ else
+ ret = do_loop_readv_writev(file, &iter, pos, READ, flags);
+out:
+ if (ret >= 0)
+ fsnotify_access(file);
+ kfree(iov);
return ret;
}
static ssize_t vfs_writev(struct file *file, const struct iovec __user *vec,
- unsigned long vlen, loff_t *pos, rwf_t flags)
+ unsigned long vlen, loff_t *pos, rwf_t flags)
{
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
struct iov_iter iter;
- ssize_t ret;
+ size_t tot_len;
+ ssize_t ret = 0;
- ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
- if (ret >= 0) {
- file_start_write(file);
- ret = do_iter_write(file, &iter, pos, flags);
- file_end_write(file);
- kfree(iov);
- }
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EBADF;
+ if (!(file->f_mode & FMODE_CAN_WRITE))
+ return -EINVAL;
+
+ ret = import_iovec(ITER_SOURCE, vec, vlen, ARRAY_SIZE(iovstack), &iov,
+ &iter);
+ if (ret < 0)
+ return ret;
+
+ tot_len = iov_iter_count(&iter);
+ if (!tot_len)
+ goto out;
+
+ ret = rw_verify_area(WRITE, file, pos, tot_len);
+ if (ret < 0)
+ goto out;
+
+ file_start_write(file);
+ if (file->f_op->write_iter)
+ ret = do_iter_readv_writev(file, &iter, pos, WRITE, flags);
+ else
+ ret = do_loop_readv_writev(file, &iter, pos, WRITE, flags);
+ if (ret > 0)
+ fsnotify_modify(file);
+ file_end_write(file);
+out:
+ kfree(iov);
return ret;
}
@@ -1178,7 +1220,7 @@ COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd,
#endif /* CONFIG_COMPAT */
static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
- size_t count, loff_t max)
+ size_t count, loff_t max)
{
struct fd in, out;
struct inode *in_inode, *out_inode;
@@ -1250,10 +1292,8 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
retval = rw_verify_area(WRITE, out.file, &out_pos, count);
if (retval < 0)
goto fput_out;
- file_start_write(out.file);
retval = do_splice_direct(in.file, &pos, out.file, &out_pos,
count, fl);
- file_end_write(out.file);
} else {
if (out.file->f_flags & O_NONBLOCK)
fl |= SPLICE_F_NONBLOCK;
@@ -1362,38 +1402,6 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
}
#endif
-/**
- * generic_copy_file_range - copy data between two files
- * @file_in: file structure to read from
- * @pos_in: file offset to read from
- * @file_out: file structure to write data to
- * @pos_out: file offset to write data to
- * @len: amount of data to copy
- * @flags: copy flags
- *
- * This is a generic filesystem helper to copy data from one file to another.
- * It has no constraints on the source or destination file owners - the files
- * can belong to different superblocks and different filesystem types. Short
- * copies are allowed.
- *
- * This should be called from the @file_out filesystem, as per the
- * ->copy_file_range() method.
- *
- * Returns the number of bytes copied or a negative error indicating the
- * failure.
- */
-
-ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- size_t len, unsigned int flags)
-{
- lockdep_assert(sb_write_started(file_inode(file_out)->i_sb));
-
- return do_splice_direct(file_in, &pos_in, file_out, &pos_out,
- len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
-}
-EXPORT_SYMBOL(generic_copy_file_range);
-
/*
* Performs necessary checks before doing a file copy
*
@@ -1478,6 +1486,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
{
ssize_t ret;
bool splice = flags & COPY_FILE_SPLICE;
+ bool samesb = file_inode(file_in)->i_sb == file_inode(file_out)->i_sb;
if (flags & ~COPY_FILE_SPLICE)
return -EINVAL;
@@ -1509,19 +1518,24 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
ret = file_out->f_op->copy_file_range(file_in, pos_in,
file_out, pos_out,
len, flags);
- goto done;
- }
-
- if (!splice && file_in->f_op->remap_file_range &&
- file_inode(file_in)->i_sb == file_inode(file_out)->i_sb) {
+ } else if (!splice && file_in->f_op->remap_file_range && samesb) {
ret = file_in->f_op->remap_file_range(file_in, pos_in,
file_out, pos_out,
min_t(loff_t, MAX_RW_COUNT, len),
REMAP_FILE_CAN_SHORTEN);
- if (ret > 0)
- goto done;
+ /* fallback to splice */
+ if (ret <= 0)
+ splice = true;
+ } else if (samesb) {
+ /* Fallback to splice for same sb copy for backward compat */
+ splice = true;
}
+ file_end_write(file_out);
+
+ if (!splice)
+ goto done;
+
/*
* We can get here for same sb copy of filesystems that do not implement
* ->copy_file_range() in case filesystem does not support clone or in
@@ -1533,11 +1547,16 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* and which filesystems do not, that will allow userspace tools to
* make consistent desicions w.r.t using copy_file_range().
*
- * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE.
+ * We also get here if caller (e.g. nfsd) requested COPY_FILE_SPLICE
+ * for server-side-copy between any two sb.
+ *
+ * In any case, we call do_splice_direct() and not splice_file_range(),
+ * without file_start_write() held, to avoid possible deadlocks related
+ * to splicing from input file, while file_start_write() is held on
+ * the output file on a different sb.
*/
- ret = generic_copy_file_range(file_in, pos_in, file_out, pos_out, len,
- flags);
-
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+ min_t(size_t, len, MAX_RW_COUNT), 0);
done:
if (ret > 0) {
fsnotify_access(file_in);
@@ -1549,8 +1568,6 @@ done:
inc_syscr(current);
inc_syscw(current);
- file_end_write(file_out);
-
return ret;
}
EXPORT_SYMBOL(vfs_copy_file_range);
diff --git a/fs/readdir.c b/fs/readdir.c
index c8c46e294431..278bc0254732 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -96,6 +96,10 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
if (res)
goto out;
+ res = fsnotify_file_perm(file, MAY_READ);
+ if (res)
+ goto out;
+
res = down_read_killable(&inode->i_rwsem);
if (res)
goto out;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 994d6e6995ab..7e7b531fcc49 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -451,13 +451,6 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th,
BUG_ON(!th->t_trans_id);
- /* cannot allow items to be added into a busy deleted directory */
- if (!namelen)
- return -EINVAL;
-
- if (namelen > REISERFS_MAX_NAME(dir->i_sb->s_blocksize))
- return -ENAMETOOLONG;
-
/* each entry has unique key. compose it */
make_cpu_key(&entry_key, dir,
get_third_component(dir->i_sb, name, namelen),
@@ -1324,8 +1317,8 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
struct inode *old_inode, *new_dentry_inode;
struct reiserfs_transaction_handle th;
int jbegin_count;
- umode_t old_inode_mode;
unsigned long savelink = 1;
+ bool update_dir_parent = false;
if (flags & ~RENAME_NOREPLACE)
return -EINVAL;
@@ -1375,8 +1368,7 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
return -ENOENT;
}
- old_inode_mode = old_inode->i_mode;
- if (S_ISDIR(old_inode_mode)) {
+ if (S_ISDIR(old_inode->i_mode)) {
/*
* make sure that directory being renamed has correct ".."
* and that its new parent directory has not too many links
@@ -1389,24 +1381,28 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
}
}
- /*
- * directory is renamed, its parent directory will be changed,
- * so find ".." entry
- */
- dot_dot_de.de_gen_number_bit_string = NULL;
- retval =
- reiserfs_find_entry(old_inode, "..", 2, &dot_dot_entry_path,
+ if (old_dir != new_dir) {
+ /*
+ * directory is renamed, its parent directory will be
+ * changed, so find ".." entry
+ */
+ dot_dot_de.de_gen_number_bit_string = NULL;
+ retval =
+ reiserfs_find_entry(old_inode, "..", 2,
+ &dot_dot_entry_path,
&dot_dot_de);
- pathrelse(&dot_dot_entry_path);
- if (retval != NAME_FOUND) {
- reiserfs_write_unlock(old_dir->i_sb);
- return -EIO;
- }
+ pathrelse(&dot_dot_entry_path);
+ if (retval != NAME_FOUND) {
+ reiserfs_write_unlock(old_dir->i_sb);
+ return -EIO;
+ }
- /* inode number of .. must equal old_dir->i_ino */
- if (dot_dot_de.de_objectid != old_dir->i_ino) {
- reiserfs_write_unlock(old_dir->i_sb);
- return -EIO;
+ /* inode number of .. must equal old_dir->i_ino */
+ if (dot_dot_de.de_objectid != old_dir->i_ino) {
+ reiserfs_write_unlock(old_dir->i_sb);
+ return -EIO;
+ }
+ update_dir_parent = true;
}
}
@@ -1486,7 +1482,7 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
reiserfs_prepare_for_journal(old_inode->i_sb, new_de.de_bh, 1);
- if (S_ISDIR(old_inode->i_mode)) {
+ if (update_dir_parent) {
if ((retval =
search_by_entry_key(new_dir->i_sb,
&dot_dot_de.de_entry_key,
@@ -1534,14 +1530,14 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
new_de.de_bh);
reiserfs_restore_prepared_buffer(old_inode->i_sb,
old_de.de_bh);
- if (S_ISDIR(old_inode_mode))
+ if (update_dir_parent)
reiserfs_restore_prepared_buffer(old_inode->
i_sb,
dot_dot_de.
de_bh);
continue;
}
- if (S_ISDIR(old_inode_mode)) {
+ if (update_dir_parent) {
if (item_moved(&dot_dot_ih, &dot_dot_entry_path) ||
!entry_points_to_object("..", 2, &dot_dot_de,
old_dir)) {
@@ -1559,7 +1555,7 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
}
}
- RFALSE(S_ISDIR(old_inode_mode) &&
+ RFALSE(update_dir_parent &&
!buffer_journal_prepared(dot_dot_de.de_bh), "");
break;
@@ -1592,11 +1588,12 @@ static int reiserfs_rename(struct mnt_idmap *idmap,
savelink = new_dentry_inode->i_nlink;
}
- if (S_ISDIR(old_inode_mode)) {
+ if (update_dir_parent) {
/* adjust ".." of renamed directory */
set_ino_in_dir_entry(&dot_dot_de, INODE_PKEY(new_dir));
journal_mark_dirty(&th, dot_dot_de.de_bh);
-
+ }
+ if (S_ISDIR(old_inode->i_mode)) {
/*
* there (in new_dir) was no directory, so it got new link
* (".." of renamed directory)
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index 2138ee7d271d..5faf702f8d15 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1407,7 +1407,7 @@ void reiserfs_delete_solid_item(struct reiserfs_transaction_handle *th,
INITIALIZE_PATH(path);
int item_len = 0;
int tb_init = 0;
- struct cpu_key cpu_key;
+ struct cpu_key cpu_key = {};
int retval;
int quota_cut_bytes = 0;
diff --git a/fs/remap_range.c b/fs/remap_range.c
index 87ae4f0dc3aa..de07f978ce3e 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -102,7 +102,9 @@ static int generic_remap_checks(struct file *file_in, loff_t pos_in,
static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
bool write)
{
+ int mask = write ? MAY_WRITE : MAY_READ;
loff_t tmp;
+ int ret;
if (unlikely(pos < 0 || len < 0))
return -EINVAL;
@@ -110,7 +112,11 @@ static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
if (unlikely(check_add_overflow(pos, len, &tmp)))
return -EINVAL;
- return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
+ ret = security_file_permission(file, mask);
+ if (ret)
+ return ret;
+
+ return fsnotify_file_area_perm(file, mask, &pos, len);
}
/*
@@ -367,9 +373,9 @@ int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
}
EXPORT_SYMBOL(generic_remap_file_range_prep);
-loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags)
+loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t len, unsigned int remap_flags)
{
loff_t ret;
@@ -393,8 +399,10 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
if (ret)
return ret;
+ file_start_write(file_out);
ret = file_in->f_op->remap_file_range(file_in, pos_in,
file_out, pos_out, len, remap_flags);
+ file_end_write(file_out);
if (ret < 0)
return ret;
@@ -402,25 +410,10 @@ loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
fsnotify_modify(file_out);
return ret;
}
-EXPORT_SYMBOL(do_clone_file_range);
-
-loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out,
- loff_t len, unsigned int remap_flags)
-{
- loff_t ret;
-
- file_start_write(file_out);
- ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
- remap_flags);
- file_end_write(file_out);
-
- return ret;
-}
EXPORT_SYMBOL(vfs_clone_file_range);
/* Check whether we are allowed to dedupe the destination file */
-static bool allow_file_dedupe(struct file *file)
+static bool may_dedupe_file(struct file *file)
{
struct mnt_idmap *idmap = file_mnt_idmap(file);
struct inode *inode = file_inode(file);
@@ -445,24 +438,29 @@ loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
REMAP_FILE_CAN_SHORTEN));
- ret = mnt_want_write_file(dst_file);
- if (ret)
- return ret;
-
/*
* This is redundant if called from vfs_dedupe_file_range(), but other
* callers need it and it's not performance sesitive...
*/
ret = remap_verify_area(src_file, src_pos, len, false);
if (ret)
- goto out_drop_write;
+ return ret;
ret = remap_verify_area(dst_file, dst_pos, len, true);
if (ret)
- goto out_drop_write;
+ return ret;
+
+ /*
+ * This needs to be called after remap_verify_area() because of
+ * sb_start_write() and before may_dedupe_file() because the mount's
+ * MAY_WRITE need to be checked with mnt_get_write_access_file() held.
+ */
+ ret = mnt_want_write_file(dst_file);
+ if (ret)
+ return ret;
ret = -EPERM;
- if (!allow_file_dedupe(dst_file))
+ if (!may_dedupe_file(dst_file))
goto out_drop_write;
ret = -EXDEV;
diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c
index d64a306a414b..3de5047a7ff9 100644
--- a/fs/smb/client/cached_dir.c
+++ b/fs/smb/client/cached_dir.c
@@ -145,21 +145,27 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
struct cached_fid *cfid;
struct cached_fids *cfids;
const char *npath;
+ int retries = 0, cur_sleep = 1;
if (tcon == NULL || tcon->cfids == NULL || tcon->nohandlecache ||
is_smb1_server(tcon->ses->server) || (dir_cache_timeout == 0))
return -EOPNOTSUPP;
ses = tcon->ses;
- server = ses->server;
cfids = tcon->cfids;
- if (!server->ops->new_lease_key)
- return -EIO;
-
if (cifs_sb->root == NULL)
return -ENOENT;
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ oplock = SMB2_OPLOCK_LEVEL_II;
+ server = cifs_pick_channel(ses);
+
+ if (!server->ops->new_lease_key)
+ return -EIO;
+
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path)
return -ENOMEM;
@@ -236,6 +242,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
.desired_access = FILE_READ_DATA | FILE_READ_ATTRIBUTES,
.disposition = FILE_OPEN,
.fid = pfid,
+ .replay = !!(retries),
};
rc = SMB2_open_init(tcon, server,
@@ -268,6 +275,11 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon,
*/
cfid->has_lease = true;
+ if (retries) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[1]);
+ }
+
rc = compound_send_recv(xid, ses, server,
flags, 2, rqst,
resp_buftype, rsp_iov);
@@ -367,6 +379,11 @@ out:
atomic_inc(&tcon->num_remote_opens);
}
kfree(utf16_path);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c
index 60027f5aebe8..3e4209f41c18 100644
--- a/fs/smb/client/cifs_debug.c
+++ b/fs/smb/client/cifs_debug.c
@@ -659,6 +659,7 @@ static ssize_t cifs_stats_proc_write(struct file *file,
spin_lock(&tcon->stat_lock);
tcon->bytes_read = 0;
tcon->bytes_written = 0;
+ tcon->stats_from_time = ktime_get_real_seconds();
spin_unlock(&tcon->stat_lock);
if (server->ops->clear_stats)
server->ops->clear_stats(tcon);
@@ -737,8 +738,9 @@ static int cifs_stats_proc_show(struct seq_file *m, void *v)
seq_printf(m, "\n%d) %s", i, tcon->tree_name);
if (tcon->need_reconnect)
seq_puts(m, "\tDISCONNECTED ");
- seq_printf(m, "\nSMBs: %d",
- atomic_read(&tcon->num_smbs_sent));
+ seq_printf(m, "\nSMBs: %d since %ptTs UTC",
+ atomic_read(&tcon->num_smbs_sent),
+ &tcon->stats_from_time);
if (server->ops->print_stats)
server->ops->print_stats(m, tcon);
}
diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c
index ef4c2e3c9fa6..6322f0f68a17 100644
--- a/fs/smb/client/cifsencrypt.c
+++ b/fs/smb/client/cifsencrypt.c
@@ -572,7 +572,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash,
len = cifs_strtoUTF16(user, ses->user_name, len, nls_cp);
UniStrupr(user);
} else {
- memset(user, '\0', 2);
+ *(u16 *)user = 0;
}
rc = crypto_shash_update(ses->server->secmech.hmacmd5,
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index 2131638f26d0..0c269396ae15 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -25,6 +25,7 @@
#include <linux/freezer.h>
#include <linux/namei.h>
#include <linux/random.h>
+#include <linux/splice.h>
#include <linux/uuid.h>
#include <linux/xattr.h>
#include <uapi/linux/magic.h>
@@ -395,7 +396,7 @@ cifs_alloc_inode(struct super_block *sb)
spin_lock_init(&cifs_inode->writers_lock);
cifs_inode->writers = 0;
cifs_inode->netfs.inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
- cifs_inode->server_eof = 0;
+ cifs_inode->netfs.remote_i_size = 0;
cifs_inode->uniqueid = 0;
cifs_inode->createtime = 0;
cifs_inode->epoch = 0;
@@ -429,7 +430,7 @@ static void
cifs_evict_inode(struct inode *inode)
{
truncate_inode_pages_final(&inode->i_data);
- if (inode->i_state & I_PINNING_FSCACHE_WB)
+ if (inode->i_state & I_PINNING_NETFS_WB)
cifs_fscache_unuse_inode_cookie(inode, true);
cifs_fscache_release_inode_cookie(inode);
clear_inode(inode);
@@ -680,6 +681,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
seq_printf(s, ",rasize=%u", cifs_sb->ctx->rasize);
if (tcon->ses->server->min_offload)
seq_printf(s, ",esize=%u", tcon->ses->server->min_offload);
+ if (tcon->ses->server->retrans)
+ seq_printf(s, ",retrans=%u", tcon->ses->server->retrans);
seq_printf(s, ",echo_interval=%lu",
tcon->ses->server->echo_interval / HZ);
@@ -792,8 +795,7 @@ static int cifs_show_stats(struct seq_file *s, struct dentry *root)
static int cifs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
- fscache_unpin_writeback(wbc, cifs_inode_cookie(inode));
- return 0;
+ return netfs_unpin_writeback(inode, wbc);
}
static int cifs_drop_inode(struct inode *inode)
@@ -1170,6 +1172,9 @@ const char *cifs_get_link(struct dentry *dentry, struct inode *inode,
{
char *target_path;
+ if (!dentry)
+ return ERR_PTR(-ECHILD);
+
target_path = kmalloc(PATH_MAX, GFP_KERNEL);
if (!target_path)
return ERR_PTR(-ENOMEM);
@@ -1221,7 +1226,7 @@ static int cifs_precopy_set_eof(struct inode *src_inode, struct cifsInodeInfo *s
if (rc < 0)
goto set_failed;
- netfs_resize_file(&src_cifsi->netfs, src_end);
+ netfs_resize_file(&src_cifsi->netfs, src_end, true);
fscache_resize_cookie(cifs_inode_cookie(src_inode), src_end);
return 0;
@@ -1352,7 +1357,7 @@ static loff_t cifs_remap_file_range(struct file *src_file, loff_t off,
smb_file_src, smb_file_target, off, len, destoff);
if (rc == 0 && new_size > i_size_read(target_inode)) {
truncate_setsize(target_inode, new_size);
- netfs_resize_file(&target_cifsi->netfs, new_size);
+ netfs_resize_file(&target_cifsi->netfs, new_size, true);
fscache_resize_cookie(cifs_inode_cookie(target_inode),
new_size);
}
@@ -1378,6 +1383,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
struct inode *src_inode = file_inode(src_file);
struct inode *target_inode = file_inode(dst_file);
struct cifsInodeInfo *src_cifsi = CIFS_I(src_inode);
+ struct cifsInodeInfo *target_cifsi = CIFS_I(target_inode);
struct cifsFileInfo *smb_file_src;
struct cifsFileInfo *smb_file_target;
struct cifs_tcon *src_tcon;
@@ -1426,7 +1432,7 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
* Advance the EOF marker after the flush above to the end of the range
* if it's short of that.
*/
- if (src_cifsi->server_eof < off + len) {
+ if (src_cifsi->netfs.remote_i_size < off + len) {
rc = cifs_precopy_set_eof(src_inode, src_cifsi, src_tcon, xid, off + len);
if (rc < 0)
goto unlock;
@@ -1450,12 +1456,22 @@ ssize_t cifs_file_copychunk_range(unsigned int xid,
/* Discard all the folios that overlap the destination region. */
truncate_inode_pages_range(&target_inode->i_data, fstart, fend);
+ fscache_invalidate(cifs_inode_cookie(target_inode), NULL,
+ i_size_read(target_inode), 0);
+
rc = file_modified(dst_file);
if (!rc) {
rc = target_tcon->ses->server->ops->copychunk_range(xid,
smb_file_src, smb_file_target, off, len, destoff);
- if (rc > 0 && destoff + rc > i_size_read(target_inode))
+ if (rc > 0 && destoff + rc > i_size_read(target_inode)) {
truncate_setsize(target_inode, destoff + rc);
+ netfs_resize_file(&target_cifsi->netfs,
+ i_size_read(target_inode), true);
+ fscache_resize_cookie(cifs_inode_cookie(target_inode),
+ i_size_read(target_inode));
+ }
+ if (rc > 0 && destoff + rc > target_cifsi->netfs.zero_point)
+ target_cifsi->netfs.zero_point = destoff + rc;
}
file_accessed(src_file);
@@ -1506,8 +1522,8 @@ static ssize_t cifs_copy_file_range(struct file *src_file, loff_t off,
free_xid(xid);
if (rc == -EOPNOTSUPP || rc == -EXDEV)
- rc = generic_copy_file_range(src_file, off, dst_file,
- destoff, len, flags);
+ rc = splice_copy_file_range(src_file, off, dst_file,
+ destoff, len);
return rc;
}
diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h
index 3adea10aa9da..685f7d1139c6 100644
--- a/fs/smb/client/cifsfs.h
+++ b/fs/smb/client/cifsfs.h
@@ -152,6 +152,6 @@ extern const struct export_operations cifs_export_ops;
#endif /* CONFIG_CIFS_NFSD_EXPORT */
/* when changing internal version - update following two lines at same time */
-#define SMB3_PRODUCT_BUILD 46
-#define CIFS_VERSION "2.46"
+#define SMB3_PRODUCT_BUILD 47
+#define CIFS_VERSION "2.47"
#endif /* _CIFSFS_H */
diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h
index 5e32c79f03a7..53c75cfb33ab 100644
--- a/fs/smb/client/cifsglob.h
+++ b/fs/smb/client/cifsglob.h
@@ -50,6 +50,11 @@
#define CIFS_DEF_ACTIMEO (1 * HZ)
/*
+ * max sleep time before retry to server
+ */
+#define CIFS_MAX_SLEEP 2000
+
+/*
* max attribute cache timeout (jiffies) - 2^30
*/
#define CIFS_MAX_ACTIMEO (1 << 30)
@@ -82,7 +87,7 @@
#define SMB_INTERFACE_POLL_INTERVAL 600
/* maximum number of PDUs in one compound */
-#define MAX_COMPOUND 5
+#define MAX_COMPOUND 7
/*
* Default number of credits to keep available for SMB3.
@@ -192,6 +197,11 @@ struct cifs_open_info_data {
bool symlink;
};
struct {
+ /* ioctl response buffer */
+ struct {
+ int buftype;
+ struct kvec iov;
+ } io;
__u32 tag;
union {
struct reparse_data_buffer *buf;
@@ -199,19 +209,25 @@ struct cifs_open_info_data {
};
} reparse;
char *symlink_target;
+ struct cifs_sid posix_owner;
+ struct cifs_sid posix_group;
union {
struct smb2_file_all_info fi;
struct smb311_posix_qinfo posix_fi;
};
};
-#define cifs_open_data_reparse(d) \
- ((d)->reparse_point || \
- (le32_to_cpu((d)->fi.Attributes) & ATTR_REPARSE))
-
-static inline void cifs_free_open_info(struct cifs_open_info_data *data)
+static inline bool cifs_open_data_reparse(struct cifs_open_info_data *data)
{
- kfree(data->symlink_target);
+ struct smb2_file_all_info *fi = &data->fi;
+ u32 attrs = le32_to_cpu(fi->Attributes);
+ bool ret;
+
+ ret = data->reparse_point || (attrs & ATTR_REPARSE);
+ if (ret)
+ attrs |= ATTR_REPARSE;
+ fi->Attributes = cpu_to_le32(attrs);
+ return ret;
}
/*
@@ -390,12 +406,17 @@ struct smb_version_operations {
int (*rename_pending_delete)(const char *, struct dentry *,
const unsigned int);
/* send rename request */
- int (*rename)(const unsigned int, struct cifs_tcon *, const char *,
- const char *, struct cifs_sb_info *);
+ int (*rename)(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
/* send create hardlink request */
- int (*create_hardlink)(const unsigned int, struct cifs_tcon *,
- const char *, const char *,
- struct cifs_sb_info *);
+ int (*create_hardlink)(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
/* query symlink target */
int (*query_symlink)(const unsigned int xid,
struct cifs_tcon *tcon,
@@ -560,6 +581,12 @@ struct smb_version_operations {
int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb,
struct kvec *rsp_iov,
struct cifs_open_info_data *data);
+ int (*create_reparse_symlink)(const unsigned int xid,
+ struct inode *inode,
+ struct dentry *dentry,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ const char *symname);
};
struct smb_version_values {
@@ -731,6 +758,7 @@ struct TCP_Server_Info {
unsigned int max_read;
unsigned int max_write;
unsigned int min_offload;
+ unsigned int retrans;
__le16 compress_algorithm;
__u16 signing_algorithm;
__le16 cipher_type;
@@ -1004,6 +1032,8 @@ struct cifs_chan {
__u8 signkey[SMB3_SIGN_KEY_SIZE];
};
+#define CIFS_SES_FLAG_SCALE_CHANNELS (0x1)
+
/*
* Session structure. One of these for each uid session with a particular host
*/
@@ -1036,6 +1066,7 @@ struct cifs_ses {
enum securityEnum sectype; /* what security flavor was specified? */
bool sign; /* is signing required? */
bool domainAuto:1;
+ unsigned int flags;
__u16 session_flags;
__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
__u8 smb3encryptionkey[SMB3_ENC_DEC_KEY_SIZE];
@@ -1187,6 +1218,7 @@ struct cifs_tcon {
__u64 bytes_read;
__u64 bytes_written;
spinlock_t stat_lock; /* protects the two fields above */
+ time64_t stats_from_time;
FILE_SYSTEM_DEVICE_INFO fsDevInfo;
FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */
FILE_SYSTEM_UNIX_INFO fsUnixInfo;
@@ -1346,6 +1378,7 @@ struct cifs_open_parms {
struct cifs_fid *fid;
umode_t mode;
bool reconnect:1;
+ bool replay:1; /* indicates that this open is for a replay */
};
struct cifs_fid {
@@ -1477,6 +1510,7 @@ struct cifs_writedata {
struct smbd_mr *mr;
#endif
struct cifs_credits credits;
+ bool replay;
};
/*
@@ -1537,7 +1571,6 @@ struct cifsInodeInfo {
spinlock_t writers_lock;
unsigned int writers; /* Number of writers on this inode */
unsigned long time; /* jiffies of last update of inode */
- u64 server_eof; /* current file size on server -- protected by i_lock */
u64 uniqueid; /* server inode number */
u64 createtime; /* creation time on server */
__u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for this inode */
@@ -1545,6 +1578,7 @@ struct cifsInodeInfo {
spinlock_t deferred_lock; /* protection on deferred list */
bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */
char *symlink_target;
+ __u32 reparse_tag;
};
static inline struct cifsInodeInfo *
@@ -1806,6 +1840,13 @@ static inline bool is_retryable_error(int error)
return false;
}
+static inline bool is_replayable_error(int error)
+{
+ if (error == -EAGAIN || error == -ECONNABORTED)
+ return true;
+ return false;
+}
+
/* cifs_get_writable_file() flags */
#define FIND_WR_ANY 0
@@ -2238,8 +2279,8 @@ static inline void cifs_sg_set_buf(struct sg_table *sgtable,
struct smb2_compound_vars {
struct cifs_open_parms oparms;
- struct kvec rsp_iov[3];
- struct smb_rqst rqst[3];
+ struct kvec rsp_iov[MAX_COMPOUND];
+ struct smb_rqst rqst[MAX_COMPOUND];
struct kvec open_iov[SMB2_CREATE_IOV_SIZE];
struct kvec qi_iov;
struct kvec io_iov[SMB2_IOCTL_IOV_SIZE];
diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h
index 46feaa0880bd..a841bf4967fa 100644
--- a/fs/smb/client/cifsproto.h
+++ b/fs/smb/client/cifsproto.h
@@ -211,8 +211,12 @@ int cifs_get_inode_info(struct inode **inode, const char *full_path,
bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
struct cifs_fattr *fattr,
struct cifs_open_info_data *data);
-extern int smb311_posix_get_inode_info(struct inode **pinode, const char *search_path,
- struct super_block *sb, unsigned int xid);
+
+extern int smb311_posix_get_inode_info(struct inode **inode,
+ const char *full_path,
+ struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid);
extern int cifs_get_inode_info_unix(struct inode **pinode,
const unsigned char *search_path,
struct super_block *sb, unsigned int xid);
@@ -435,16 +439,19 @@ extern int CIFSPOSIXDelFile(const unsigned int xid, struct cifs_tcon *tcon,
int remap_special_chars);
extern int CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon,
const char *name, struct cifs_sb_info *cifs_sb);
-extern int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb);
+int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
extern int CIFSSMBRenameOpenFile(const unsigned int xid, struct cifs_tcon *tcon,
int netfid, const char *target_name,
const struct nls_table *nls_codepage,
int remap_special_chars);
-extern int CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb);
+int CIFSCreateHardLink(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
extern int CIFSUnixCreateHardLink(const unsigned int xid,
struct cifs_tcon *tcon,
const char *fromName, const char *toName,
@@ -649,7 +656,7 @@ cifs_chan_is_iface_active(struct cifs_ses *ses,
struct TCP_Server_Info *server);
void
cifs_disable_secondary_channels(struct cifs_ses *ses);
-int
+void
cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server);
int
SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon, bool in_mount);
@@ -760,4 +767,11 @@ static inline void release_mid(struct mid_q_entry *mid)
kref_put(&mid->refcount, __release_mid);
}
+static inline void cifs_free_open_info(struct cifs_open_info_data *data)
+{
+ kfree(data->symlink_target);
+ free_rsp_buf(data->reparse.io.buftype, data->reparse.io.iov.iov_base);
+ memset(data, 0, sizeof(*data));
+}
+
#endif /* _CIFSPROTO_H */
diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c
index 9ee348e6d106..01e89070df5a 100644
--- a/fs/smb/client/cifssmb.c
+++ b/fs/smb/client/cifssmb.c
@@ -2149,10 +2149,10 @@ CIFSSMBFlush(const unsigned int xid, struct cifs_tcon *tcon, int smb_file_id)
return rc;
}
-int
-CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb)
+int CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb)
{
int rc = 0;
RENAME_REQ *pSMB = NULL;
@@ -2530,10 +2530,11 @@ createHardLinkRetry:
return rc;
}
-int
-CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb)
+int CIFSCreateHardLink(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb)
{
int rc = 0;
NT_RENAME_REQ *pSMB = NULL;
@@ -2699,11 +2700,12 @@ int cifs_query_reparse_point(const unsigned int xid,
u32 *tag, struct kvec *rsp,
int *rsp_buftype)
{
+ struct reparse_data_buffer *buf;
struct cifs_open_parms oparms;
TRANSACT_IOCTL_REQ *io_req = NULL;
TRANSACT_IOCTL_RSP *io_rsp = NULL;
struct cifs_fid fid;
- __u32 data_offset, data_count;
+ __u32 data_offset, data_count, len;
__u8 *start, *end;
int io_rsp_len;
int oplock = 0;
@@ -2773,7 +2775,16 @@ int cifs_query_reparse_point(const unsigned int xid,
goto error;
}
- *tag = le32_to_cpu(((struct reparse_data_buffer *)start)->ReparseTag);
+ data_count = le16_to_cpu(io_rsp->ByteCount);
+ buf = (struct reparse_data_buffer *)start;
+ len = sizeof(*buf);
+ if (data_count < len ||
+ data_count < le16_to_cpu(buf->ReparseDataLength) + len) {
+ rc = -EIO;
+ goto error;
+ }
+
+ *tag = le32_to_cpu(buf->ReparseTag);
rsp->iov_base = io_rsp;
rsp->iov_len = io_rsp_len;
*rsp_buftype = CIFS_LARGE_BUFFER;
diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c
index dc9b95ca71e6..ac9595504f4b 100644
--- a/fs/smb/client/connect.c
+++ b/fs/smb/client/connect.c
@@ -233,6 +233,12 @@ cifs_mark_tcp_ses_conns_for_reconnect(struct TCP_Server_Info *server,
list_for_each_entry_safe(ses, nses, &pserver->smb_ses_list, smb_ses_list) {
/* check if iface is still active */
spin_lock(&ses->chan_lock);
+ if (cifs_ses_get_chan_index(ses, server) ==
+ CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ continue;
+ }
+
if (!cifs_chan_is_iface_active(ses, server)) {
spin_unlock(&ses->chan_lock);
cifs_chan_update_iface(ses, server);
@@ -483,6 +489,7 @@ static int reconnect_target_unlocked(struct TCP_Server_Info *server, struct dfs_
static int reconnect_dfs_server(struct TCP_Server_Info *server)
{
struct dfs_cache_tgt_iterator *target_hint = NULL;
+
DFS_CACHE_TGT_LIST(tl);
int num_targets = 0;
int rc = 0;
@@ -745,6 +752,7 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
{
struct msghdr smb_msg = {};
struct kvec iov = {.iov_base = buf, .iov_len = to_read};
+
iov_iter_kvec(&smb_msg.msg_iter, ITER_DEST, &iov, 1, to_read);
return cifs_readv_from_socket(server, &smb_msg);
@@ -1400,11 +1408,13 @@ cifs_match_ipaddr(struct sockaddr *srcaddr, struct sockaddr *rhs)
case AF_INET: {
struct sockaddr_in *saddr4 = (struct sockaddr_in *)srcaddr;
struct sockaddr_in *vaddr4 = (struct sockaddr_in *)rhs;
+
return (saddr4->sin_addr.s_addr == vaddr4->sin_addr.s_addr);
}
case AF_INET6: {
struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr;
struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs;
+
return (ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr)
&& saddr6->sin6_scope_id == vaddr6->sin6_scope_id);
}
@@ -1570,6 +1580,9 @@ static int match_server(struct TCP_Server_Info *server,
if (server->min_offload != ctx->min_offload)
return 0;
+ if (server->retrans != ctx->retrans)
+ return 0;
+
return 1;
}
@@ -1794,6 +1807,7 @@ smbd_connected:
goto out_err_crypto_release;
}
tcp_ses->min_offload = ctx->min_offload;
+ tcp_ses->retrans = ctx->retrans;
/*
* at this point we are the only ones with the pointer
* to the struct since the kernel thread not created yet
@@ -2599,8 +2613,8 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx)
rc = -EOPNOTSUPP;
goto out_fail;
} else {
- cifs_dbg(VFS, "Check vers= mount option. SMB3.11 "
- "disabled but required for POSIX extensions\n");
+ cifs_dbg(VFS,
+ "Check vers= mount option. SMB3.11 disabled but required for POSIX extensions\n");
rc = -EOPNOTSUPP;
goto out_fail;
}
@@ -2743,7 +2757,6 @@ cifs_put_tlink(struct tcon_link *tlink)
if (!IS_ERR(tlink_tcon(tlink)))
cifs_put_tcon(tlink_tcon(tlink));
kfree(tlink);
- return;
}
static int
@@ -2884,6 +2897,7 @@ static inline void
cifs_reclassify_socket4(struct socket *sock)
{
struct sock *sk = sock->sk;
+
BUG_ON(!sock_allow_reclassification(sk));
sock_lock_init_class_and_name(sk, "slock-AF_INET-CIFS",
&cifs_slock_key[0], "sk_lock-AF_INET-CIFS", &cifs_key[0]);
@@ -2893,6 +2907,7 @@ static inline void
cifs_reclassify_socket6(struct socket *sock)
{
struct sock *sk = sock->sk;
+
BUG_ON(!sock_allow_reclassification(sk));
sock_lock_init_class_and_name(sk, "slock-AF_INET6-CIFS",
&cifs_slock_key[1], "sk_lock-AF_INET6-CIFS", &cifs_key[1]);
@@ -2927,15 +2942,18 @@ static int
bind_socket(struct TCP_Server_Info *server)
{
int rc = 0;
+
if (server->srcaddr.ss_family != AF_UNSPEC) {
/* Bind to the specified local IP address */
struct socket *socket = server->ssocket;
+
rc = kernel_bind(socket,
(struct sockaddr *) &server->srcaddr,
sizeof(server->srcaddr));
if (rc < 0) {
struct sockaddr_in *saddr4;
struct sockaddr_in6 *saddr6;
+
saddr4 = (struct sockaddr_in *)&server->srcaddr;
saddr6 = (struct sockaddr_in6 *)&server->srcaddr;
if (saddr6->sin6_family == AF_INET6)
@@ -3165,6 +3183,7 @@ void reset_cifs_unix_caps(unsigned int xid, struct cifs_tcon *tcon,
if (!CIFSSMBQFSUnixInfo(xid, tcon)) {
__u64 cap = le64_to_cpu(tcon->fsUnixInfo.Capability);
+
cifs_dbg(FYI, "unix caps which server supports %lld\n", cap);
/*
* check for reconnect case in which we do not
@@ -3425,8 +3444,18 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx)
* the user on mount
*/
if ((cifs_sb->ctx->wsize == 0) ||
- (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx)))
- cifs_sb->ctx->wsize = server->ops->negotiate_wsize(tcon, ctx);
+ (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx))) {
+ cifs_sb->ctx->wsize =
+ round_down(server->ops->negotiate_wsize(tcon, ctx), PAGE_SIZE);
+ /*
+ * in the very unlikely event that the server sent a max write size under PAGE_SIZE,
+ * (which would get rounded down to 0) then reset wsize to absolute minimum eg 4096
+ */
+ if (cifs_sb->ctx->wsize == 0) {
+ cifs_sb->ctx->wsize = PAGE_SIZE;
+ cifs_dbg(VFS, "wsize too small, reset to minimum ie PAGE_SIZE, usually 4096\n");
+ }
+ }
if ((cifs_sb->ctx->rsize == 0) ||
(cifs_sb->ctx->rsize > server->ops->negotiate_rsize(tcon, ctx)))
cifs_sb->ctx->rsize = server->ops->negotiate_rsize(tcon, ctx);
@@ -3668,7 +3697,7 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
smb_buffer_response = smb_buffer;
header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX,
- NULL /*no tid */ , 4 /*wct */ );
+ NULL /*no tid */, 4 /*wct */);
smb_buffer->Mid = get_next_mid(ses->server);
smb_buffer->Uid = ses->Suid;
@@ -3687,12 +3716,12 @@ CIFSTCon(const unsigned int xid, struct cifs_ses *ses,
if (ses->server->sign)
smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE;
- if (ses->capabilities & CAP_STATUS32) {
+ if (ses->capabilities & CAP_STATUS32)
smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS;
- }
- if (ses->capabilities & CAP_DFS) {
+
+ if (ses->capabilities & CAP_DFS)
smb_buffer->Flags2 |= SMBFLG2_DFS;
- }
+
if (ses->capabilities & CAP_UNICODE) {
smb_buffer->Flags2 |= SMBFLG2_UNICODE;
length =
@@ -4215,6 +4244,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
/* only send once per connect */
spin_lock(&tcon->tc_lock);
+
+ /* if tcon is marked for needing reconnect, update state */
+ if (tcon->need_reconnect)
+ tcon->status = TID_NEED_TCON;
+
if (tcon->status == TID_GOOD) {
spin_unlock(&tcon->tc_lock);
return 0;
diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c
index a8a1d386da65..449c59830039 100644
--- a/fs/smb/client/dfs.c
+++ b/fs/smb/client/dfs.c
@@ -565,6 +565,11 @@ int cifs_tree_connect(const unsigned int xid, struct cifs_tcon *tcon, const stru
/* only send once per connect */
spin_lock(&tcon->tc_lock);
+
+ /* if tcon is marked for needing reconnect, update state */
+ if (tcon->need_reconnect)
+ tcon->status = TID_NEED_TCON;
+
if (tcon->status == TID_GOOD) {
spin_unlock(&tcon->tc_lock);
return 0;
@@ -625,8 +630,8 @@ out:
spin_lock(&tcon->tc_lock);
if (tcon->status == TID_IN_TCON)
tcon->status = TID_GOOD;
- spin_unlock(&tcon->tc_lock);
tcon->need_reconnect = false;
+ spin_unlock(&tcon->tc_lock);
}
return rc;
diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c
index 580a27a3a7e6..89333d9bce36 100644
--- a/fs/smb/client/dir.c
+++ b/fs/smb/client/dir.c
@@ -680,9 +680,10 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry,
full_path, d_inode(direntry));
again:
- if (pTcon->posix_extensions)
- rc = smb311_posix_get_inode_info(&newInode, full_path, parent_dir_inode->i_sb, xid);
- else if (pTcon->unix_ext) {
+ if (pTcon->posix_extensions) {
+ rc = smb311_posix_get_inode_info(&newInode, full_path, NULL,
+ parent_dir_inode->i_sb, xid);
+ } else if (pTcon->unix_ext) {
rc = cifs_get_inode_info_unix(&newInode, full_path,
parent_dir_inode->i_sb, xid);
} else {
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 32a8525415d9..f391c9b803d8 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -87,7 +87,7 @@ void cifs_pages_written_back(struct inode *inode, loff_t start, unsigned int len
continue;
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio_index(folio), end);
+ len, start, folio->index, end);
continue;
}
@@ -120,7 +120,7 @@ void cifs_pages_write_failed(struct inode *inode, loff_t start, unsigned int len
continue;
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio_index(folio), end);
+ len, start, folio->index, end);
continue;
}
@@ -151,7 +151,7 @@ void cifs_pages_write_redirty(struct inode *inode, loff_t start, unsigned int le
xas_for_each(&xas, folio, end) {
if (!folio_test_writeback(folio)) {
WARN_ONCE(1, "bad %x @%llx page %lx %lx\n",
- len, start, folio_index(folio), end);
+ len, start, folio->index, end);
continue;
}
@@ -175,6 +175,9 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
/* only send once per connect */
spin_lock(&tcon->tc_lock);
+ if (tcon->need_reconnect)
+ tcon->status = TID_NEED_RECON;
+
if (tcon->status != TID_NEED_RECON) {
spin_unlock(&tcon->tc_lock);
return;
@@ -1020,14 +1023,16 @@ reopen_success:
if (!is_interrupt_error(rc))
mapping_set_error(inode->i_mapping, rc);
- if (tcon->posix_extensions)
- rc = smb311_posix_get_inode_info(&inode, full_path, inode->i_sb, xid);
- else if (tcon->unix_ext)
+ if (tcon->posix_extensions) {
+ rc = smb311_posix_get_inode_info(&inode, full_path,
+ NULL, inode->i_sb, xid);
+ } else if (tcon->unix_ext) {
rc = cifs_get_inode_info_unix(&inode, full_path,
inode->i_sb, xid);
- else
+ } else {
rc = cifs_get_inode_info(&inode, full_path, NULL,
inode->i_sb, xid, NULL);
+ }
}
/*
* Else we are writing out data to server already and could deadlock if
@@ -2118,8 +2123,8 @@ cifs_update_eof(struct cifsInodeInfo *cifsi, loff_t offset,
{
loff_t end_of_write = offset + bytes_written;
- if (end_of_write > cifsi->server_eof)
- cifsi->server_eof = end_of_write;
+ if (end_of_write > cifsi->netfs.remote_i_size)
+ netfs_resize_file(&cifsi->netfs, end_of_write, true);
}
static ssize_t
@@ -2649,7 +2654,7 @@ static void cifs_extend_writeback(struct address_space *mapping,
continue;
if (xa_is_value(folio))
break;
- if (folio_index(folio) != index)
+ if (folio->index != index)
break;
if (!folio_try_get_rcu(folio)) {
xas_reset(&xas);
@@ -2706,8 +2711,7 @@ static void cifs_extend_writeback(struct address_space *mapping,
*/
if (!folio_clear_dirty_for_io(folio))
WARN_ON(1);
- if (folio_start_writeback(folio))
- WARN_ON(1);
+ folio_start_writeback(folio);
*_count -= folio_nr_pages(folio);
folio_unlock(folio);
@@ -2742,8 +2746,7 @@ static ssize_t cifs_write_back_from_locked_folio(struct address_space *mapping,
int rc;
/* The folio should be locked, dirty and not undergoing writeback. */
- if (folio_start_writeback(folio))
- WARN_ON(1);
+ folio_start_writeback(folio);
count -= folio_nr_pages(folio);
len = folio_size(folio);
@@ -2899,7 +2902,7 @@ redo_folio:
goto skip_write;
}
- if (folio_mapping(folio) != mapping ||
+ if (folio->mapping != mapping ||
!folio_test_dirty(folio)) {
start += folio_size(folio);
folio_unlock(folio);
@@ -3247,8 +3250,8 @@ cifs_uncached_writev_complete(struct work_struct *work)
spin_lock(&inode->i_lock);
cifs_update_eof(cifsi, wdata->offset, wdata->bytes);
- if (cifsi->server_eof > inode->i_size)
- i_size_write(inode, cifsi->server_eof);
+ if (cifsi->netfs.remote_i_size > inode->i_size)
+ i_size_write(inode, cifsi->netfs.remote_i_size);
spin_unlock(&inode->i_lock);
complete(&wdata->done);
@@ -3300,6 +3303,7 @@ cifs_resend_wdata(struct cifs_writedata *wdata, struct list_head *wdata_list,
if (wdata->cfile->invalidHandle)
rc = -EAGAIN;
else {
+ wdata->replay = true;
#ifdef CONFIG_CIFS_SMB_DIRECT
if (wdata->mr) {
wdata->mr->need_invalidate = true;
@@ -5043,27 +5047,13 @@ static void cifs_swap_deactivate(struct file *file)
/* do we need to unpin (or unlock) the file */
}
-/*
- * Mark a page as having been made dirty and thus needing writeback. We also
- * need to pin the cache object to write back to.
- */
-#ifdef CONFIG_CIFS_FSCACHE
-static bool cifs_dirty_folio(struct address_space *mapping, struct folio *folio)
-{
- return fscache_dirty_folio(mapping, folio,
- cifs_inode_cookie(mapping->host));
-}
-#else
-#define cifs_dirty_folio filemap_dirty_folio
-#endif
-
const struct address_space_operations cifs_addr_ops = {
.read_folio = cifs_read_folio,
.readahead = cifs_readahead,
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .dirty_folio = cifs_dirty_folio,
+ .dirty_folio = netfs_dirty_folio,
.release_folio = cifs_release_folio,
.direct_IO = cifs_direct_io,
.invalidate_folio = cifs_invalidate_folio,
@@ -5087,7 +5077,7 @@ const struct address_space_operations cifs_addr_ops_smallbuf = {
.writepages = cifs_writepages,
.write_begin = cifs_write_begin,
.write_end = cifs_write_end,
- .dirty_folio = cifs_dirty_folio,
+ .dirty_folio = netfs_dirty_folio,
.release_folio = cifs_release_folio,
.invalidate_folio = cifs_invalidate_folio,
.launder_folio = cifs_launder_folio,
diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c
index a3493da12ad1..4b2f5aa2ea0e 100644
--- a/fs/smb/client/fs_context.c
+++ b/fs/smb/client/fs_context.c
@@ -139,6 +139,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = {
fsparam_u32("dir_mode", Opt_dirmode),
fsparam_u32("port", Opt_port),
fsparam_u32("min_enc_offload", Opt_min_enc_offload),
+ fsparam_u32("retrans", Opt_retrans),
fsparam_u32("esize", Opt_min_enc_offload),
fsparam_u32("bsize", Opt_blocksize),
fsparam_u32("rasize", Opt_rasize),
@@ -210,7 +211,7 @@ cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_c
switch (match_token(value, cifs_secflavor_tokens, args)) {
case Opt_sec_krb5p:
- cifs_errorf(fc, "sec=krb5p is not supported!\n");
+ cifs_errorf(fc, "sec=krb5p is not supported. Use sec=krb5,seal instead\n");
return 1;
case Opt_sec_krb5i:
ctx->sign = true;
@@ -1064,6 +1065,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_min_enc_offload:
ctx->min_offload = result.uint_32;
break;
+ case Opt_retrans:
+ ctx->retrans = result.uint_32;
+ break;
case Opt_blocksize:
/*
* inode blocksize realistically should never need to be
@@ -1107,6 +1111,17 @@ static int smb3_fs_context_parse_param(struct fs_context *fc,
case Opt_wsize:
ctx->wsize = result.uint_32;
ctx->got_wsize = true;
+ if (ctx->wsize % PAGE_SIZE != 0) {
+ ctx->wsize = round_down(ctx->wsize, PAGE_SIZE);
+ if (ctx->wsize == 0) {
+ ctx->wsize = PAGE_SIZE;
+ cifs_dbg(VFS, "wsize too small, reset to minimum %ld\n", PAGE_SIZE);
+ } else {
+ cifs_dbg(VFS,
+ "wsize rounded down to %d to multiple of PAGE_SIZE %ld\n",
+ ctx->wsize, PAGE_SIZE);
+ }
+ }
break;
case Opt_acregmax:
ctx->acregmax = HZ * result.uint_32;
@@ -1619,6 +1634,8 @@ int smb3_init_fs_context(struct fs_context *fc)
ctx->backupuid_specified = false; /* no backup intent for a user */
ctx->backupgid_specified = false; /* no backup intent for a group */
+ ctx->retrans = 1;
+
/*
* short int override_uid = -1;
* short int override_gid = -1;
diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h
index cf46916286d0..182ce11cbe93 100644
--- a/fs/smb/client/fs_context.h
+++ b/fs/smb/client/fs_context.h
@@ -118,6 +118,7 @@ enum cifs_param {
Opt_file_mode,
Opt_dirmode,
Opt_min_enc_offload,
+ Opt_retrans,
Opt_blocksize,
Opt_rasize,
Opt_rsize,
@@ -245,6 +246,7 @@ struct smb3_fs_context {
unsigned int rsize;
unsigned int wsize;
unsigned int min_offload;
+ unsigned int retrans;
bool sockopt_tcp_nodelay:1;
/* attribute cache timemout for files and directories in jiffies */
unsigned long acregmax;
diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c
index e5cad149f5a2..c4a3cb736881 100644
--- a/fs/smb/client/fscache.c
+++ b/fs/smb/client/fscache.c
@@ -180,7 +180,7 @@ static int fscache_fallback_write_pages(struct inode *inode, loff_t start, size_
if (ret < 0)
return ret;
- ret = cres.ops->prepare_write(&cres, &start, &len, i_size_read(inode),
+ ret = cres.ops->prepare_write(&cres, &start, &len, len, i_size_read(inode),
no_space_allocated_yet);
if (ret == 0)
ret = fscache_write(&cres, start, &iter, NULL, NULL);
diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c
index 09c5c0f5c96e..d02f8ba29cb5 100644
--- a/fs/smb/client/inode.c
+++ b/fs/smb/client/inode.c
@@ -104,7 +104,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
fattr->cf_mtime = timestamp_truncate(fattr->cf_mtime, inode);
mtime = inode_get_mtime(inode);
if (timespec64_equal(&mtime, &fattr->cf_mtime) &&
- cifs_i->server_eof == fattr->cf_eof) {
+ cifs_i->netfs.remote_i_size == fattr->cf_eof) {
cifs_dbg(FYI, "%s: inode %llu is unchanged\n",
__func__, cifs_i->uniqueid);
return;
@@ -182,6 +182,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
inode->i_mode = fattr->cf_mode;
cifs_i->cifsAttrs = fattr->cf_cifsattrs;
+ cifs_i->reparse_tag = fattr->cf_cifstag;
if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
cifs_i->time = 0;
@@ -193,7 +194,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
else
clear_bit(CIFS_INO_DELETE_PENDING, &cifs_i->flags);
- cifs_i->server_eof = fattr->cf_eof;
+ cifs_i->netfs.remote_i_size = fattr->cf_eof;
/*
* Can't safely change the file size here if the client is writing to
* it due to potential races.
@@ -209,7 +210,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
inode->i_blocks = (512 - 1 + fattr->cf_bytes) >> 9;
}
- if (S_ISLNK(fattr->cf_mode)) {
+ if (S_ISLNK(fattr->cf_mode) && fattr->cf_symlink_target) {
kfree(cifs_i->symlink_target);
cifs_i->symlink_target = fattr->cf_symlink_target;
fattr->cf_symlink_target = NULL;
@@ -664,8 +665,6 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path,
/* Fill a cifs_fattr struct with info from POSIX info struct */
static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr,
struct cifs_open_info_data *data,
- struct cifs_sid *owner,
- struct cifs_sid *group,
struct super_block *sb)
{
struct smb311_posix_qinfo *info = &data->posix_fi;
@@ -691,31 +690,38 @@ static void smb311_posix_info_to_fattr(struct cifs_fattr *fattr,
fattr->cf_mtime.tv_sec += tcon->ses->server->timeAdj;
}
+ /*
+ * The srv fs device id is overridden on network mount so setting
+ * @fattr->cf_rdev isn't needed here.
+ */
fattr->cf_eof = le64_to_cpu(info->EndOfFile);
fattr->cf_bytes = le64_to_cpu(info->AllocationSize);
fattr->cf_createtime = le64_to_cpu(info->CreationTime);
-
fattr->cf_nlink = le32_to_cpu(info->HardLinks);
fattr->cf_mode = (umode_t) le32_to_cpu(info->Mode);
- /* The srv fs device id is overridden on network mount so setting rdev isn't needed here */
- /* fattr->cf_rdev = le32_to_cpu(info->DeviceId); */
- if (data->symlink) {
- fattr->cf_mode |= S_IFLNK;
- fattr->cf_dtype = DT_LNK;
- fattr->cf_symlink_target = data->symlink_target;
- data->symlink_target = NULL;
- } else if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
+ if (cifs_open_data_reparse(data) &&
+ cifs_reparse_point_to_fattr(cifs_sb, fattr, data))
+ goto out_reparse;
+
+ fattr->cf_mode &= ~S_IFMT;
+ if (fattr->cf_cifsattrs & ATTR_DIRECTORY) {
fattr->cf_mode |= S_IFDIR;
fattr->cf_dtype = DT_DIR;
} else { /* file */
fattr->cf_mode |= S_IFREG;
fattr->cf_dtype = DT_REG;
}
- /* else if reparse point ... TODO: add support for FIFO and blk dev; special file types */
- sid_to_id(cifs_sb, owner, fattr, SIDOWNER);
- sid_to_id(cifs_sb, group, fattr, SIDGROUP);
+out_reparse:
+ if (S_ISLNK(fattr->cf_mode)) {
+ if (likely(data->symlink_target))
+ fattr->cf_eof = strnlen(data->symlink_target, PATH_MAX);
+ fattr->cf_symlink_target = data->symlink_target;
+ data->symlink_target = NULL;
+ }
+ sid_to_id(cifs_sb, &data->posix_owner, fattr, SIDOWNER);
+ sid_to_id(cifs_sb, &data->posix_group, fattr, SIDGROUP);
cifs_dbg(FYI, "POSIX query info: mode 0x%x uniqueid 0x%llx nlink %d\n",
fattr->cf_mode, fattr->cf_uniqueid, fattr->cf_nlink);
@@ -738,25 +744,25 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
if (tag == IO_REPARSE_TAG_NFS && buf) {
switch (le64_to_cpu(buf->InodeType)) {
case NFS_SPECFILE_CHR:
- fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFCHR;
fattr->cf_dtype = DT_CHR;
fattr->cf_rdev = nfs_mkdev(buf);
break;
case NFS_SPECFILE_BLK:
- fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFBLK;
fattr->cf_dtype = DT_BLK;
fattr->cf_rdev = nfs_mkdev(buf);
break;
case NFS_SPECFILE_FIFO:
- fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFIFO;
fattr->cf_dtype = DT_FIFO;
break;
case NFS_SPECFILE_SOCK:
- fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFSOCK;
fattr->cf_dtype = DT_SOCK;
break;
case NFS_SPECFILE_LNK:
- fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
break;
default:
@@ -768,29 +774,29 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb,
switch (tag) {
case IO_REPARSE_TAG_LX_SYMLINK:
- fattr->cf_mode |= S_IFLNK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
break;
case IO_REPARSE_TAG_LX_FIFO:
- fattr->cf_mode |= S_IFIFO | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFIFO;
fattr->cf_dtype = DT_FIFO;
break;
case IO_REPARSE_TAG_AF_UNIX:
- fattr->cf_mode |= S_IFSOCK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFSOCK;
fattr->cf_dtype = DT_SOCK;
break;
case IO_REPARSE_TAG_LX_CHR:
- fattr->cf_mode |= S_IFCHR | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFCHR;
fattr->cf_dtype = DT_CHR;
break;
case IO_REPARSE_TAG_LX_BLK:
- fattr->cf_mode |= S_IFBLK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFBLK;
fattr->cf_dtype = DT_BLK;
break;
case 0: /* SMB1 symlink */
case IO_REPARSE_TAG_SYMLINK:
case IO_REPARSE_TAG_NFS:
- fattr->cf_mode = S_IFLNK | cifs_sb->ctx->file_mode;
+ fattr->cf_mode |= S_IFLNK;
fattr->cf_dtype = DT_LNK;
break;
default:
@@ -830,6 +836,7 @@ static void cifs_open_info_to_fattr(struct cifs_fattr *fattr,
fattr->cf_createtime = le64_to_cpu(info->CreationTime);
fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
+ fattr->cf_mode = cifs_sb->ctx->file_mode;
if (cifs_open_data_reparse(data) &&
cifs_reparse_point_to_fattr(cifs_sb, fattr, data))
goto out_reparse;
@@ -1076,6 +1083,9 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
&rsp_iov, &rsp_buftype);
if (!rc)
iov = &rsp_iov;
+ } else if (data->reparse.io.buftype != CIFS_NO_BUFFER &&
+ data->reparse.io.iov.iov_base) {
+ iov = &data->reparse.io.iov;
}
rc = -EOPNOTSUPP;
@@ -1092,17 +1102,22 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data,
rc = 0;
goto out;
default:
- if (data->symlink_target) {
+ /* Check for cached reparse point data */
+ if (data->symlink_target || data->reparse.buf) {
rc = 0;
- } else if (server->ops->parse_reparse_point) {
+ } else if (iov && server->ops->parse_reparse_point) {
rc = server->ops->parse_reparse_point(cifs_sb,
iov, data);
}
break;
}
- cifs_open_info_to_fattr(fattr, data, sb);
+ if (tcon->posix_extensions)
+ smb311_posix_info_to_fattr(fattr, data, sb);
+ else
+ cifs_open_info_to_fattr(fattr, data, sb);
out:
+ fattr->cf_cifstag = data->reparse.tag;
free_rsp_buf(rsp_buftype, rsp_iov.iov_base);
return rc;
}
@@ -1290,31 +1305,34 @@ out:
return rc;
}
-static int smb311_posix_get_fattr(struct cifs_fattr *fattr,
+static int smb311_posix_get_fattr(struct cifs_open_info_data *data,
+ struct cifs_fattr *fattr,
const char *full_path,
struct super_block *sb,
const unsigned int xid)
{
- struct cifs_open_info_data data = {};
+ struct cifs_open_info_data tmp_data = {};
+ struct TCP_Server_Info *server;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_tcon *tcon;
struct tcon_link *tlink;
- struct cifs_sid owner, group;
int tmprc;
- int rc;
+ int rc = 0;
tlink = cifs_sb_tlink(cifs_sb);
if (IS_ERR(tlink))
return PTR_ERR(tlink);
tcon = tlink_tcon(tlink);
+ server = tcon->ses->server;
/*
- * 1. Fetch file metadata
+ * 1. Fetch file metadata if not provided (data)
*/
-
- rc = smb311_posix_query_path_info(xid, tcon, cifs_sb,
- full_path, &data,
- &owner, &group);
+ if (!data) {
+ rc = server->ops->query_path_info(xid, tcon, cifs_sb,
+ full_path, &tmp_data);
+ data = &tmp_data;
+ }
/*
* 2. Convert it to internal cifs metadata (fattr)
@@ -1322,7 +1340,12 @@ static int smb311_posix_get_fattr(struct cifs_fattr *fattr,
switch (rc) {
case 0:
- smb311_posix_info_to_fattr(fattr, &data, &owner, &group, sb);
+ if (cifs_open_data_reparse(data)) {
+ rc = reparse_info_to_fattr(data, sb, xid, tcon,
+ full_path, fattr);
+ } else {
+ smb311_posix_info_to_fattr(fattr, data, sb);
+ }
break;
case -EREMOTE:
/* DFS link, no metadata available on this server */
@@ -1353,12 +1376,15 @@ static int smb311_posix_get_fattr(struct cifs_fattr *fattr,
out:
cifs_put_tlink(tlink);
- cifs_free_open_info(&data);
+ cifs_free_open_info(data);
return rc;
}
-int smb311_posix_get_inode_info(struct inode **inode, const char *full_path,
- struct super_block *sb, const unsigned int xid)
+int smb311_posix_get_inode_info(struct inode **inode,
+ const char *full_path,
+ struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid)
{
struct cifs_fattr fattr = {};
int rc;
@@ -1368,7 +1394,7 @@ int smb311_posix_get_inode_info(struct inode **inode, const char *full_path,
return 0;
}
- rc = smb311_posix_get_fattr(&fattr, full_path, sb, xid);
+ rc = smb311_posix_get_fattr(data, &fattr, full_path, sb, xid);
if (rc)
goto out;
@@ -1516,7 +1542,7 @@ struct inode *cifs_root_iget(struct super_block *sb)
convert_delimiter(path, CIFS_DIR_SEP(cifs_sb));
if (tcon->posix_extensions)
- rc = smb311_posix_get_fattr(&fattr, path, sb, xid);
+ rc = smb311_posix_get_fattr(NULL, &fattr, path, sb, xid);
else
rc = cifs_get_fattr(NULL, sb, xid, NULL, &fattr, &inode, path);
@@ -1889,16 +1915,18 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode,
int rc = 0;
struct inode *inode = NULL;
- if (tcon->posix_extensions)
- rc = smb311_posix_get_inode_info(&inode, full_path, parent->i_sb, xid);
+ if (tcon->posix_extensions) {
+ rc = smb311_posix_get_inode_info(&inode, full_path,
+ NULL, parent->i_sb, xid);
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
- else if (tcon->unix_ext)
+ } else if (tcon->unix_ext) {
rc = cifs_get_inode_info_unix(&inode, full_path, parent->i_sb,
xid);
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
- else
+ } else {
rc = cifs_get_inode_info(&inode, full_path, NULL, parent->i_sb,
xid, NULL);
+ }
if (rc)
return rc;
@@ -2219,7 +2247,8 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry,
return -ENOSYS;
/* try path-based rename first */
- rc = server->ops->rename(xid, tcon, from_path, to_path, cifs_sb);
+ rc = server->ops->rename(xid, tcon, from_dentry,
+ from_path, to_path, cifs_sb);
/*
* Don't bother with rename by filehandle unless file is busy and
@@ -2579,13 +2608,15 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry)
dentry, cifs_get_time(dentry), jiffies);
again:
- if (cifs_sb_master_tcon(CIFS_SB(sb))->posix_extensions)
- rc = smb311_posix_get_inode_info(&inode, full_path, sb, xid);
- else if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext)
+ if (cifs_sb_master_tcon(CIFS_SB(sb))->posix_extensions) {
+ rc = smb311_posix_get_inode_info(&inode, full_path,
+ NULL, sb, xid);
+ } else if (cifs_sb_master_tcon(CIFS_SB(sb))->unix_ext) {
rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
- else
+ } else {
rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
xid, NULL);
+ }
if (rc == -EAGAIN && count++ < 10)
goto again;
out:
@@ -2827,7 +2858,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs,
set_size_out:
if (rc == 0) {
- cifsInode->server_eof = attrs->ia_size;
+ netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true);
cifs_setsize(inode, attrs->ia_size);
/*
* i_blocks is not related to (i_size / i_blksize), but instead
@@ -2980,6 +3011,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs)
if ((attrs->ia_valid & ATTR_SIZE) &&
attrs->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attrs->ia_size);
+ netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true);
fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size);
}
@@ -3179,6 +3211,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs)
if ((attrs->ia_valid & ATTR_SIZE) &&
attrs->ia_size != i_size_read(inode)) {
truncate_setsize(inode, attrs->ia_size);
+ netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true);
fscache_resize_cookie(cifs_inode_cookie(inode), attrs->ia_size);
}
diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c
index a1da50e66fbb..d86da949a919 100644
--- a/fs/smb/client/link.c
+++ b/fs/smb/client/link.c
@@ -510,8 +510,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode,
rc = -ENOSYS;
goto cifs_hl_exit;
}
- rc = server->ops->create_hardlink(xid, tcon, from_name, to_name,
- cifs_sb);
+ rc = server->ops->create_hardlink(xid, tcon, old_file,
+ from_name, to_name, cifs_sb);
if ((rc == -EIO) || (rc == -EINVAL))
rc = -EOPNOTSUPP;
}
@@ -569,6 +569,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
int rc = -EOPNOTSUPP;
unsigned int xid;
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct TCP_Server_Info *server;
struct tcon_link *tlink;
struct cifs_tcon *pTcon;
const char *full_path;
@@ -590,6 +591,7 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
goto symlink_exit;
}
pTcon = tlink_tcon(tlink);
+ server = cifs_pick_channel(pTcon->ses);
full_path = build_path_from_dentry(direntry, page);
if (IS_ERR(full_path)) {
@@ -601,27 +603,32 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode,
cifs_dbg(FYI, "symname is %s\n", symname);
/* BB what if DFS and this volume is on different share? BB */
- if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS)
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) {
rc = create_mf_symlink(xid, pTcon, cifs_sb, full_path, symname);
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
- else if (pTcon->unix_ext)
+ } else if (pTcon->unix_ext) {
rc = CIFSUnixCreateSymLink(xid, pTcon, full_path, symname,
cifs_sb->local_nls,
cifs_remap(cifs_sb));
#endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */
- /* else
- rc = CIFSCreateReparseSymLink(xid, pTcon, fromName, toName,
- cifs_sb_target->local_nls); */
+ } else if (server->ops->create_reparse_symlink) {
+ rc = server->ops->create_reparse_symlink(xid, inode, direntry,
+ pTcon, full_path,
+ symname);
+ goto symlink_exit;
+ }
if (rc == 0) {
- if (pTcon->posix_extensions)
- rc = smb311_posix_get_inode_info(&newinode, full_path, inode->i_sb, xid);
- else if (pTcon->unix_ext)
+ if (pTcon->posix_extensions) {
+ rc = smb311_posix_get_inode_info(&newinode, full_path,
+ NULL, inode->i_sb, xid);
+ } else if (pTcon->unix_ext) {
rc = cifs_get_inode_info_unix(&newinode, full_path,
inode->i_sb, xid);
- else
+ } else {
rc = cifs_get_inode_info(&newinode, full_path, NULL,
inode->i_sb, xid, NULL);
+ }
if (rc != 0) {
cifs_dbg(FYI, "Create symlink ok, getinodeinfo fail rc = %d\n",
diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c
index c2137ea3c253..0748d7b757b9 100644
--- a/fs/smb/client/misc.c
+++ b/fs/smb/client/misc.c
@@ -140,6 +140,7 @@ tcon_info_alloc(bool dir_leases_enabled)
spin_lock_init(&ret_buf->stat_lock);
atomic_set(&ret_buf->num_local_opens, 0);
atomic_set(&ret_buf->num_remote_opens, 0);
+ ret_buf->stats_from_time = ktime_get_real_seconds();
#ifdef CONFIG_CIFS_DFS_UPCALL
INIT_LIST_HEAD(&ret_buf->dfs_ses_list);
#endif
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index a6968573b775..4a517b280f2b 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -168,6 +168,21 @@ static char *automount_fullpath(struct dentry *dentry, void *page)
return s;
}
+static void fs_context_set_ids(struct smb3_fs_context *ctx)
+{
+ kuid_t uid = current_fsuid();
+ kgid_t gid = current_fsgid();
+
+ if (ctx->multiuser) {
+ if (!ctx->uid_specified)
+ ctx->linux_uid = uid;
+ if (!ctx->gid_specified)
+ ctx->linux_gid = gid;
+ }
+ if (!ctx->cruid_specified)
+ ctx->cred_uid = uid;
+}
+
/*
* Create a vfsmount that we can automount
*/
@@ -205,6 +220,7 @@ static struct vfsmount *cifs_do_automount(struct path *path)
tmp.leaf_fullpath = NULL;
tmp.UNC = tmp.prepath = NULL;
tmp.dfs_root_ses = NULL;
+ fs_context_set_ids(&tmp);
rc = smb3_fs_context_dup(ctx, &tmp);
if (rc) {
diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c
index d30ea2005eb3..b520eea7bfce 100644
--- a/fs/smb/client/readdir.c
+++ b/fs/smb/client/readdir.c
@@ -56,6 +56,23 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
#endif /* DEBUG2 */
/*
+ * Match a reparse point inode if reparse tag and ctime haven't changed.
+ *
+ * Windows Server updates ctime of reparse points when their data have changed.
+ * The server doesn't allow changing reparse tags from existing reparse points,
+ * though it's worth checking.
+ */
+static inline bool reparse_inode_match(struct inode *inode,
+ struct cifs_fattr *fattr)
+{
+ struct timespec64 ctime = inode_get_ctime(inode);
+
+ return (CIFS_I(inode)->cifsAttrs & ATTR_REPARSE) &&
+ CIFS_I(inode)->reparse_tag == fattr->cf_cifstag &&
+ timespec64_equal(&ctime, &fattr->cf_ctime);
+}
+
+/*
* Attempt to preload the dcache with the results from the FIND_FIRST/NEXT
*
* Find the dentry that matches "name". If there isn't one, create one. If it's
@@ -71,6 +88,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
struct super_block *sb = parent->d_sb;
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
+ int rc;
cifs_dbg(FYI, "%s: for %s\n", __func__, name->name);
@@ -82,9 +100,11 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
* We'll end up doing an on the wire call either way and
* this spares us an invalidation.
*/
- if (fattr->cf_flags & CIFS_FATTR_NEED_REVAL)
- return;
retry:
+ if ((fattr->cf_cifsattrs & ATTR_REPARSE) ||
+ (fattr->cf_flags & CIFS_FATTR_NEED_REVAL))
+ return;
+
dentry = d_alloc_parallel(parent, name, &wq);
}
if (IS_ERR(dentry))
@@ -104,12 +124,34 @@ retry:
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
fattr->cf_uniqueid = CIFS_I(inode)->uniqueid;
- /* update inode in place
- * if both i_ino and i_mode didn't change */
- if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid &&
- cifs_fattr_to_inode(inode, fattr) == 0) {
- dput(dentry);
- return;
+ /*
+ * Update inode in place if both i_ino and i_mode didn't
+ * change.
+ */
+ if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
+ /*
+ * Query dir responses don't provide enough
+ * information about reparse points other than
+ * their reparse tags. Save an invalidation by
+ * not clobbering some existing attributes when
+ * reparse tag and ctime haven't changed.
+ */
+ rc = 0;
+ if (fattr->cf_cifsattrs & ATTR_REPARSE) {
+ if (likely(reparse_inode_match(inode, fattr))) {
+ fattr->cf_mode = inode->i_mode;
+ fattr->cf_rdev = inode->i_rdev;
+ fattr->cf_eof = CIFS_I(inode)->netfs.remote_i_size;
+ fattr->cf_symlink_target = NULL;
+ } else {
+ CIFS_I(inode)->time = 0;
+ rc = -ESTALE;
+ }
+ }
+ if (!rc && !cifs_fattr_to_inode(inode, fattr)) {
+ dput(dentry);
+ return;
+ }
}
}
d_invalidate(dentry);
@@ -127,29 +169,6 @@ retry:
dput(dentry);
}
-static bool reparse_file_needs_reval(const struct cifs_fattr *fattr)
-{
- if (!(fattr->cf_cifsattrs & ATTR_REPARSE))
- return false;
- /*
- * The DFS tags should be only intepreted by server side as per
- * MS-FSCC 2.1.2.1, but let's include them anyway.
- *
- * Besides, if cf_cifstag is unset (0), then we still need it to be
- * revalidated to know exactly what reparse point it is.
- */
- switch (fattr->cf_cifstag) {
- case IO_REPARSE_TAG_DFS:
- case IO_REPARSE_TAG_DFSR:
- case IO_REPARSE_TAG_SYMLINK:
- case IO_REPARSE_TAG_NFS:
- case IO_REPARSE_TAG_MOUNT_POINT:
- case 0:
- return true;
- }
- return false;
-}
-
static void
cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
{
@@ -181,14 +200,6 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
}
out_reparse:
- /*
- * We need to revalidate it further to make a decision about whether it
- * is a symbolic link, DFS referral or a reparse point with a direct
- * access like junctions, deduplicated files, NFS symlinks.
- */
- if (reparse_file_needs_reval(fattr))
- fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
-
/* non-unix readdir doesn't provide nlink */
fattr->cf_flags |= CIFS_FATTR_UNKNOWN_NLINK;
@@ -269,9 +280,6 @@ cifs_posix_to_fattr(struct cifs_fattr *fattr, struct smb2_posix_info *info,
fattr->cf_dtype = DT_REG;
}
- if (reparse_file_needs_reval(fattr))
- fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
-
sid_to_id(cifs_sb, &parsed.owner, fattr, SIDOWNER);
sid_to_id(cifs_sb, &parsed.group, fattr, SIDGROUP);
}
@@ -299,14 +307,16 @@ cifs_dir_info_to_fattr(struct cifs_fattr *fattr, FILE_DIRECTORY_INFO *info,
}
static void cifs_fulldir_info_to_fattr(struct cifs_fattr *fattr,
- SEARCH_ID_FULL_DIR_INFO *info,
+ const void *info,
struct cifs_sb_info *cifs_sb)
{
+ const FILE_FULL_DIRECTORY_INFO *di = info;
+
__dir_info_to_fattr(fattr, info);
- /* See MS-FSCC 2.4.19 FileIdFullDirectoryInformation */
+ /* See MS-FSCC 2.4.14, 2.4.19 */
if (fattr->cf_cifsattrs & ATTR_REPARSE)
- fattr->cf_cifstag = le32_to_cpu(info->EaSize);
+ fattr->cf_cifstag = le32_to_cpu(di->EaSize);
cifs_fill_common_info(fattr, cifs_sb);
}
@@ -331,38 +341,6 @@ cifs_std_info_to_fattr(struct cifs_fattr *fattr, FIND_FILE_STANDARD_INFO *info,
cifs_fill_common_info(fattr, cifs_sb);
}
-/* BB eventually need to add the following helper function to
- resolve NT_STATUS_STOPPED_ON_SYMLINK return code when
- we try to do FindFirst on (NTFS) directory symlinks */
-/*
-int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb,
- unsigned int xid)
-{
- __u16 fid;
- int len;
- int oplock = 0;
- int rc;
- struct cifs_tcon *ptcon = cifs_sb_tcon(cifs_sb);
- char *tmpbuffer;
-
- rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ,
- OPEN_REPARSE_POINT, &fid, &oplock, NULL,
- cifs_sb->local_nls,
- cifs_remap(cifs_sb);
- if (!rc) {
- tmpbuffer = kmalloc(maxpath);
- rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path,
- tmpbuffer,
- maxpath -1,
- fid,
- cifs_sb->local_nls);
- if (CIFSSMBClose(xid, ptcon, fid)) {
- cifs_dbg(FYI, "Error closing temporary reparsepoint open\n");
- }
- }
-}
- */
-
static int
_initiate_cifs_search(const unsigned int xid, struct file *file,
const char *full_path)
@@ -420,7 +398,7 @@ ffirst_retry:
} else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) {
cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO;
} else /* not srvinos - BB fixme add check for backlevel? */ {
- cifsFile->srch_inf.info_level = SMB_FIND_FILE_DIRECTORY_INFO;
+ cifsFile->srch_inf.info_level = SMB_FIND_FILE_FULL_DIRECTORY_INFO;
}
search_flags = CIFS_SEARCH_CLOSE_AT_END | CIFS_SEARCH_RETURN_RESUME;
@@ -431,13 +409,10 @@ ffirst_retry:
&cifsFile->fid, search_flags,
&cifsFile->srch_inf);
- if (rc == 0)
+ if (rc == 0) {
cifsFile->invalidHandle = false;
- /* BB add following call to handle readdir on new NTFS symlink errors
- else if STATUS_STOPPED_ON_SYMLINK
- call get_symlink_reparse_path and retry with new path */
- else if ((rc == -EOPNOTSUPP) &&
- (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
+ } else if ((rc == -EOPNOTSUPP) &&
+ (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) {
cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SERVER_INUM;
goto ffirst_retry;
}
@@ -672,10 +647,10 @@ static int cifs_entry_is_dot(struct cifs_dirent *de, bool is_unicode)
static int is_dir_changed(struct file *file)
{
struct inode *inode = file_inode(file);
- struct cifsInodeInfo *cifsInfo = CIFS_I(inode);
+ struct cifsInodeInfo *cifs_inode_info = CIFS_I(inode);
- if (cifsInfo->time == 0)
- return 1; /* directory was changed, perhaps due to unlink */
+ if (cifs_inode_info->time == 0)
+ return 1; /* directory was changed, e.g. unlink or new file */
else
return 0;
@@ -1014,10 +989,9 @@ static int cifs_filldir(char *find_entry, struct file *file,
(FIND_FILE_STANDARD_INFO *)find_entry,
cifs_sb);
break;
+ case SMB_FIND_FILE_FULL_DIRECTORY_INFO:
case SMB_FIND_FILE_ID_FULL_DIR_INFO:
- cifs_fulldir_info_to_fattr(&fattr,
- (SEARCH_ID_FULL_DIR_INFO *)find_entry,
- cifs_sb);
+ cifs_fulldir_info_to_fattr(&fattr, find_entry, cifs_sb);
break;
default:
cifs_dir_info_to_fattr(&fattr,
diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c
index 2d3b332a79a1..8f37373fd333 100644
--- a/fs/smb/client/sess.c
+++ b/fs/smb/client/sess.c
@@ -75,6 +75,10 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,
{
unsigned int i;
+ /* if the channel is waiting for termination */
+ if (server && server->terminate)
+ return CIFS_INVAL_CHAN_INDEX;
+
for (i = 0; i < ses->chan_count; i++) {
if (ses->chans[i].server == server)
return i;
@@ -84,7 +88,6 @@ cifs_ses_get_chan_index(struct cifs_ses *ses,
if (server)
cifs_dbg(VFS, "unable to get chan index for server: 0x%llx",
server->conn_id);
- WARN_ON(1);
return CIFS_INVAL_CHAN_INDEX;
}
@@ -269,6 +272,8 @@ int cifs_try_adding_channels(struct cifs_ses *ses)
&iface->sockaddr,
rc);
kref_put(&iface->refcount, release_iface);
+ /* failure to add chan should increase weight */
+ iface->weight_fulfilled++;
continue;
}
@@ -356,10 +361,9 @@ done:
/*
* update the iface for the channel if necessary.
- * will return 0 when iface is updated, 1 if removed, 2 otherwise
* Must be called with chan_lock held.
*/
-int
+void
cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
{
unsigned int chan_index;
@@ -368,20 +372,19 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
struct cifs_server_iface *old_iface = NULL;
struct cifs_server_iface *last_iface = NULL;
struct sockaddr_storage ss;
- int rc = 0;
spin_lock(&ses->chan_lock);
chan_index = cifs_ses_get_chan_index(ses, server);
if (chan_index == CIFS_INVAL_CHAN_INDEX) {
spin_unlock(&ses->chan_lock);
- return 0;
+ return;
}
if (ses->chans[chan_index].iface) {
old_iface = ses->chans[chan_index].iface;
if (old_iface->is_active) {
spin_unlock(&ses->chan_lock);
- return 1;
+ return;
}
}
spin_unlock(&ses->chan_lock);
@@ -394,7 +397,7 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
if (!ses->iface_count) {
spin_unlock(&ses->iface_lock);
cifs_dbg(VFS, "server %s does not advertise interfaces\n", ses->server->hostname);
- return 0;
+ return;
}
last_iface = list_last_entry(&ses->iface_list, struct cifs_server_iface,
@@ -434,16 +437,21 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
}
if (list_entry_is_head(iface, &ses->iface_list, iface_head)) {
- rc = 1;
iface = NULL;
cifs_dbg(FYI, "unable to find a suitable iface\n");
}
if (!iface) {
- cifs_dbg(FYI, "unable to get the interface matching: %pIS\n",
- &ss);
+ if (!chan_index)
+ cifs_dbg(FYI, "unable to get the interface matching: %pIS\n",
+ &ss);
+ else {
+ cifs_dbg(FYI, "unable to find another interface to replace: %pIS\n",
+ &old_iface->sockaddr);
+ }
+
spin_unlock(&ses->iface_lock);
- return 0;
+ return;
}
/* now drop the ref to the current iface */
@@ -459,34 +467,24 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server)
iface->weight_fulfilled++;
kref_put(&old_iface->refcount, release_iface);
- } else if (old_iface) {
- /* if a new candidate is not found, keep things as is */
- cifs_dbg(FYI, "could not replace iface: %pIS\n",
- &old_iface->sockaddr);
} else if (!chan_index) {
/* special case: update interface for primary channel */
- if (iface) {
- cifs_dbg(FYI, "referencing primary channel iface: %pIS\n",
- &iface->sockaddr);
- iface->num_channels++;
- iface->weight_fulfilled++;
- }
+ cifs_dbg(FYI, "referencing primary channel iface: %pIS\n",
+ &iface->sockaddr);
+ iface->num_channels++;
+ iface->weight_fulfilled++;
}
spin_unlock(&ses->iface_lock);
- if (iface) {
- spin_lock(&ses->chan_lock);
- chan_index = cifs_ses_get_chan_index(ses, server);
- if (chan_index == CIFS_INVAL_CHAN_INDEX) {
- spin_unlock(&ses->chan_lock);
- return 0;
- }
-
- ses->chans[chan_index].iface = iface;
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
spin_unlock(&ses->chan_lock);
+ return;
}
- return rc;
+ ses->chans[chan_index].iface = iface;
+ spin_unlock(&ses->chan_lock);
}
/*
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index 82e916ad167c..a0c156996fc5 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -23,17 +23,21 @@
* Identifiers for functions that use the open, operation, close pattern
* in smb2inode.c:smb2_compound_op()
*/
-#define SMB2_OP_SET_DELETE 1
-#define SMB2_OP_SET_INFO 2
-#define SMB2_OP_QUERY_INFO 3
-#define SMB2_OP_QUERY_DIR 4
-#define SMB2_OP_MKDIR 5
-#define SMB2_OP_RENAME 6
-#define SMB2_OP_DELETE 7
-#define SMB2_OP_HARDLINK 8
-#define SMB2_OP_SET_EOF 9
-#define SMB2_OP_RMDIR 10
-#define SMB2_OP_POSIX_QUERY_INFO 11
+enum smb2_compound_ops {
+ SMB2_OP_SET_DELETE = 1,
+ SMB2_OP_SET_INFO,
+ SMB2_OP_QUERY_INFO,
+ SMB2_OP_QUERY_DIR,
+ SMB2_OP_MKDIR,
+ SMB2_OP_RENAME,
+ SMB2_OP_DELETE,
+ SMB2_OP_HARDLINK,
+ SMB2_OP_SET_EOF,
+ SMB2_OP_RMDIR,
+ SMB2_OP_POSIX_QUERY_INFO,
+ SMB2_OP_SET_REPARSE,
+ SMB2_OP_GET_REPARSE
+};
/* Used when constructing chained read requests. */
#define CHAINED_REQUEST 1
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index c94940af5d4b..05818cd6d932 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -26,13 +26,63 @@
#include "cached_dir.h"
#include "smb2status.h"
-static void
-free_set_inf_compound(struct smb_rqst *rqst)
+static struct reparse_data_buffer *reparse_buf_ptr(struct kvec *iov)
{
- if (rqst[1].rq_iov)
- SMB2_set_info_free(&rqst[1]);
- if (rqst[2].rq_iov)
- SMB2_close_free(&rqst[2]);
+ struct reparse_data_buffer *buf;
+ struct smb2_ioctl_rsp *io = iov->iov_base;
+ u32 off, count, len;
+
+ count = le32_to_cpu(io->OutputCount);
+ off = le32_to_cpu(io->OutputOffset);
+ if (check_add_overflow(off, count, &len) || len > iov->iov_len)
+ return ERR_PTR(-EIO);
+
+ buf = (struct reparse_data_buffer *)((u8 *)io + off);
+ len = sizeof(*buf);
+ if (count < len || count < le16_to_cpu(buf->ReparseDataLength) + len)
+ return ERR_PTR(-EIO);
+ return buf;
+}
+
+static inline __u32 file_create_options(struct dentry *dentry)
+{
+ struct cifsInodeInfo *ci;
+
+ if (dentry) {
+ ci = CIFS_I(d_inode(dentry));
+ if (ci->cifsAttrs & ATTR_REPARSE)
+ return OPEN_REPARSE_POINT;
+ }
+ return 0;
+}
+
+/* Parse owner and group from SMB3.1.1 POSIX query info */
+static int parse_posix_sids(struct cifs_open_info_data *data,
+ struct kvec *rsp_iov)
+{
+ struct smb2_query_info_rsp *qi = rsp_iov->iov_base;
+ unsigned int out_len = le32_to_cpu(qi->OutputBufferLength);
+ unsigned int qi_len = sizeof(data->posix_fi);
+ int owner_len, group_len;
+ u8 *sidsbuf, *sidsbuf_end;
+
+ if (out_len <= qi_len)
+ return -EINVAL;
+
+ sidsbuf = (u8 *)qi + le16_to_cpu(qi->OutputBufferOffset) + qi_len;
+ sidsbuf_end = sidsbuf + out_len - qi_len;
+
+ owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end);
+ if (owner_len == -1)
+ return -EINVAL;
+
+ memcpy(&data->posix_owner, sidsbuf, owner_len);
+ group_len = posix_info_sid_size(sidsbuf + owner_len, sidsbuf_end);
+ if (group_len == -1)
+ return -EINVAL;
+
+ memcpy(&data->posix_group, sidsbuf + owner_len, group_len);
+ return 0;
}
/*
@@ -45,13 +95,15 @@ free_set_inf_compound(struct smb_rqst *rqst)
*/
static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *full_path,
- __u32 desired_access, __u32 create_disposition, __u32 create_options,
- umode_t mode, void *ptr, int command, struct cifsFileInfo *cfile,
- __u8 **extbuf, size_t *extbuflen,
+ __u32 desired_access, __u32 create_disposition,
+ __u32 create_options, umode_t mode, struct kvec *in_iov,
+ int *cmds, int num_cmds, struct cifsFileInfo *cfile,
struct kvec *out_iov, int *out_buftype)
{
+
+ struct reparse_data_buffer *rbuf;
struct smb2_compound_vars *vars = NULL;
- struct kvec *rsp_iov;
+ struct kvec *rsp_iov, *iov;
struct smb_rqst *rqst;
int rc;
__le16 *utf16_path = NULL;
@@ -59,8 +111,8 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_fid fid;
struct cifs_ses *ses = tcon->ses;
struct TCP_Server_Info *server;
- int num_rqst = 0;
- int resp_buftype[3];
+ int num_rqst = 0, i;
+ int resp_buftype[MAX_COMPOUND];
struct smb2_query_info_rsp *qi_rsp = NULL;
struct cifs_open_info_data *idata;
int flags = 0;
@@ -68,6 +120,14 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int size[2];
void *data[2];
int len;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ num_rqst = 0;
+ server = cifs_pick_channel(ses);
vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
if (vars == NULL)
@@ -75,12 +135,11 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
rqst = &vars->rqst[0];
rsp_iov = &vars->rsp_iov[0];
- server = cifs_pick_channel(ses);
-
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
+ for (i = 0; i < ARRAY_SIZE(resp_buftype); i++)
+ resp_buftype[i] = CIFS_NO_BUFFER;
/* We already have a handle so we can skip the open */
if (cfile)
@@ -118,242 +177,277 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
num_rqst++;
rc = 0;
- /* Operation */
- switch (command) {
- case SMB2_OP_QUERY_INFO:
- rqst[num_rqst].rq_iov = &vars->qi_iov;
- rqst[num_rqst].rq_nvec = 1;
-
- if (cfile)
- rc = SMB2_query_info_init(tcon, server,
- &rqst[num_rqst],
- cfile->fid.persistent_fid,
- cfile->fid.volatile_fid,
- FILE_ALL_INFORMATION,
- SMB2_O_INFO_FILE, 0,
- sizeof(struct smb2_file_all_info) +
- PATH_MAX * 2, 0, NULL);
- else {
- rc = SMB2_query_info_init(tcon, server,
- &rqst[num_rqst],
- COMPOUND_FID,
- COMPOUND_FID,
- FILE_ALL_INFORMATION,
- SMB2_O_INFO_FILE, 0,
- sizeof(struct smb2_file_all_info) +
- PATH_MAX * 2, 0, NULL);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
+ for (i = 0; i < num_cmds; i++) {
+ /* Operation */
+ switch (cmds[i]) {
+ case SMB2_OP_QUERY_INFO:
+ rqst[num_rqst].rq_iov = &vars->qi_iov;
+ rqst[num_rqst].rq_nvec = 1;
+
+ if (cfile) {
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ FILE_ALL_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) +
+ PATH_MAX * 2, 0, NULL);
+ } else {
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ FILE_ALL_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb2_file_all_info) +
+ PATH_MAX * 2, 0, NULL);
+ if (!rc) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ }
}
- }
- if (rc)
- goto finished;
- num_rqst++;
- trace_smb3_query_info_compound_enter(xid, ses->Suid, tcon->tid,
- full_path);
- break;
- case SMB2_OP_POSIX_QUERY_INFO:
- rqst[num_rqst].rq_iov = &vars->qi_iov;
- rqst[num_rqst].rq_nvec = 1;
-
- if (cfile)
- rc = SMB2_query_info_init(tcon, server,
- &rqst[num_rqst],
- cfile->fid.persistent_fid,
- cfile->fid.volatile_fid,
- SMB_FIND_FILE_POSIX_INFO,
- SMB2_O_INFO_FILE, 0,
+ if (rc)
+ goto finished;
+ num_rqst++;
+ trace_smb3_query_info_compound_enter(xid, ses->Suid,
+ tcon->tid, full_path);
+ break;
+ case SMB2_OP_POSIX_QUERY_INFO:
+ rqst[num_rqst].rq_iov = &vars->qi_iov;
+ rqst[num_rqst].rq_nvec = 1;
+
+ if (cfile) {
/* TBD: fix following to allow for longer SIDs */
- sizeof(struct smb311_posix_qinfo *) + (PATH_MAX * 2) +
- (sizeof(struct cifs_sid) * 2), 0, NULL);
- else {
- rc = SMB2_query_info_init(tcon, server,
- &rqst[num_rqst],
- COMPOUND_FID,
- COMPOUND_FID,
- SMB_FIND_FILE_POSIX_INFO,
- SMB2_O_INFO_FILE, 0,
- sizeof(struct smb311_posix_qinfo *) + (PATH_MAX * 2) +
- (sizeof(struct cifs_sid) * 2), 0, NULL);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ SMB_FIND_FILE_POSIX_INFO,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb311_posix_qinfo *) +
+ (PATH_MAX * 2) +
+ (sizeof(struct cifs_sid) * 2), 0, NULL);
+ } else {
+ rc = SMB2_query_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ SMB_FIND_FILE_POSIX_INFO,
+ SMB2_O_INFO_FILE, 0,
+ sizeof(struct smb311_posix_qinfo *) +
+ (PATH_MAX * 2) +
+ (sizeof(struct cifs_sid) * 2), 0, NULL);
+ if (!rc) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ }
}
- }
- if (rc)
- goto finished;
- num_rqst++;
- trace_smb3_posix_query_info_compound_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- case SMB2_OP_DELETE:
- trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- case SMB2_OP_MKDIR:
- /*
- * Directories are created through parameters in the
- * SMB2_open() call.
- */
- trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- case SMB2_OP_RMDIR:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
- rqst[num_rqst].rq_nvec = 1;
-
- size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
- data[0] = &delete_pending[0];
-
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst], COMPOUND_FID,
- COMPOUND_FID, current->tgid,
- FILE_DISPOSITION_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- if (rc)
- goto finished;
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst++]);
- trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- case SMB2_OP_SET_EOF:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
- rqst[num_rqst].rq_nvec = 1;
+ if (rc)
+ goto finished;
+ num_rqst++;
+ trace_smb3_posix_query_info_compound_enter(xid, ses->Suid,
+ tcon->tid, full_path);
+ break;
+ case SMB2_OP_DELETE:
+ trace_smb3_delete_enter(xid, ses->Suid, tcon->tid, full_path);
+ break;
+ case SMB2_OP_MKDIR:
+ /*
+ * Directories are created through parameters in the
+ * SMB2_open() call.
+ */
+ trace_smb3_mkdir_enter(xid, ses->Suid, tcon->tid, full_path);
+ break;
+ case SMB2_OP_RMDIR:
+ rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_nvec = 1;
- size[0] = 8; /* sizeof __le64 */
- data[0] = ptr;
+ size[0] = 1; /* sizeof __u8 See MS-FSCC section 2.4.11 */
+ data[0] = &delete_pending[0];
- if (cfile) {
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst],
- cfile->fid.persistent_fid,
- cfile->fid.volatile_fid,
- current->tgid,
- FILE_END_OF_FILE_INFORMATION,
- SMB2_O_INFO_FILE, 0,
- data, size);
- } else {
rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst],
- COMPOUND_FID,
- COMPOUND_FID,
- current->tgid,
- FILE_END_OF_FILE_INFORMATION,
- SMB2_O_INFO_FILE, 0,
- data, size);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
+ &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_DISPOSITION_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ if (rc)
+ goto finished;
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_rmdir_enter(xid, ses->Suid, tcon->tid, full_path);
+ break;
+ case SMB2_OP_SET_EOF:
+ rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_nvec = 1;
+
+ size[0] = in_iov[i].iov_len;
+ data[0] = in_iov[i].iov_base;
+
+ if (cfile) {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ current->tgid,
+ FILE_END_OF_FILE_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ } else {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID,
+ current->tgid,
+ FILE_END_OF_FILE_INFORMATION,
+ SMB2_O_INFO_FILE, 0,
+ data, size);
+ if (!rc) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ }
}
- }
- if (rc)
- goto finished;
- num_rqst++;
- trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- case SMB2_OP_SET_INFO:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
- rqst[num_rqst].rq_nvec = 1;
-
-
- size[0] = sizeof(FILE_BASIC_INFO);
- data[0] = ptr;
-
- if (cfile)
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst],
- cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, current->tgid,
- FILE_BASIC_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- else {
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst],
- COMPOUND_FID,
- COMPOUND_FID, current->tgid,
- FILE_BASIC_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
+ if (rc)
+ goto finished;
+ num_rqst++;
+ trace_smb3_set_eof_enter(xid, ses->Suid, tcon->tid, full_path);
+ break;
+ case SMB2_OP_SET_INFO:
+ rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_nvec = 1;
+
+ size[0] = in_iov[i].iov_len;
+ data[0] = in_iov[i].iov_base;
+
+ if (cfile) {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid, current->tgid,
+ FILE_BASIC_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ } else {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_BASIC_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ if (!rc) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ }
}
- }
- if (rc)
- goto finished;
- num_rqst++;
- trace_smb3_set_info_compound_enter(xid, ses->Suid, tcon->tid,
- full_path);
- break;
- case SMB2_OP_RENAME:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
- rqst[num_rqst].rq_nvec = 2;
+ if (rc)
+ goto finished;
+ num_rqst++;
+ trace_smb3_set_info_compound_enter(xid, ses->Suid,
+ tcon->tid, full_path);
+ break;
+ case SMB2_OP_RENAME:
+ rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_nvec = 2;
- len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX));
+ len = in_iov[i].iov_len;
- vars->rename_info.ReplaceIfExists = 1;
- vars->rename_info.RootDirectory = 0;
- vars->rename_info.FileNameLength = cpu_to_le32(len);
+ vars->rename_info.ReplaceIfExists = 1;
+ vars->rename_info.RootDirectory = 0;
+ vars->rename_info.FileNameLength = cpu_to_le32(len);
- size[0] = sizeof(struct smb2_file_rename_info);
- data[0] = &vars->rename_info;
+ size[0] = sizeof(struct smb2_file_rename_info);
+ data[0] = &vars->rename_info;
- size[1] = len + 2 /* null */;
- data[1] = (__le16 *)ptr;
+ size[1] = len + 2 /* null */;
+ data[1] = in_iov[i].iov_base;
- if (cfile)
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst],
- cfile->fid.persistent_fid,
- cfile->fid.volatile_fid,
- current->tgid, FILE_RENAME_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- else {
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst],
- COMPOUND_FID, COMPOUND_FID,
- current->tgid, FILE_RENAME_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- if (!rc) {
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst]);
+ if (cfile) {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ cfile->fid.persistent_fid,
+ cfile->fid.volatile_fid,
+ current->tgid, FILE_RENAME_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ } else {
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst],
+ COMPOUND_FID, COMPOUND_FID,
+ current->tgid, FILE_RENAME_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ if (!rc) {
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst]);
+ }
}
- }
- if (rc)
- goto finished;
- num_rqst++;
- trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- case SMB2_OP_HARDLINK:
- rqst[num_rqst].rq_iov = &vars->si_iov[0];
- rqst[num_rqst].rq_nvec = 2;
+ if (rc)
+ goto finished;
+ num_rqst++;
+ trace_smb3_rename_enter(xid, ses->Suid, tcon->tid, full_path);
+ break;
+ case SMB2_OP_HARDLINK:
+ rqst[num_rqst].rq_iov = &vars->si_iov[0];
+ rqst[num_rqst].rq_nvec = 2;
- len = (2 * UniStrnlen((wchar_t *)ptr, PATH_MAX));
+ len = in_iov[i].iov_len;
- vars->link_info.ReplaceIfExists = 0;
- vars->link_info.RootDirectory = 0;
- vars->link_info.FileNameLength = cpu_to_le32(len);
+ vars->link_info.ReplaceIfExists = 0;
+ vars->link_info.RootDirectory = 0;
+ vars->link_info.FileNameLength = cpu_to_le32(len);
- size[0] = sizeof(struct smb2_file_link_info);
- data[0] = &vars->link_info;
+ size[0] = sizeof(struct smb2_file_link_info);
+ data[0] = &vars->link_info;
- size[1] = len + 2 /* null */;
- data[1] = (__le16 *)ptr;
+ size[1] = len + 2 /* null */;
+ data[1] = in_iov[i].iov_base;
- rc = SMB2_set_info_init(tcon, server,
- &rqst[num_rqst], COMPOUND_FID,
- COMPOUND_FID, current->tgid,
- FILE_LINK_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
- if (rc)
- goto finished;
- smb2_set_next_command(tcon, &rqst[num_rqst]);
- smb2_set_related(&rqst[num_rqst++]);
- trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path);
- break;
- default:
- cifs_dbg(VFS, "Invalid command\n");
- rc = -EINVAL;
+ rc = SMB2_set_info_init(tcon, server,
+ &rqst[num_rqst], COMPOUND_FID,
+ COMPOUND_FID, current->tgid,
+ FILE_LINK_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ if (rc)
+ goto finished;
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_hardlink_enter(xid, ses->Suid, tcon->tid, full_path);
+ break;
+ case SMB2_OP_SET_REPARSE:
+ rqst[num_rqst].rq_iov = vars->io_iov;
+ rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov);
+
+ rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+ COMPOUND_FID, COMPOUND_FID,
+ FSCTL_SET_REPARSE_POINT,
+ in_iov[i].iov_base,
+ in_iov[i].iov_len, 0);
+ if (rc)
+ goto finished;
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_set_reparse_compound_enter(xid, ses->Suid,
+ tcon->tid, full_path);
+ break;
+ case SMB2_OP_GET_REPARSE:
+ rqst[num_rqst].rq_iov = vars->io_iov;
+ rqst[num_rqst].rq_nvec = ARRAY_SIZE(vars->io_iov);
+
+ rc = SMB2_ioctl_init(tcon, server, &rqst[num_rqst],
+ COMPOUND_FID, COMPOUND_FID,
+ FSCTL_GET_REPARSE_POINT,
+ NULL, 0, CIFSMaxBufSize);
+ if (rc)
+ goto finished;
+ smb2_set_next_command(tcon, &rqst[num_rqst]);
+ smb2_set_related(&rqst[num_rqst++]);
+ trace_smb3_get_reparse_compound_enter(xid, ses->Suid,
+ tcon->tid, full_path);
+ break;
+ default:
+ cifs_dbg(VFS, "Invalid command\n");
+ rc = -EINVAL;
+ }
}
if (rc)
goto finished;
@@ -375,157 +469,191 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
num_rqst++;
if (cfile) {
+ if (retries)
+ for (i = 1; i < num_rqst - 2; i++)
+ smb2_set_replay(server, &rqst[i]);
+
rc = compound_send_recv(xid, ses, server,
flags, num_rqst - 2,
&rqst[1], &resp_buftype[1],
&rsp_iov[1]);
- } else
+ } else {
+ if (retries)
+ for (i = 0; i < num_rqst; i++)
+ smb2_set_replay(server, &rqst[i]);
+
rc = compound_send_recv(xid, ses, server,
flags, num_rqst,
rqst, resp_buftype,
rsp_iov);
+ }
- finished:
- SMB2_open_free(&rqst[0]);
+finished:
+ num_rqst = 0;
+ SMB2_open_free(&rqst[num_rqst++]);
if (rc == -EREMCHG) {
pr_warn_once("server share %s deleted\n", tcon->tree_name);
tcon->need_reconnect = true;
}
- switch (command) {
- case SMB2_OP_QUERY_INFO:
- idata = ptr;
- if (rc == 0 && cfile && cfile->symlink_target) {
- idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
- if (!idata->symlink_target)
- rc = -ENOMEM;
- }
- if (rc == 0) {
- qi_rsp = (struct smb2_query_info_rsp *)
- rsp_iov[1].iov_base;
- rc = smb2_validate_and_copy_iov(
- le16_to_cpu(qi_rsp->OutputBufferOffset),
- le32_to_cpu(qi_rsp->OutputBufferLength),
- &rsp_iov[1], sizeof(idata->fi), (char *)&idata->fi);
- }
- if (rqst[1].rq_iov)
- SMB2_query_info_free(&rqst[1]);
- if (rqst[2].rq_iov)
- SMB2_close_free(&rqst[2]);
- if (rc)
- trace_smb3_query_info_compound_err(xid, ses->Suid,
- tcon->tid, rc);
- else
- trace_smb3_query_info_compound_done(xid, ses->Suid,
- tcon->tid);
- break;
- case SMB2_OP_POSIX_QUERY_INFO:
- idata = ptr;
- if (rc == 0 && cfile && cfile->symlink_target) {
- idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
- if (!idata->symlink_target)
- rc = -ENOMEM;
- }
- if (rc == 0) {
- qi_rsp = (struct smb2_query_info_rsp *)
- rsp_iov[1].iov_base;
- rc = smb2_validate_and_copy_iov(
- le16_to_cpu(qi_rsp->OutputBufferOffset),
- le32_to_cpu(qi_rsp->OutputBufferLength),
- &rsp_iov[1], sizeof(idata->posix_fi) /* add SIDs */,
- (char *)&idata->posix_fi);
- }
- if (rc == 0) {
- unsigned int length = le32_to_cpu(qi_rsp->OutputBufferLength);
-
- if (length > sizeof(idata->posix_fi)) {
- char *base = (char *)rsp_iov[1].iov_base +
- le16_to_cpu(qi_rsp->OutputBufferOffset) +
- sizeof(idata->posix_fi);
- *extbuflen = length - sizeof(idata->posix_fi);
- *extbuf = kmemdup(base, *extbuflen, GFP_KERNEL);
- if (!*extbuf)
+ for (i = 0; i < num_cmds; i++) {
+ switch (cmds[i]) {
+ case SMB2_OP_QUERY_INFO:
+ idata = in_iov[i].iov_base;
+ if (rc == 0 && cfile && cfile->symlink_target) {
+ idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!idata->symlink_target)
rc = -ENOMEM;
+ }
+ if (rc == 0) {
+ qi_rsp = (struct smb2_query_info_rsp *)
+ rsp_iov[i + 1].iov_base;
+ rc = smb2_validate_and_copy_iov(
+ le16_to_cpu(qi_rsp->OutputBufferOffset),
+ le32_to_cpu(qi_rsp->OutputBufferLength),
+ &rsp_iov[i + 1], sizeof(idata->fi), (char *)&idata->fi);
+ }
+ SMB2_query_info_free(&rqst[num_rqst++]);
+ if (rc)
+ trace_smb3_query_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_query_info_compound_done(xid, ses->Suid,
+ tcon->tid);
+ break;
+ case SMB2_OP_POSIX_QUERY_INFO:
+ idata = in_iov[i].iov_base;
+ if (rc == 0 && cfile && cfile->symlink_target) {
+ idata->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL);
+ if (!idata->symlink_target)
+ rc = -ENOMEM;
+ }
+ if (rc == 0) {
+ qi_rsp = (struct smb2_query_info_rsp *)
+ rsp_iov[i + 1].iov_base;
+ rc = smb2_validate_and_copy_iov(
+ le16_to_cpu(qi_rsp->OutputBufferOffset),
+ le32_to_cpu(qi_rsp->OutputBufferLength),
+ &rsp_iov[i + 1], sizeof(idata->posix_fi) /* add SIDs */,
+ (char *)&idata->posix_fi);
+ }
+ if (rc == 0)
+ rc = parse_posix_sids(idata, &rsp_iov[i + 1]);
+
+ SMB2_query_info_free(&rqst[num_rqst++]);
+ if (rc)
+ trace_smb3_posix_query_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_posix_query_info_compound_done(xid, ses->Suid,
+ tcon->tid);
+ break;
+ case SMB2_OP_DELETE:
+ if (rc)
+ trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_delete_done(xid, ses->Suid, tcon->tid);
+ break;
+ case SMB2_OP_MKDIR:
+ if (rc)
+ trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid);
+ break;
+ case SMB2_OP_HARDLINK:
+ if (rc)
+ trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid);
+ SMB2_set_info_free(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_RENAME:
+ if (rc)
+ trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_rename_done(xid, ses->Suid, tcon->tid);
+ SMB2_set_info_free(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_RMDIR:
+ if (rc)
+ trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid);
+ SMB2_set_info_free(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_SET_EOF:
+ if (rc)
+ trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc);
+ else
+ trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid);
+ SMB2_set_info_free(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_SET_INFO:
+ if (rc)
+ trace_smb3_set_info_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ else
+ trace_smb3_set_info_compound_done(xid, ses->Suid,
+ tcon->tid);
+ SMB2_set_info_free(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_SET_REPARSE:
+ if (rc) {
+ trace_smb3_set_reparse_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ } else {
+ trace_smb3_set_reparse_compound_done(xid, ses->Suid,
+ tcon->tid);
+ }
+ SMB2_ioctl_free(&rqst[num_rqst++]);
+ break;
+ case SMB2_OP_GET_REPARSE:
+ if (!rc) {
+ iov = &rsp_iov[i + 1];
+ idata = in_iov[i].iov_base;
+ idata->reparse.io.iov = *iov;
+ idata->reparse.io.buftype = resp_buftype[i + 1];
+ rbuf = reparse_buf_ptr(iov);
+ if (IS_ERR(rbuf)) {
+ rc = PTR_ERR(rbuf);
+ trace_smb3_set_reparse_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
+ } else {
+ idata->reparse.tag = le32_to_cpu(rbuf->ReparseTag);
+ trace_smb3_set_reparse_compound_done(xid, ses->Suid,
+ tcon->tid);
+ }
+ memset(iov, 0, sizeof(*iov));
+ resp_buftype[i + 1] = CIFS_NO_BUFFER;
} else {
- rc = -EINVAL;
+ trace_smb3_set_reparse_compound_err(xid, ses->Suid,
+ tcon->tid, rc);
}
+ SMB2_ioctl_free(&rqst[num_rqst++]);
+ break;
}
- if (rqst[1].rq_iov)
- SMB2_query_info_free(&rqst[1]);
- if (rqst[2].rq_iov)
- SMB2_close_free(&rqst[2]);
- if (rc)
- trace_smb3_posix_query_info_compound_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_posix_query_info_compound_done(xid, ses->Suid, tcon->tid);
- break;
- case SMB2_OP_DELETE:
- if (rc)
- trace_smb3_delete_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_delete_done(xid, ses->Suid, tcon->tid);
- if (rqst[1].rq_iov)
- SMB2_close_free(&rqst[1]);
- break;
- case SMB2_OP_MKDIR:
- if (rc)
- trace_smb3_mkdir_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_mkdir_done(xid, ses->Suid, tcon->tid);
- if (rqst[1].rq_iov)
- SMB2_close_free(&rqst[1]);
- break;
- case SMB2_OP_HARDLINK:
- if (rc)
- trace_smb3_hardlink_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_hardlink_done(xid, ses->Suid, tcon->tid);
- free_set_inf_compound(rqst);
- break;
- case SMB2_OP_RENAME:
- if (rc)
- trace_smb3_rename_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_rename_done(xid, ses->Suid, tcon->tid);
- free_set_inf_compound(rqst);
- break;
- case SMB2_OP_RMDIR:
- if (rc)
- trace_smb3_rmdir_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_rmdir_done(xid, ses->Suid, tcon->tid);
- free_set_inf_compound(rqst);
- break;
- case SMB2_OP_SET_EOF:
- if (rc)
- trace_smb3_set_eof_err(xid, ses->Suid, tcon->tid, rc);
- else
- trace_smb3_set_eof_done(xid, ses->Suid, tcon->tid);
- free_set_inf_compound(rqst);
- break;
- case SMB2_OP_SET_INFO:
- if (rc)
- trace_smb3_set_info_compound_err(xid, ses->Suid,
- tcon->tid, rc);
- else
- trace_smb3_set_info_compound_done(xid, ses->Suid,
- tcon->tid);
- free_set_inf_compound(rqst);
- break;
}
+ SMB2_close_free(&rqst[num_rqst]);
- if (cfile)
- cifsFileInfo_put(cfile);
-
+ num_cmds += 2;
if (out_iov && out_buftype) {
- memcpy(out_iov, rsp_iov, 3 * sizeof(*out_iov));
- memcpy(out_buftype, resp_buftype, 3 * sizeof(*out_buftype));
+ memcpy(out_iov, rsp_iov, num_cmds * sizeof(*out_iov));
+ memcpy(out_buftype, resp_buftype,
+ num_cmds * sizeof(*out_buftype));
} else {
- free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
- free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
- free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
+ for (i = 0; i < num_cmds; i++)
+ free_rsp_buf(resp_buftype[i], rsp_iov[i].iov_base);
}
+ num_cmds -= 2; /* correct num_cmds as there could be a retry */
kfree(vars);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
+ if (cfile)
+ cifsFileInfo_put(cfile);
+
return rc;
}
@@ -569,34 +697,57 @@ int smb2_query_path_info(const unsigned int xid,
struct cifsFileInfo *cfile;
struct cached_fid *cfid = NULL;
struct smb2_hdr *hdr;
- struct kvec out_iov[3] = {};
+ struct kvec in_iov[2], out_iov[3] = {};
int out_buftype[3] = {};
+ int cmds[2];
bool islink;
+ int i, num_cmds;
int rc, rc2;
data->adjust_tz = false;
data->reparse_point = false;
- if (strcmp(full_path, ""))
- rc = -ENOENT;
- else
- rc = open_cached_dir(xid, tcon, full_path, cifs_sb, false, &cfid);
- /* If it is a root and its handle is cached then use it */
- if (!rc) {
- if (cfid->file_all_info_is_valid) {
- memcpy(&data->fi, &cfid->file_all_info, sizeof(data->fi));
+ /*
+ * BB TODO: Add support for using cached root handle in SMB3.1.1 POSIX.
+ * Create SMB2_query_posix_info worker function to do non-compounded
+ * query when we already have an open file handle for this. For now this
+ * is fast enough (always using the compounded version).
+ */
+ if (!tcon->posix_extensions) {
+ if (*full_path) {
+ rc = -ENOENT;
} else {
- rc = SMB2_query_info(xid, tcon, cfid->fid.persistent_fid,
- cfid->fid.volatile_fid, &data->fi);
+ rc = open_cached_dir(xid, tcon, full_path,
+ cifs_sb, false, &cfid);
}
- close_cached_dir(cfid);
- return rc;
+ /* If it is a root and its handle is cached then use it */
+ if (!rc) {
+ if (cfid->file_all_info_is_valid) {
+ memcpy(&data->fi, &cfid->file_all_info,
+ sizeof(data->fi));
+ } else {
+ rc = SMB2_query_info(xid, tcon,
+ cfid->fid.persistent_fid,
+ cfid->fid.volatile_fid,
+ &data->fi);
+ }
+ close_cached_dir(cfid);
+ return rc;
+ }
+ cmds[0] = SMB2_OP_QUERY_INFO;
+ } else {
+ cmds[0] = SMB2_OP_POSIX_QUERY_INFO;
}
+ in_iov[0].iov_base = data;
+ in_iov[0].iov_len = sizeof(*data);
+ in_iov[1] = in_iov[0];
+
cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, data, SMB2_OP_QUERY_INFO, cfile,
- NULL, NULL, out_iov, out_buftype);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN,
+ create_options, ACL_NO_MODE, in_iov,
+ cmds, 1, cfile, out_iov, out_buftype);
hdr = out_iov[0].iov_base;
/*
* If first iov is unset, then SMB session was dropped or we've got a
@@ -608,18 +759,27 @@ int smb2_query_path_info(const unsigned int xid,
switch (rc) {
case 0:
case -EOPNOTSUPP:
+ /*
+ * BB TODO: When support for special files added to Samba
+ * re-verify this path.
+ */
rc = parse_create_response(data, cifs_sb, &out_iov[0]);
if (rc || !data->reparse_point)
goto out;
+ if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK) {
+ /* symlink already parsed in create response */
+ num_cmds = 1;
+ } else {
+ cmds[1] = SMB2_OP_GET_REPARSE;
+ num_cmds = 2;
+ }
create_options |= OPEN_REPARSE_POINT;
- /* Failed on a symbolic link - query a reparse point info */
cifs_get_readable_path(tcon, full_path, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, data,
- SMB2_OP_QUERY_INFO, cfile, NULL, NULL,
- NULL, NULL);
+ create_options, ACL_NO_MODE, in_iov,
+ cmds, num_cmds, cfile, NULL, NULL);
break;
case -EREMOTE:
break;
@@ -637,93 +797,8 @@ int smb2_query_path_info(const unsigned int xid,
}
out:
- free_rsp_buf(out_buftype[0], out_iov[0].iov_base);
- free_rsp_buf(out_buftype[1], out_iov[1].iov_base);
- free_rsp_buf(out_buftype[2], out_iov[2].iov_base);
- return rc;
-}
-
-int smb311_posix_query_path_info(const unsigned int xid,
- struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- const char *full_path,
- struct cifs_open_info_data *data,
- struct cifs_sid *owner,
- struct cifs_sid *group)
-{
- int rc;
- __u32 create_options = 0;
- struct cifsFileInfo *cfile;
- struct kvec out_iov[3] = {};
- int out_buftype[3] = {};
- __u8 *sidsbuf = NULL;
- __u8 *sidsbuf_end = NULL;
- size_t sidsbuflen = 0;
- size_t owner_len, group_len;
-
- data->adjust_tz = false;
- data->reparse_point = false;
-
- /*
- * BB TODO: Add support for using the cached root handle.
- * Create SMB2_query_posix_info worker function to do non-compounded query
- * when we already have an open file handle for this. For now this is fast enough
- * (always using the compounded version).
- */
-
- cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES, FILE_OPEN,
- create_options, ACL_NO_MODE, data, SMB2_OP_POSIX_QUERY_INFO, cfile,
- &sidsbuf, &sidsbuflen, out_iov, out_buftype);
- /*
- * If first iov is unset, then SMB session was dropped or we've got a
- * cached open file (@cfile).
- */
- if (!out_iov[0].iov_base || out_buftype[0] == CIFS_NO_BUFFER)
- goto out;
-
- switch (rc) {
- case 0:
- case -EOPNOTSUPP:
- /* BB TODO: When support for special files added to Samba re-verify this path */
- rc = parse_create_response(data, cifs_sb, &out_iov[0]);
- if (rc || !data->reparse_point)
- goto out;
-
- create_options |= OPEN_REPARSE_POINT;
- /* Failed on a symbolic link - query a reparse point info */
- cifs_get_readable_path(tcon, full_path, &cfile);
- rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, FILE_READ_ATTRIBUTES,
- FILE_OPEN, create_options, ACL_NO_MODE, data,
- SMB2_OP_POSIX_QUERY_INFO, cfile,
- &sidsbuf, &sidsbuflen, NULL, NULL);
- break;
- }
-
-out:
- if (rc == 0) {
- sidsbuf_end = sidsbuf + sidsbuflen;
-
- owner_len = posix_info_sid_size(sidsbuf, sidsbuf_end);
- if (owner_len == -1) {
- rc = -EINVAL;
- goto out;
- }
- memcpy(owner, sidsbuf, owner_len);
-
- group_len = posix_info_sid_size(
- sidsbuf + owner_len, sidsbuf_end);
- if (group_len == -1) {
- rc = -EINVAL;
- goto out;
- }
- memcpy(group, sidsbuf + owner_len, group_len);
- }
-
- kfree(sidsbuf);
- free_rsp_buf(out_buftype[0], out_iov[0].iov_base);
- free_rsp_buf(out_buftype[1], out_iov[1].iov_base);
- free_rsp_buf(out_buftype[2], out_iov[2].iov_base);
+ for (i = 0; i < ARRAY_SIZE(out_buftype); i++)
+ free_rsp_buf(out_buftype[i], out_iov[i].iov_base);
return rc;
}
@@ -734,8 +809,9 @@ smb2_mkdir(const unsigned int xid, struct inode *parent_inode, umode_t mode,
{
return smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, mode, NULL, SMB2_OP_MKDIR,
- NULL, NULL, NULL, NULL, NULL);
+ CREATE_NOT_FILE, mode,
+ NULL, &(int){SMB2_OP_MKDIR}, 1,
+ NULL, NULL, NULL);
}
void
@@ -743,21 +819,24 @@ smb2_mkdir_setinfo(struct inode *inode, const char *name,
struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon,
const unsigned int xid)
{
- FILE_BASIC_INFO data;
+ FILE_BASIC_INFO data = {};
struct cifsInodeInfo *cifs_i;
struct cifsFileInfo *cfile;
+ struct kvec in_iov;
u32 dosattrs;
int tmprc;
- memset(&data, 0, sizeof(data));
+ in_iov.iov_base = &data;
+ in_iov.iov_len = sizeof(data);
cifs_i = CIFS_I(inode);
dosattrs = cifs_i->cifsAttrs | ATTR_READONLY;
data.Attributes = cpu_to_le32(dosattrs);
cifs_get_writable_path(tcon, name, FIND_WR_ANY, &cfile);
tmprc = smb2_compound_op(xid, tcon, cifs_sb, name,
FILE_WRITE_ATTRIBUTES, FILE_CREATE,
- CREATE_NOT_FILE, ACL_NO_MODE,
- &data, SMB2_OP_SET_INFO, cfile, NULL, NULL, NULL, NULL);
+ CREATE_NOT_FILE, ACL_NO_MODE, &in_iov,
+ &(int){SMB2_OP_SET_INFO}, 1,
+ cfile, NULL, NULL);
if (tmprc == 0)
cifs_i->cifsAttrs = dosattrs;
}
@@ -767,9 +846,11 @@ smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
struct cifs_sb_info *cifs_sb)
{
drop_cached_dir_by_name(xid, tcon, name, cifs_sb);
- return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
- CREATE_NOT_FILE, ACL_NO_MODE,
- NULL, SMB2_OP_RMDIR, NULL, NULL, NULL, NULL, NULL);
+ return smb2_compound_op(xid, tcon, cifs_sb, name,
+ DELETE, FILE_OPEN, CREATE_NOT_FILE,
+ ACL_NO_MODE, NULL,
+ &(int){SMB2_OP_RMDIR}, 1,
+ NULL, NULL, NULL);
}
int
@@ -778,15 +859,18 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
{
return smb2_compound_op(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
- ACL_NO_MODE, NULL, SMB2_OP_DELETE, NULL, NULL, NULL, NULL, NULL);
+ ACL_NO_MODE, NULL,
+ &(int){SMB2_OP_DELETE}, 1,
+ NULL, NULL, NULL);
}
-static int
-smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb, __u32 access, int command,
- struct cifsFileInfo *cfile)
+static int smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb,
+ __u32 create_options, __u32 access,
+ int command, struct cifsFileInfo *cfile)
{
+ struct kvec in_iov;
__le16 *smb2_to_name = NULL;
int rc;
@@ -795,36 +879,43 @@ smb2_set_path_attr(const unsigned int xid, struct cifs_tcon *tcon,
rc = -ENOMEM;
goto smb2_rename_path;
}
+ in_iov.iov_base = smb2_to_name;
+ in_iov.iov_len = 2 * UniStrnlen((wchar_t *)smb2_to_name, PATH_MAX);
rc = smb2_compound_op(xid, tcon, cifs_sb, from_name, access,
- FILE_OPEN, 0, ACL_NO_MODE, smb2_to_name,
- command, cfile, NULL, NULL, NULL, NULL);
+ FILE_OPEN, create_options, ACL_NO_MODE,
+ &in_iov, &command, 1, cfile, NULL, NULL);
smb2_rename_path:
kfree(smb2_to_name);
return rc;
}
-int
-smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb)
+int smb2_rename_path(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb)
{
struct cifsFileInfo *cfile;
+ __u32 co = file_create_options(source_dentry);
drop_cached_dir_by_name(xid, tcon, from_name, cifs_sb);
cifs_get_writable_path(tcon, from_name, FIND_WR_WITH_DELETE, &cfile);
- return smb2_set_path_attr(xid, tcon, from_name, to_name,
- cifs_sb, DELETE, SMB2_OP_RENAME, cfile);
+ return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
+ co, DELETE, SMB2_OP_RENAME, cfile);
}
-int
-smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb)
+int smb2_create_hardlink(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb)
{
- return smb2_set_path_attr(xid, tcon, from_name, to_name, cifs_sb,
- FILE_READ_ATTRIBUTES, SMB2_OP_HARDLINK,
- NULL);
+ __u32 co = file_create_options(source_dentry);
+
+ return smb2_set_path_attr(xid, tcon, from_name, to_name,
+ cifs_sb, co, FILE_READ_ATTRIBUTES,
+ SMB2_OP_HARDLINK, NULL);
}
int
@@ -832,13 +923,18 @@ smb2_set_path_size(const unsigned int xid, struct cifs_tcon *tcon,
const char *full_path, __u64 size,
struct cifs_sb_info *cifs_sb, bool set_alloc)
{
- __le64 eof = cpu_to_le64(size);
struct cifsFileInfo *cfile;
+ struct kvec in_iov;
+ __le64 eof = cpu_to_le64(size);
+ in_iov.iov_base = &eof;
+ in_iov.iov_len = sizeof(eof);
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
return smb2_compound_op(xid, tcon, cifs_sb, full_path,
- FILE_WRITE_DATA, FILE_OPEN, 0, ACL_NO_MODE,
- &eof, SMB2_OP_SET_EOF, cfile, NULL, NULL, NULL, NULL);
+ FILE_WRITE_DATA, FILE_OPEN,
+ 0, ACL_NO_MODE, &in_iov,
+ &(int){SMB2_OP_SET_EOF}, 1,
+ cfile, NULL, NULL);
}
int
@@ -849,6 +945,7 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
struct tcon_link *tlink;
struct cifs_tcon *tcon;
struct cifsFileInfo *cfile;
+ struct kvec in_iov = { .iov_base = buf, .iov_len = sizeof(*buf), };
int rc;
if ((buf->CreationTime == 0) && (buf->LastAccessTime == 0) &&
@@ -864,8 +961,91 @@ smb2_set_file_info(struct inode *inode, const char *full_path,
cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
FILE_WRITE_ATTRIBUTES, FILE_OPEN,
- 0, ACL_NO_MODE, buf, SMB2_OP_SET_INFO, cfile,
- NULL, NULL, NULL, NULL);
+ 0, ACL_NO_MODE, &in_iov,
+ &(int){SMB2_OP_SET_INFO}, 1,
+ cfile, NULL, NULL);
cifs_put_tlink(tlink);
return rc;
}
+
+struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ struct kvec *iov)
+{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
+ struct cifsFileInfo *cfile;
+ struct inode *new = NULL;
+ struct kvec in_iov[2];
+ int cmds[2];
+ int da, co, cd;
+ int rc;
+
+ da = SYNCHRONIZE | DELETE |
+ FILE_READ_ATTRIBUTES |
+ FILE_WRITE_ATTRIBUTES;
+ co = CREATE_NOT_DIR | OPEN_REPARSE_POINT;
+ cd = FILE_CREATE;
+ cmds[0] = SMB2_OP_SET_REPARSE;
+ in_iov[0] = *iov;
+ in_iov[1].iov_base = data;
+ in_iov[1].iov_len = sizeof(*data);
+
+ if (tcon->posix_extensions) {
+ cmds[1] = SMB2_OP_POSIX_QUERY_INFO;
+ cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ da, cd, co, ACL_NO_MODE, in_iov,
+ cmds, 2, cfile, NULL, NULL);
+ if (!rc) {
+ rc = smb311_posix_get_inode_info(&new, full_path,
+ data, sb, xid);
+ }
+ } else {
+ cmds[1] = SMB2_OP_QUERY_INFO;
+ cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ da, cd, co, ACL_NO_MODE, in_iov,
+ cmds, 2, cfile, NULL, NULL);
+ if (!rc) {
+ rc = cifs_get_inode_info(&new, full_path,
+ data, sb, xid, NULL);
+ }
+ }
+ return rc ? ERR_PTR(rc) : new;
+}
+
+int smb2_query_reparse_point(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ u32 *tag, struct kvec *rsp,
+ int *rsp_buftype)
+{
+ struct cifs_open_info_data data = {};
+ struct cifsFileInfo *cfile;
+ struct kvec in_iov = { .iov_base = &data, .iov_len = sizeof(data), };
+ int rc;
+
+ cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
+
+ cifs_get_readable_path(tcon, full_path, &cfile);
+ rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+ FILE_READ_ATTRIBUTES, FILE_OPEN,
+ OPEN_REPARSE_POINT, ACL_NO_MODE, &in_iov,
+ &(int){SMB2_OP_GET_REPARSE}, 1,
+ cfile, NULL, NULL);
+ if (rc)
+ goto out;
+
+ *tag = data.reparse.tag;
+ *rsp = data.reparse.io.iov;
+ *rsp_buftype = data.reparse.io.buftype;
+ memset(&data.reparse.io.iov, 0, sizeof(data.reparse.io.iov));
+ data.reparse.io.buftype = CIFS_NO_BUFFER;
+out:
+ cifs_free_open_info(&data);
+ return rc;
+}
diff --git a/fs/smb/client/smb2maperror.c b/fs/smb/client/smb2maperror.c
index 1a90dd78b238..ac1895358908 100644
--- a/fs/smb/client/smb2maperror.c
+++ b/fs/smb/client/smb2maperror.c
@@ -1210,6 +1210,8 @@ static const struct status_to_posix_error smb2_error_map_table[] = {
{STATUS_INVALID_TASK_INDEX, -EIO, "STATUS_INVALID_TASK_INDEX"},
{STATUS_THREAD_ALREADY_IN_TASK, -EIO, "STATUS_THREAD_ALREADY_IN_TASK"},
{STATUS_CALLBACK_BYPASS, -EIO, "STATUS_CALLBACK_BYPASS"},
+ {STATUS_SERVER_UNAVAILABLE, -EAGAIN, "STATUS_SERVER_UNAVAILABLE"},
+ {STATUS_FILE_NOT_AVAILABLE, -EAGAIN, "STATUS_FILE_NOT_AVAILABLE"},
{STATUS_PORT_CLOSED, -EIO, "STATUS_PORT_CLOSED"},
{STATUS_MESSAGE_LOST, -EIO, "STATUS_MESSAGE_LOST"},
{STATUS_INVALID_MESSAGE, -EIO, "STATUS_INVALID_MESSAGE"},
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 14bc745de199..4695433fcf39 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -614,11 +614,12 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
"multichannel not available\n"
"Empty network interface list returned by server %s\n",
ses->server->hostname);
- rc = -EINVAL;
+ rc = -EOPNOTSUPP;
+ ses->iface_last_update = jiffies;
goto out;
}
- while (bytes_left >= sizeof(*p)) {
+ while (bytes_left >= (ssize_t)sizeof(*p)) {
memset(&tmp_iface, 0, sizeof(tmp_iface));
tmp_iface.speed = le64_to_cpu(p->LinkSpeed);
tmp_iface.rdma_capable = le32_to_cpu(p->Capability & RDMA_CAPABLE) ? 1 : 0;
@@ -712,7 +713,6 @@ parse_server_interfaces(struct network_interface_info_ioctl_rsp *buf,
ses->iface_count++;
spin_unlock(&ses->iface_lock);
- ses->iface_last_update = jiffies;
next_iface:
nb_iface++;
next = le32_to_cpu(p->Next);
@@ -734,11 +734,7 @@ next_iface:
if ((bytes_left > 8) || p->Next)
cifs_dbg(VFS, "%s: incomplete interface info\n", __func__);
-
- if (!ses->iface_count) {
- rc = -EINVAL;
- goto out;
- }
+ ses->iface_last_update = jiffies;
out:
/*
@@ -1112,7 +1108,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
{
struct smb2_compound_vars *vars;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct smb_rqst *rqst;
struct kvec *rsp_iov;
__le16 *utf16_path = NULL;
@@ -1128,6 +1124,13 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
struct smb2_file_full_ea_info *ea = NULL;
struct smb2_query_info_rsp *rsp;
int rc, used_len = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_CP_CREATE_CLOSE_OP;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(ses);
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -1201,6 +1204,7 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
.disposition = FILE_OPEN,
.create_options = cifs_create_options(cifs_sb, 0),
.fid = &fid,
+ .replay = !!(retries),
};
rc = SMB2_open_init(tcon, server,
@@ -1248,6 +1252,12 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
goto sea_exit;
smb2_set_related(&rqst[2]);
+ if (retries) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[1]);
+ smb2_set_replay(server, &rqst[2]);
+ }
+
rc = compound_send_recv(xid, ses, server,
flags, 3, rqst,
resp_buftype, rsp_iov);
@@ -1264,6 +1274,11 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
kfree(vars);
out_free_path:
kfree(utf16_path);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
#endif
@@ -1488,7 +1503,7 @@ smb2_ioctl_query_info(const unsigned int xid,
struct smb_rqst *rqst;
struct kvec *rsp_iov;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
char __user *arg = (char __user *)p;
struct smb_query_info qi;
struct smb_query_info __user *pqi;
@@ -1505,6 +1520,13 @@ smb2_ioctl_query_info(const unsigned int xid,
void *data[2];
int create_options = is_dir ? CREATE_NOT_FILE : CREATE_NOT_DIR;
void (*free_req1_func)(struct smb_rqst *r);
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_CP_CREATE_CLOSE_OP;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(ses);
vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
if (vars == NULL)
@@ -1548,6 +1570,7 @@ smb2_ioctl_query_info(const unsigned int xid,
.disposition = FILE_OPEN,
.create_options = cifs_create_options(cifs_sb, create_options),
.fid = &fid,
+ .replay = !!(retries),
};
if (qi.flags & PASSTHRU_FSCTL) {
@@ -1645,6 +1668,12 @@ smb2_ioctl_query_info(const unsigned int xid,
goto free_req_1;
smb2_set_related(&rqst[2]);
+ if (retries) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[1]);
+ smb2_set_replay(server, &rqst[2]);
+ }
+
rc = compound_send_recv(xid, ses, server,
flags, 3, rqst,
resp_buftype, rsp_iov);
@@ -1705,6 +1734,11 @@ free_output_buffer:
kfree(buffer);
free_vars:
kfree(vars);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -1935,7 +1969,6 @@ static int
smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile, __u64 size, bool set_alloc)
{
- __le64 eof = cpu_to_le64(size);
struct inode *inode;
/*
@@ -1952,7 +1985,7 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon,
}
return SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid, &eof);
+ cfile->fid.volatile_fid, cfile->pid, size);
}
static int
@@ -2232,8 +2265,14 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_open_parms oparms;
struct smb2_query_directory_rsp *qd_rsp = NULL;
struct smb2_create_rsp *op_rsp = NULL;
- struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
- int retry_count = 0;
+ struct TCP_Server_Info *server;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(tcon->ses);
utf16_path = cifs_convert_path_to_utf16(path, cifs_sb);
if (!utf16_path)
@@ -2258,6 +2297,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
.disposition = FILE_OPEN,
.create_options = cifs_create_options(cifs_sb, 0),
.fid = fid,
+ .replay = !!(retries),
};
rc = SMB2_open_init(tcon, server,
@@ -2283,14 +2323,15 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
smb2_set_related(&rqst[1]);
-again:
+ if (retries) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[1]);
+ }
+
rc = compound_send_recv(xid, tcon->ses, server,
flags, 2, rqst,
resp_buftype, rsp_iov);
- if (rc == -EAGAIN && retry_count++ < 10)
- goto again;
-
/* If the open failed there is nothing to do */
op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base;
if (op_rsp == NULL || op_rsp->hdr.Status != STATUS_SUCCESS) {
@@ -2338,6 +2379,11 @@ again:
SMB2_query_directory_free(&rqst[1]);
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -2463,6 +2509,22 @@ smb2_oplock_response(struct cifs_tcon *tcon, __u64 persistent_fid,
}
void
+smb2_set_replay(struct TCP_Server_Info *server, struct smb_rqst *rqst)
+{
+ struct smb2_hdr *shdr;
+
+ if (server->dialect < SMB30_PROT_ID)
+ return;
+
+ shdr = (struct smb2_hdr *)(rqst->rq_iov[0].iov_base);
+ if (shdr == NULL) {
+ cifs_dbg(FYI, "shdr NULL in smb2_set_related\n");
+ return;
+ }
+ shdr->Flags |= SMB2_FLAGS_REPLAY_OPERATION;
+}
+
+void
smb2_set_related(struct smb_rqst *rqst)
{
struct smb2_hdr *shdr;
@@ -2535,6 +2597,27 @@ smb2_set_next_command(struct cifs_tcon *tcon, struct smb_rqst *rqst)
}
/*
+ * helper function for exponential backoff and check if replayable
+ */
+bool smb2_should_replay(struct cifs_tcon *tcon,
+ int *pretries,
+ int *pcur_sleep)
+{
+ if (!pretries || !pcur_sleep)
+ return false;
+
+ if (tcon->retry || (*pretries)++ < tcon->ses->server->retrans) {
+ msleep(*pcur_sleep);
+ (*pcur_sleep) = ((*pcur_sleep) << 1);
+ if ((*pcur_sleep) > CIFS_MAX_SLEEP)
+ (*pcur_sleep) = CIFS_MAX_SLEEP;
+ return true;
+ }
+
+ return false;
+}
+
+/*
* Passes the query info response back to the caller on success.
* Caller need to free this with free_rsp_buf().
*/
@@ -2547,7 +2630,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
{
struct smb2_compound_vars *vars;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int flags = CIFS_CP_CREATE_CLOSE_OP;
struct smb_rqst *rqst;
int resp_buftype[3];
@@ -2558,6 +2641,13 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
__le16 *utf16_path;
struct cached_fid *cfid = NULL;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_CP_CREATE_CLOSE_OP;
+ oplock = SMB2_OPLOCK_LEVEL_NONE;
+ server = cifs_pick_channel(ses);
if (!path)
path = "";
@@ -2594,6 +2684,7 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
.disposition = FILE_OPEN,
.create_options = cifs_create_options(cifs_sb, 0),
.fid = &fid,
+ .replay = !!(retries),
};
rc = SMB2_open_init(tcon, server,
@@ -2638,6 +2729,14 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
goto qic_exit;
smb2_set_related(&rqst[2]);
+ if (retries) {
+ if (!cfid) {
+ smb2_set_replay(server, &rqst[0]);
+ smb2_set_replay(server, &rqst[2]);
+ }
+ smb2_set_replay(server, &rqst[1]);
+ }
+
if (cfid) {
rc = compound_send_recv(xid, ses, server,
flags, 1, &rqst[1],
@@ -2670,6 +2769,11 @@ smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon,
kfree(vars);
out_free_path:
kfree(utf16_path);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -2948,18 +3052,6 @@ int parse_reparse_point(struct reparse_data_buffer *buf,
u32 plen, struct cifs_sb_info *cifs_sb,
bool unicode, struct cifs_open_info_data *data)
{
- if (plen < sizeof(*buf)) {
- cifs_dbg(VFS, "%s: reparse buffer is too small. Must be at least 8 bytes but was %d\n",
- __func__, plen);
- return -EIO;
- }
-
- if (plen < le16_to_cpu(buf->ReparseDataLength) + sizeof(*buf)) {
- cifs_dbg(VFS, "%s: invalid reparse buf length: %d\n",
- __func__, plen);
- return -EIO;
- }
-
data->reparse.buf = buf;
/* See MS-FSCC 2.1.2 */
@@ -2997,145 +3089,6 @@ static int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb,
return parse_reparse_point(buf, plen, cifs_sb, true, data);
}
-static int smb2_query_reparse_point(const unsigned int xid,
- struct cifs_tcon *tcon,
- struct cifs_sb_info *cifs_sb,
- const char *full_path,
- u32 *tag, struct kvec *rsp,
- int *rsp_buftype)
-{
- struct smb2_compound_vars *vars;
- int rc;
- __le16 *utf16_path = NULL;
- __u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
- struct cifs_open_parms oparms;
- struct cifs_fid fid;
- struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
- int flags = CIFS_CP_CREATE_CLOSE_OP;
- struct smb_rqst *rqst;
- int resp_buftype[3];
- struct kvec *rsp_iov;
- struct smb2_ioctl_rsp *ioctl_rsp;
- struct reparse_data_buffer *reparse_buf;
- u32 off, count, len;
-
- cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
-
- if (smb3_encryption_required(tcon))
- flags |= CIFS_TRANSFORM_REQ;
-
- utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
- if (!utf16_path)
- return -ENOMEM;
-
- resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
- vars = kzalloc(sizeof(*vars), GFP_KERNEL);
- if (!vars) {
- rc = -ENOMEM;
- goto out_free_path;
- }
- rqst = vars->rqst;
- rsp_iov = vars->rsp_iov;
-
- /*
- * setup smb2open - TODO add optimization to call cifs_get_readable_path
- * to see if there is a handle already open that we can use
- */
- rqst[0].rq_iov = vars->open_iov;
- rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE;
-
- oparms = (struct cifs_open_parms) {
- .tcon = tcon,
- .path = full_path,
- .desired_access = FILE_READ_ATTRIBUTES,
- .disposition = FILE_OPEN,
- .create_options = cifs_create_options(cifs_sb, OPEN_REPARSE_POINT),
- .fid = &fid,
- };
-
- rc = SMB2_open_init(tcon, server,
- &rqst[0], &oplock, &oparms, utf16_path);
- if (rc)
- goto query_rp_exit;
- smb2_set_next_command(tcon, &rqst[0]);
-
-
- /* IOCTL */
- rqst[1].rq_iov = vars->io_iov;
- rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
-
- rc = SMB2_ioctl_init(tcon, server,
- &rqst[1], COMPOUND_FID,
- COMPOUND_FID, FSCTL_GET_REPARSE_POINT, NULL, 0,
- CIFSMaxBufSize -
- MAX_SMB2_CREATE_RESPONSE_SIZE -
- MAX_SMB2_CLOSE_RESPONSE_SIZE);
- if (rc)
- goto query_rp_exit;
-
- smb2_set_next_command(tcon, &rqst[1]);
- smb2_set_related(&rqst[1]);
-
- /* Close */
- rqst[2].rq_iov = &vars->close_iov;
- rqst[2].rq_nvec = 1;
-
- rc = SMB2_close_init(tcon, server,
- &rqst[2], COMPOUND_FID, COMPOUND_FID, false);
- if (rc)
- goto query_rp_exit;
-
- smb2_set_related(&rqst[2]);
-
- rc = compound_send_recv(xid, tcon->ses, server,
- flags, 3, rqst,
- resp_buftype, rsp_iov);
-
- ioctl_rsp = rsp_iov[1].iov_base;
-
- /*
- * Open was successful and we got an ioctl response.
- */
- if (rc == 0) {
- /* See MS-FSCC 2.3.23 */
- off = le32_to_cpu(ioctl_rsp->OutputOffset);
- count = le32_to_cpu(ioctl_rsp->OutputCount);
- if (check_add_overflow(off, count, &len) ||
- len > rsp_iov[1].iov_len) {
- cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n",
- __func__, off, count);
- rc = -EIO;
- goto query_rp_exit;
- }
-
- reparse_buf = (void *)((u8 *)ioctl_rsp + off);
- len = sizeof(*reparse_buf);
- if (count < len ||
- count < le16_to_cpu(reparse_buf->ReparseDataLength) + len) {
- cifs_tcon_dbg(VFS, "%s: invalid ioctl: off=%d count=%d\n",
- __func__, off, count);
- rc = -EIO;
- goto query_rp_exit;
- }
- *tag = le32_to_cpu(reparse_buf->ReparseTag);
- *rsp = rsp_iov[1];
- *rsp_buftype = resp_buftype[1];
- resp_buftype[1] = CIFS_NO_BUFFER;
- }
-
- query_rp_exit:
- SMB2_open_free(&rqst[0]);
- SMB2_ioctl_free(&rqst[1]);
- SMB2_close_free(&rqst[2]);
- free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
- free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
- free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
- kfree(vars);
-out_free_path:
- kfree(utf16_path);
- return rc;
-}
-
static struct cifs_ntsd *
get_smb2_acl_by_fid(struct cifs_sb_info *cifs_sb,
const struct cifs_fid *cifsfid, u32 *pacllen, u32 info)
@@ -3336,7 +3289,6 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
unsigned long long new_size;
long rc;
unsigned int xid;
- __le64 eof;
xid = get_xid();
@@ -3366,11 +3318,13 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
*/
new_size = offset + len;
if (keep_size == false && (unsigned long long)i_size_read(inode) < new_size) {
- eof = cpu_to_le64(new_size);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid, &eof);
+ cfile->fid.volatile_fid, cfile->pid, new_size);
if (rc >= 0) {
truncate_setsize(inode, new_size);
+ netfs_resize_file(&cifsi->netfs, new_size, true);
+ if (offset < cifsi->netfs.zero_point)
+ cifsi->netfs.zero_point = offset;
fscache_resize_cookie(cifs_inode_cookie(inode), new_size);
}
}
@@ -3561,7 +3515,7 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
struct cifsFileInfo *cfile = file->private_data;
long rc = -EOPNOTSUPP;
unsigned int xid;
- __le64 eof;
+ loff_t new_eof;
xid = get_xid();
@@ -3590,14 +3544,14 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon,
if (cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)
smb2_set_sparse(xid, tcon, cfile, inode, false);
- eof = cpu_to_le64(off + len);
+ new_eof = off + len;
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid, &eof);
+ cfile->fid.volatile_fid, cfile->pid, new_eof);
if (rc == 0) {
- cifsi->server_eof = off + len;
- cifs_setsize(inode, off + len);
+ netfs_resize_file(&cifsi->netfs, new_eof, true);
+ cifs_setsize(inode, new_eof);
cifs_truncate_page(inode->i_mapping, inode->i_size);
- truncate_setsize(inode, off + len);
+ truncate_setsize(inode, new_eof);
}
goto out;
}
@@ -3686,10 +3640,10 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
int rc;
unsigned int xid;
struct inode *inode = file_inode(file);
- struct cifsFileInfo *cfile = file->private_data;
struct cifsInodeInfo *cifsi = CIFS_I(inode);
- __le64 eof;
- loff_t old_eof;
+ struct cifsFileInfo *cfile = file->private_data;
+ struct netfs_inode *ictx = &cifsi->netfs;
+ loff_t old_eof, new_eof;
xid = get_xid();
@@ -3708,23 +3662,25 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon,
goto out_2;
truncate_pagecache_range(inode, off, old_eof);
+ ictx->zero_point = old_eof;
rc = smb2_copychunk_range(xid, cfile, cfile, off + len,
old_eof - off - len, off);
if (rc < 0)
goto out_2;
- eof = cpu_to_le64(old_eof - len);
+ new_eof = old_eof - len;
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid, &eof);
+ cfile->fid.volatile_fid, cfile->pid, new_eof);
if (rc < 0)
goto out_2;
rc = 0;
- cifsi->server_eof = i_size_read(inode) - len;
- truncate_setsize(inode, cifsi->server_eof);
- fscache_resize_cookie(cifs_inode_cookie(inode), cifsi->server_eof);
+ truncate_setsize(inode, new_eof);
+ netfs_resize_file(&cifsi->netfs, new_eof, true);
+ ictx->zero_point = new_eof;
+ fscache_resize_cookie(cifs_inode_cookie(inode), new_eof);
out_2:
filemap_invalidate_unlock(inode->i_mapping);
out:
@@ -3740,8 +3696,8 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
unsigned int xid;
struct cifsFileInfo *cfile = file->private_data;
struct inode *inode = file_inode(file);
- __le64 eof;
- __u64 count, old_eof;
+ struct cifsInodeInfo *cifsi = CIFS_I(inode);
+ __u64 count, old_eof, new_eof;
xid = get_xid();
@@ -3754,20 +3710,21 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon,
}
count = old_eof - off;
- eof = cpu_to_le64(old_eof + len);
+ new_eof = old_eof + len;
filemap_invalidate_lock(inode->i_mapping);
- rc = filemap_write_and_wait_range(inode->i_mapping, off, old_eof + len - 1);
+ rc = filemap_write_and_wait_range(inode->i_mapping, off, new_eof - 1);
if (rc < 0)
goto out_2;
truncate_pagecache_range(inode, off, old_eof);
rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, cfile->pid, &eof);
+ cfile->fid.volatile_fid, cfile->pid, new_eof);
if (rc < 0)
goto out_2;
- truncate_setsize(inode, old_eof + len);
+ truncate_setsize(inode, new_eof);
+ netfs_resize_file(&cifsi->netfs, i_size_read(inode), true);
fscache_resize_cookie(cifs_inode_cookie(inode), i_size_read(inode));
rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len);
@@ -5171,11 +5128,158 @@ int cifs_sfu_make_node(unsigned int xid, struct inode *inode,
return rc;
}
+static inline u64 mode_nfs_type(mode_t mode)
+{
+ switch (mode & S_IFMT) {
+ case S_IFBLK: return NFS_SPECFILE_BLK;
+ case S_IFCHR: return NFS_SPECFILE_CHR;
+ case S_IFIFO: return NFS_SPECFILE_FIFO;
+ case S_IFSOCK: return NFS_SPECFILE_SOCK;
+ }
+ return 0;
+}
+
+static int nfs_set_reparse_buf(struct reparse_posix_data *buf,
+ mode_t mode, dev_t dev,
+ struct kvec *iov)
+{
+ u64 type;
+ u16 len, dlen;
+
+ len = sizeof(*buf);
+
+ switch ((type = mode_nfs_type(mode))) {
+ case NFS_SPECFILE_BLK:
+ case NFS_SPECFILE_CHR:
+ dlen = sizeof(__le64);
+ break;
+ case NFS_SPECFILE_FIFO:
+ case NFS_SPECFILE_SOCK:
+ dlen = 0;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_NFS);
+ buf->Reserved = 0;
+ buf->InodeType = cpu_to_le64(type);
+ buf->ReparseDataLength = cpu_to_le16(len + dlen -
+ sizeof(struct reparse_data_buffer));
+ *(__le64 *)buf->DataBuffer = cpu_to_le64(((u64)MAJOR(dev) << 32) |
+ MINOR(dev));
+ iov->iov_base = buf;
+ iov->iov_len = len + dlen;
+ return 0;
+}
+
+static int nfs_make_node(unsigned int xid, struct inode *inode,
+ struct dentry *dentry, struct cifs_tcon *tcon,
+ const char *full_path, umode_t mode, dev_t dev)
+{
+ struct cifs_open_info_data data;
+ struct reparse_posix_data *p;
+ struct inode *new;
+ struct kvec iov;
+ __u8 buf[sizeof(*p) + sizeof(__le64)];
+ int rc;
+
+ p = (struct reparse_posix_data *)buf;
+ rc = nfs_set_reparse_buf(p, mode, dev, &iov);
+ if (rc)
+ return rc;
+
+ data = (struct cifs_open_info_data) {
+ .reparse_point = true,
+ .reparse = { .tag = IO_REPARSE_TAG_NFS, .posix = p, },
+ };
+
+ new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+ tcon, full_path, &iov);
+ if (!IS_ERR(new))
+ d_instantiate(dentry, new);
+ else
+ rc = PTR_ERR(new);
+ cifs_free_open_info(&data);
+ return rc;
+}
+
+static int smb2_create_reparse_symlink(const unsigned int xid,
+ struct inode *inode,
+ struct dentry *dentry,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ const char *symname)
+{
+ struct reparse_symlink_data_buffer *buf = NULL;
+ struct cifs_open_info_data data;
+ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ struct inode *new;
+ struct kvec iov;
+ __le16 *path;
+ char *sym, sep = CIFS_DIR_SEP(cifs_sb);
+ u16 len, plen;
+ int rc = 0;
+
+ sym = kstrdup(symname, GFP_KERNEL);
+ if (!sym)
+ return -ENOMEM;
+
+ data = (struct cifs_open_info_data) {
+ .reparse_point = true,
+ .reparse = { .tag = IO_REPARSE_TAG_SYMLINK, },
+ .symlink_target = sym,
+ };
+
+ convert_delimiter(sym, sep);
+ path = cifs_convert_path_to_utf16(sym, cifs_sb);
+ if (!path) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ plen = 2 * UniStrnlen((wchar_t *)path, PATH_MAX);
+ len = sizeof(*buf) + plen * 2;
+ buf = kzalloc(len, GFP_KERNEL);
+ if (!buf) {
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ buf->ReparseTag = cpu_to_le32(IO_REPARSE_TAG_SYMLINK);
+ buf->ReparseDataLength = cpu_to_le16(len - sizeof(struct reparse_data_buffer));
+ buf->SubstituteNameOffset = cpu_to_le16(plen);
+ buf->SubstituteNameLength = cpu_to_le16(plen);
+ memcpy(&buf->PathBuffer[plen], path, plen);
+ buf->PrintNameOffset = 0;
+ buf->PrintNameLength = cpu_to_le16(plen);
+ memcpy(buf->PathBuffer, path, plen);
+ buf->Flags = cpu_to_le32(*symname != '/' ? SYMLINK_FLAG_RELATIVE : 0);
+ if (*sym != sep)
+ buf->Flags = cpu_to_le32(SYMLINK_FLAG_RELATIVE);
+
+ convert_delimiter(sym, '/');
+ iov.iov_base = buf;
+ iov.iov_len = len;
+ new = smb2_get_reparse_inode(&data, inode->i_sb, xid,
+ tcon, full_path, &iov);
+ if (!IS_ERR(new))
+ d_instantiate(dentry, new);
+ else
+ rc = PTR_ERR(new);
+out:
+ kfree(path);
+ cifs_free_open_info(&data);
+ kfree(buf);
+ return rc;
+}
+
static int smb2_make_node(unsigned int xid, struct inode *inode,
struct dentry *dentry, struct cifs_tcon *tcon,
const char *full_path, umode_t mode, dev_t dev)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+ int rc;
/*
* Check if mounted with mount parm 'sfu' mount parm.
@@ -5183,15 +5287,14 @@ static int smb2_make_node(unsigned int xid, struct inode *inode,
* supports block and char device (no socket & fifo),
* and was used by default in earlier versions of Windows
*/
- if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL))
- return -EPERM;
- /*
- * TODO: Add ability to create instead via reparse point. Windows (e.g.
- * their current NFS server) uses this approach to expose special files
- * over SMB2/SMB3 and Samba will do this with SMB3.1.1 POSIX Extensions
- */
- return cifs_sfu_make_node(xid, inode, dentry, tcon,
- full_path, mode, dev);
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
+ rc = cifs_sfu_make_node(xid, inode, dentry, tcon,
+ full_path, mode, dev);
+ } else {
+ rc = nfs_make_node(xid, inode, dentry, tcon,
+ full_path, mode, dev);
+ }
+ return rc;
}
#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
@@ -5247,6 +5350,7 @@ struct smb_version_operations smb20_operations = {
.parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
+ .create_reparse_symlink = smb2_create_reparse_symlink,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -5349,6 +5453,7 @@ struct smb_version_operations smb21_operations = {
.parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
+ .create_reparse_symlink = smb2_create_reparse_symlink,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -5454,6 +5559,7 @@ struct smb_version_operations smb30_operations = {
.parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
+ .create_reparse_symlink = smb2_create_reparse_symlink,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
@@ -5568,6 +5674,7 @@ struct smb_version_operations smb311_operations = {
.parse_reparse_point = smb2_parse_reparse_point,
.query_mf_symlink = smb3_query_mf_symlink,
.create_mf_symlink = smb3_create_mf_symlink,
+ .create_reparse_symlink = smb2_create_reparse_symlink,
.open = smb2_open_file,
.set_fid = smb2_set_fid,
.close = smb2_close_file,
diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c
index 4f971c1061f0..608ee05491e2 100644
--- a/fs/smb/client/smb2pdu.c
+++ b/fs/smb/client/smb2pdu.c
@@ -156,6 +156,56 @@ out:
return;
}
+/* helper function for code reuse */
+static int
+cifs_chan_skip_or_disable(struct cifs_ses *ses,
+ struct TCP_Server_Info *server,
+ bool from_reconnect)
+{
+ struct TCP_Server_Info *pserver;
+ unsigned int chan_index;
+
+ if (SERVER_IS_CHAN(server)) {
+ cifs_dbg(VFS,
+ "server %s does not support multichannel anymore. Skip secondary channel\n",
+ ses->server->hostname);
+
+ spin_lock(&ses->chan_lock);
+ chan_index = cifs_ses_get_chan_index(ses, server);
+ if (chan_index == CIFS_INVAL_CHAN_INDEX) {
+ spin_unlock(&ses->chan_lock);
+ goto skip_terminate;
+ }
+
+ ses->chans[chan_index].server = NULL;
+ server->terminate = true;
+ spin_unlock(&ses->chan_lock);
+
+ /*
+ * the above reference of server by channel
+ * needs to be dropped without holding chan_lock
+ * as cifs_put_tcp_session takes a higher lock
+ * i.e. cifs_tcp_ses_lock
+ */
+ cifs_put_tcp_session(server, from_reconnect);
+
+ cifs_signal_cifsd_for_reconnect(server, false);
+
+ /* mark primary server as needing reconnect */
+ pserver = server->primary_server;
+ cifs_signal_cifsd_for_reconnect(pserver, false);
+skip_terminate:
+ return -EHOSTDOWN;
+ }
+
+ cifs_server_dbg(VFS,
+ "server does not support multichannel anymore. Disable all other channels\n");
+ cifs_disable_secondary_channels(ses);
+
+
+ return 0;
+}
+
static int
smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
struct TCP_Server_Info *server, bool from_reconnect)
@@ -164,8 +214,6 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon,
struct nls_table *nls_codepage = NULL;
struct cifs_ses *ses;
int xid;
- struct TCP_Server_Info *pserver;
- unsigned int chan_index;
/*
* SMB2s NegProt, SessSetup, Logoff do not have tcon yet so
@@ -310,44 +358,11 @@ again:
*/
if (ses->chan_count > 1 &&
!(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
- if (SERVER_IS_CHAN(server)) {
- cifs_dbg(VFS, "server %s does not support " \
- "multichannel anymore. skipping secondary channel\n",
- ses->server->hostname);
-
- spin_lock(&ses->chan_lock);
- chan_index = cifs_ses_get_chan_index(ses, server);
- if (chan_index == CIFS_INVAL_CHAN_INDEX) {
- spin_unlock(&ses->chan_lock);
- goto skip_terminate;
- }
-
- ses->chans[chan_index].server = NULL;
- spin_unlock(&ses->chan_lock);
-
- /*
- * the above reference of server by channel
- * needs to be dropped without holding chan_lock
- * as cifs_put_tcp_session takes a higher lock
- * i.e. cifs_tcp_ses_lock
- */
- cifs_put_tcp_session(server, from_reconnect);
-
- server->terminate = true;
- cifs_signal_cifsd_for_reconnect(server, false);
-
- /* mark primary server as needing reconnect */
- pserver = server->primary_server;
- cifs_signal_cifsd_for_reconnect(pserver, false);
-
-skip_terminate:
+ rc = cifs_chan_skip_or_disable(ses, server,
+ from_reconnect);
+ if (rc) {
mutex_unlock(&ses->session_mutex);
- rc = -EHOSTDOWN;
goto out;
- } else {
- cifs_server_dbg(VFS, "does not support " \
- "multichannel anymore. disabling all other channels\n");
- cifs_disable_secondary_channels(ses);
}
}
@@ -384,6 +399,15 @@ skip_sess_setup:
goto out;
}
+ spin_lock(&ses->ses_lock);
+ if (ses->flags & CIFS_SES_FLAG_SCALE_CHANNELS) {
+ spin_unlock(&ses->ses_lock);
+ mutex_unlock(&ses->session_mutex);
+ goto skip_add_channels;
+ }
+ ses->flags |= CIFS_SES_FLAG_SCALE_CHANNELS;
+ spin_unlock(&ses->ses_lock);
+
if (!rc &&
(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) {
mutex_unlock(&ses->session_mutex);
@@ -395,14 +419,29 @@ skip_sess_setup:
rc = SMB3_request_interfaces(xid, tcon, false);
free_xid(xid);
- if (rc)
+ if (rc == -EOPNOTSUPP && ses->chan_count > 1) {
+ /*
+ * some servers like Azure SMB server do not advertise
+ * that multichannel has been disabled with server
+ * capabilities, rather return STATUS_NOT_IMPLEMENTED.
+ * treat this as server not supporting multichannel
+ */
+
+ rc = cifs_chan_skip_or_disable(ses, server,
+ from_reconnect);
+ goto skip_add_channels;
+ } else if (rc)
cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n",
__func__, rc);
if (ses->chan_max > ses->chan_count &&
+ ses->iface_count &&
!SERVER_IS_CHAN(server)) {
- if (ses->chan_count == 1)
+ if (ses->chan_count == 1) {
cifs_server_dbg(VFS, "supports multichannel now\n");
+ queue_delayed_work(cifsiod_wq, &tcon->query_interfaces,
+ (SMB_INTERFACE_POLL_INTERVAL * HZ));
+ }
cifs_try_adding_channels(ses);
}
@@ -410,6 +449,11 @@ skip_sess_setup:
mutex_unlock(&ses->session_mutex);
}
+skip_add_channels:
+ spin_lock(&ses->ses_lock);
+ ses->flags &= ~CIFS_SES_FLAG_SCALE_CHANNELS;
+ spin_unlock(&ses->ses_lock);
+
if (smb2_command != SMB2_INTERNAL_CMD)
mod_delayed_work(cifsiod_wq, &server->reconnect, 0);
@@ -1958,10 +2002,7 @@ SMB2_tcon(const unsigned int xid, struct cifs_ses *ses, const char *tree,
__le16 *unc_path = NULL;
int flags = 0;
unsigned int total_len;
- struct TCP_Server_Info *server;
-
- /* always use master channel */
- server = ses->server;
+ struct TCP_Server_Info *server = cifs_pick_channel(ses);
cifs_dbg(FYI, "TCON\n");
@@ -2094,6 +2135,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
struct smb2_tree_disconnect_req *req; /* response is trivial */
int rc = 0;
struct cifs_ses *ses = tcon->ses;
+ struct TCP_Server_Info *server = cifs_pick_channel(ses);
int flags = 0;
unsigned int total_len;
struct kvec iov[1];
@@ -2116,7 +2158,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
invalidate_all_cached_dirs(tcon);
- rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server,
+ rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, server,
(void **) &req,
&total_len);
if (rc)
@@ -2134,7 +2176,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
rqst.rq_iov = iov;
rqst.rq_nvec = 1;
- rc = cifs_send_recv(xid, ses, ses->server,
+ rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
if (rc) {
@@ -2279,7 +2321,7 @@ int smb2_parse_contexts(struct TCP_Server_Info *server,
noff = le16_to_cpu(cc->NameOffset);
nlen = le16_to_cpu(cc->NameLength);
- if (noff + nlen >= doff)
+ if (noff + nlen > doff)
return -EINVAL;
name = (char *)cc + noff;
@@ -2362,8 +2404,13 @@ create_durable_v2_buf(struct cifs_open_parms *oparms)
*/
buf->dcontext.Timeout = cpu_to_le32(oparms->tcon->handle_timeout);
buf->dcontext.Flags = cpu_to_le32(SMB2_DHANDLE_FLAG_PERSISTENT);
- generate_random_uuid(buf->dcontext.CreateGuid);
- memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
+
+ /* for replay, we should not overwrite the existing create guid */
+ if (!oparms->replay) {
+ generate_random_uuid(buf->dcontext.CreateGuid);
+ memcpy(pfid->create_guid, buf->dcontext.CreateGuid, 16);
+ } else
+ memcpy(buf->dcontext.CreateGuid, pfid->create_guid, 16);
/* SMB2_CREATE_DURABLE_HANDLE_REQUEST is "DH2Q" */
buf->Name[0] = 'D';
@@ -2736,7 +2783,14 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
int flags = 0;
unsigned int total_len;
__le16 *utf16_path = NULL;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ n_iov = 2;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "mkdir\n");
@@ -2840,6 +2894,10 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode,
/* no need to inc num_remote_opens because we close it just below */
trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, full_path, CREATE_NOT_FILE,
FILE_WRITE_ATTRIBUTES);
+
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
/* resource #4: response buffer */
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
@@ -2877,6 +2935,11 @@ err_free_req:
cifs_small_buf_release(req);
err_free_path:
kfree(utf16_path);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3072,12 +3135,19 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
struct smb2_create_rsp *rsp = NULL;
struct cifs_tcon *tcon = oparms->tcon;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct kvec iov[SMB2_CREATE_IOV_SIZE];
struct kvec rsp_iov = {NULL, 0};
int resp_buftype = CIFS_NO_BUFFER;
int rc = 0;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
+ oparms->replay = !!(retries);
cifs_dbg(FYI, "create/open\n");
if (!ses || !server)
@@ -3099,6 +3169,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
trace_smb3_open_enter(xid, tcon->tid, tcon->ses->Suid, oparms->path,
oparms->create_options, oparms->desired_access);
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags,
&rsp_iov);
@@ -3152,6 +3225,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
creat_exit:
SMB2_open_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3276,15 +3354,7 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
int resp_buftype = CIFS_NO_BUFFER;
int rc = 0;
int flags = 0;
-
- cifs_dbg(FYI, "SMB2 IOCTL\n");
-
- if (out_data != NULL)
- *out_data = NULL;
-
- /* zero out returned data len, in case of error */
- if (plen)
- *plen = 0;
+ int retries = 0, cur_sleep = 1;
if (!tcon)
return -EIO;
@@ -3293,10 +3363,23 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if (!ses)
return -EIO;
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
server = cifs_pick_channel(ses);
+
if (!server)
return -EIO;
+ cifs_dbg(FYI, "SMB2 IOCTL\n");
+
+ if (out_data != NULL)
+ *out_data = NULL;
+
+ /* zero out returned data len, in case of error */
+ if (plen)
+ *plen = 0;
+
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
@@ -3311,6 +3394,9 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
if (rc)
goto ioctl_exit;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags,
&rsp_iov);
@@ -3380,6 +3466,11 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
ioctl_exit:
SMB2_ioctl_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3451,13 +3542,20 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
struct smb_rqst rqst;
struct smb2_close_rsp *rsp = NULL;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct kvec iov[1];
struct kvec rsp_iov;
int resp_buftype = CIFS_NO_BUFFER;
int rc = 0;
int flags = 0;
bool query_attrs = false;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ query_attrs = false;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "Close\n");
@@ -3483,6 +3581,9 @@ __SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
goto close_exit;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
rsp = (struct smb2_close_rsp *)rsp_iov.iov_base;
@@ -3516,6 +3617,11 @@ close_exit:
cifs_dbg(VFS, "handle cancelled close fid 0x%llx returned error %d\n",
persistent_fid, tmp_rc);
}
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3646,12 +3752,19 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
struct TCP_Server_Info *server;
int flags = 0;
bool allocated = false;
+ int retries = 0, cur_sleep = 1;
cifs_dbg(FYI, "Query Info\n");
if (!ses)
return -EIO;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ allocated = false;
server = cifs_pick_channel(ses);
+
if (!server)
return -EIO;
@@ -3673,6 +3786,9 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
trace_smb3_query_info_enter(xid, persistent_fid, tcon->tid,
ses->Suid, info_class, (__u32)info_type);
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base;
@@ -3715,6 +3831,11 @@ query_info(const unsigned int xid, struct cifs_tcon *tcon,
qinf_exit:
SMB2_query_info_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3815,7 +3936,7 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
u32 *plen /* returned data len */)
{
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct smb_rqst rqst;
struct smb2_change_notify_rsp *smb_rsp;
struct kvec iov[1];
@@ -3823,6 +3944,12 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
int resp_buftype = CIFS_NO_BUFFER;
int flags = 0;
int rc = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "change notify\n");
if (!ses || !server)
@@ -3847,6 +3974,10 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
trace_smb3_notify_enter(xid, persistent_fid, tcon->tid, ses->Suid,
(u8)watch_tree, completion_filter);
+
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
@@ -3881,6 +4012,11 @@ SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
if (rqst.rq_iov)
cifs_small_buf_release(rqst.rq_iov[0].iov_base); /* request */
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -3918,7 +4054,7 @@ void smb2_reconnect_server(struct work_struct *work)
struct cifs_ses *ses, *ses2;
struct cifs_tcon *tcon, *tcon2;
struct list_head tmp_list, tmp_ses_list;
- bool tcon_exist = false, ses_exist = false;
+ bool ses_exist = false;
bool tcon_selected = false;
int rc;
bool resched = false;
@@ -3964,7 +4100,7 @@ void smb2_reconnect_server(struct work_struct *work)
if (tcon->need_reconnect || tcon->need_reopen_files) {
tcon->tc_count++;
list_add_tail(&tcon->rlist, &tmp_list);
- tcon_selected = tcon_exist = true;
+ tcon_selected = true;
}
}
/*
@@ -3973,7 +4109,7 @@ void smb2_reconnect_server(struct work_struct *work)
*/
if (ses->tcon_ipc && ses->tcon_ipc->need_reconnect) {
list_add_tail(&ses->tcon_ipc->rlist, &tmp_list);
- tcon_selected = tcon_exist = true;
+ tcon_selected = true;
cifs_smb_ses_inc_refcount(ses);
}
/*
@@ -4123,10 +4259,16 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
struct smb_rqst rqst;
struct kvec iov[1];
struct kvec rsp_iov = {NULL, 0};
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int resp_buftype = CIFS_NO_BUFFER;
int flags = 0;
int rc = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "flush\n");
if (!ses || !(ses->server))
@@ -4146,6 +4288,10 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
goto flush_exit;
trace_smb3_flush_enter(xid, persistent_fid, tcon->tid, ses->Suid);
+
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
@@ -4160,6 +4306,11 @@ SMB2_flush(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
flush_exit:
SMB2_flush_free(&rqst);
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -4639,7 +4790,7 @@ smb2_async_writev(struct cifs_writedata *wdata,
struct cifs_io_parms *io_parms = NULL;
int credit_request;
- if (!wdata->server)
+ if (!wdata->server || wdata->replay)
server = wdata->server = cifs_pick_channel(tcon->ses);
/*
@@ -4724,6 +4875,8 @@ smb2_async_writev(struct cifs_writedata *wdata,
rqst.rq_nvec = 1;
rqst.rq_iter = wdata->iter;
rqst.rq_iter_size = iov_iter_count(&rqst.rq_iter);
+ if (wdata->replay)
+ smb2_set_replay(server, &rqst);
#ifdef CONFIG_CIFS_SMB_DIRECT
if (wdata->mr)
iov[0].iov_len += sizeof(struct smbd_buffer_descriptor_v1);
@@ -4797,18 +4950,21 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
int flags = 0;
unsigned int total_len;
struct TCP_Server_Info *server;
+ int retries = 0, cur_sleep = 1;
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
*nbytes = 0;
-
- if (n_vec < 1)
- return rc;
-
if (!io_parms->server)
io_parms->server = cifs_pick_channel(io_parms->tcon->ses);
server = io_parms->server;
if (server == NULL)
return -ECONNABORTED;
+ if (n_vec < 1)
+ return rc;
+
rc = smb2_plain_req_init(SMB2_WRITE, io_parms->tcon, server,
(void **) &req, &total_len);
if (rc)
@@ -4842,6 +4998,9 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
rqst.rq_iov = iov;
rqst.rq_nvec = n_vec + 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, io_parms->tcon->ses, server,
&rqst,
&resp_buftype, flags, &rsp_iov);
@@ -4866,6 +5025,11 @@ SMB2_write(const unsigned int xid, struct cifs_io_parms *io_parms,
cifs_small_buf_release(req);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(io_parms->tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5048,6 +5212,9 @@ int SMB2_query_directory_init(const unsigned int xid,
case SMB_FIND_FILE_POSIX_INFO:
req->FileInformationClass = SMB_FIND_FILE_POSIX_INFO;
break;
+ case SMB_FIND_FILE_FULL_DIRECTORY_INFO:
+ req->FileInformationClass = FILE_FULL_DIRECTORY_INFORMATION;
+ break;
default:
cifs_tcon_dbg(VFS, "info level %u isn't supported\n",
info_level);
@@ -5117,6 +5284,9 @@ smb2_parse_query_directory(struct cifs_tcon *tcon,
/* note that posix payload are variable size */
info_buf_size = sizeof(struct smb2_posix_info);
break;
+ case SMB_FIND_FILE_FULL_DIRECTORY_INFO:
+ info_buf_size = sizeof(FILE_FULL_DIRECTORY_INFO);
+ break;
default:
cifs_tcon_dbg(VFS, "info level %u isn't supported\n",
srch_inf->info_level);
@@ -5177,8 +5347,14 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
struct kvec rsp_iov;
int rc = 0;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
if (!ses || !(ses->server))
return -EIO;
@@ -5198,6 +5374,9 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
if (rc)
goto qdir_exit;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base;
@@ -5232,6 +5411,11 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon,
qdir_exit:
SMB2_query_directory_free(&rqst);
free_rsp_buf(resp_buftype, rsp);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5298,8 +5482,14 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
if (!ses || !server)
return -EIO;
@@ -5327,6 +5517,8 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
+ if (retries)
+ smb2_set_replay(server, &rqst);
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags,
@@ -5342,23 +5534,28 @@ send_set_info(const unsigned int xid, struct cifs_tcon *tcon,
free_rsp_buf(resp_buftype, rsp);
kfree(iov);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
int
SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
- u64 volatile_fid, u32 pid, __le64 *eof)
+ u64 volatile_fid, u32 pid, loff_t new_eof)
{
struct smb2_file_eof_info info;
void *data;
unsigned int size;
- info.EndOfFile = *eof;
+ info.EndOfFile = cpu_to_le64(new_eof);
data = &info;
size = sizeof(struct smb2_file_eof_info);
- trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, le64_to_cpu(*eof));
+ trace_smb3_set_eof(xid, persistent_fid, tcon->tid, tcon->ses->Suid, new_eof);
return send_set_info(xid, tcon, persistent_fid, volatile_fid,
pid, FILE_END_OF_FILE_INFORMATION, SMB2_O_INFO_FILE,
@@ -5394,12 +5591,18 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
int rc;
struct smb2_oplock_break *req = NULL;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
int flags = CIFS_OBREAK_OP;
unsigned int total_len;
struct kvec iov[1];
struct kvec rsp_iov;
int resp_buf_type;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_OBREAK_OP;
+ server = cifs_pick_channel(ses);
cifs_dbg(FYI, "SMB2_oplock_break\n");
rc = smb2_plain_req_init(SMB2_OPLOCK_BREAK, tcon, server,
@@ -5424,15 +5627,21 @@ SMB2_oplock_break(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = iov;
rqst.rq_nvec = 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buf_type, flags, &rsp_iov);
cifs_small_buf_release(req);
-
if (rc) {
cifs_stats_fail_inc(tcon, SMB2_OPLOCK_BREAK_HE);
cifs_dbg(FYI, "Send error in Oplock Break = %d\n", rc);
}
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5518,9 +5727,15 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
FILE_SYSTEM_POSIX_INFO *info = NULL;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
rc = build_qfs_info_req(&iov, tcon, server,
FS_POSIX_INFORMATION,
@@ -5536,6 +5751,9 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = &iov;
rqst.rq_nvec = 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
free_qfs_info_req(&iov);
@@ -5555,6 +5773,11 @@ SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon,
posix_qfsinf_exit:
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5569,9 +5792,15 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
struct smb2_fs_full_size_info *info = NULL;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
rc = build_qfs_info_req(&iov, tcon, server,
FS_FULL_SIZE_INFORMATION,
@@ -5587,6 +5816,9 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = &iov;
rqst.rq_nvec = 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
free_qfs_info_req(&iov);
@@ -5606,6 +5838,11 @@ SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon,
qfsinf_exit:
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5620,9 +5857,15 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
int rc = 0;
int resp_buftype, max_len, min_len;
struct cifs_ses *ses = tcon->ses;
- struct TCP_Server_Info *server = cifs_pick_channel(ses);
+ struct TCP_Server_Info *server;
unsigned int rsp_len, offset;
int flags = 0;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = 0;
+ server = cifs_pick_channel(ses);
if (level == FS_DEVICE_INFORMATION) {
max_len = sizeof(FILE_SYSTEM_DEVICE_INFO);
@@ -5654,6 +5897,9 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = &iov;
rqst.rq_nvec = 1;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, ses, server,
&rqst, &resp_buftype, flags, &rsp_iov);
free_qfs_info_req(&iov);
@@ -5691,6 +5937,11 @@ SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon,
qfsattr_exit:
free_rsp_buf(resp_buftype, rsp_iov.iov_base);
+
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
@@ -5708,7 +5959,13 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
unsigned int count;
int flags = CIFS_NO_RSP_BUF;
unsigned int total_len;
- struct TCP_Server_Info *server = cifs_pick_channel(tcon->ses);
+ struct TCP_Server_Info *server;
+ int retries = 0, cur_sleep = 1;
+
+replay_again:
+ /* reinitialize for possible replay */
+ flags = CIFS_NO_RSP_BUF;
+ server = cifs_pick_channel(tcon->ses);
cifs_dbg(FYI, "smb2_lockv num lock %d\n", num_lock);
@@ -5739,6 +5996,9 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
rqst.rq_iov = iov;
rqst.rq_nvec = 2;
+ if (retries)
+ smb2_set_replay(server, &rqst);
+
rc = cifs_send_recv(xid, tcon->ses, server,
&rqst, &resp_buf_type, flags,
&rsp_iov);
@@ -5750,6 +6010,10 @@ smb2_lockv(const unsigned int xid, struct cifs_tcon *tcon,
tcon->ses->Suid, rc);
}
+ if (is_replayable_error(rc) &&
+ smb2_should_replay(tcon, &retries, &cur_sleep))
+ goto replay_again;
+
return rc;
}
diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h
index 0e371f7e2854..b3069911e9dd 100644
--- a/fs/smb/client/smb2proto.h
+++ b/fs/smb/client/smb2proto.h
@@ -56,6 +56,18 @@ extern int smb3_handle_read_data(struct TCP_Server_Info *server,
extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const char *path,
__u32 *reparse_tag);
+struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
+ struct super_block *sb,
+ const unsigned int xid,
+ struct cifs_tcon *tcon,
+ const char *full_path,
+ struct kvec *iov);
+int smb2_query_reparse_point(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct cifs_sb_info *cifs_sb,
+ const char *full_path,
+ u32 *tag, struct kvec *rsp,
+ int *rsp_buftype);
int smb2_query_path_info(const unsigned int xid,
struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
@@ -80,12 +92,16 @@ extern int smb2_rmdir(const unsigned int xid, struct cifs_tcon *tcon,
const char *name, struct cifs_sb_info *cifs_sb);
extern int smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon,
const char *name, struct cifs_sb_info *cifs_sb);
-extern int smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb);
-extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
- const char *from_name, const char *to_name,
- struct cifs_sb_info *cifs_sb);
+int smb2_rename_path(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
+int smb2_create_hardlink(const unsigned int xid,
+ struct cifs_tcon *tcon,
+ struct dentry *source_dentry,
+ const char *from_name, const char *to_name,
+ struct cifs_sb_info *cifs_sb);
extern int smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb, const unsigned char *path,
char *pbuf, unsigned int *pbytes_written);
@@ -106,6 +122,11 @@ extern unsigned long smb_rqst_len(struct TCP_Server_Info *server,
extern void smb2_set_next_command(struct cifs_tcon *tcon,
struct smb_rqst *rqst);
extern void smb2_set_related(struct smb_rqst *rqst);
+extern void smb2_set_replay(struct TCP_Server_Info *server,
+ struct smb_rqst *rqst);
+extern bool smb2_should_replay(struct cifs_tcon *tcon,
+ int *pretries,
+ int *pcur_sleep);
/*
* SMB2 Worker functions - most of protocol specific implementation details
@@ -205,7 +226,7 @@ extern int SMB2_query_directory_init(unsigned int xid, struct cifs_tcon *tcon,
extern void SMB2_query_directory_free(struct smb_rqst *rqst);
extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 pid,
- __le64 *eof);
+ loff_t new_eof);
extern int SMB2_set_info_init(struct cifs_tcon *tcon,
struct TCP_Server_Info *server,
struct smb_rqst *rqst,
@@ -283,10 +304,9 @@ int smb311_posix_query_path_info(const unsigned int xid,
struct cifs_tcon *tcon,
struct cifs_sb_info *cifs_sb,
const char *full_path,
- struct cifs_open_info_data *data,
- struct cifs_sid *owner,
- struct cifs_sid *group);
+ struct cifs_open_info_data *data);
int posix_info_parse(const void *beg, const void *end,
struct smb2_posix_info_parsed *out);
int posix_info_sid_size(const void *beg, const void *end);
+
#endif /* _SMB2PROTO_H */
diff --git a/fs/smb/client/smb2status.h b/fs/smb/client/smb2status.h
index a9e958166fc5..9c6d79b0bd49 100644
--- a/fs/smb/client/smb2status.h
+++ b/fs/smb/client/smb2status.h
@@ -982,6 +982,8 @@ struct ntstatus {
#define STATUS_INVALID_TASK_INDEX cpu_to_le32(0xC0000501)
#define STATUS_THREAD_ALREADY_IN_TASK cpu_to_le32(0xC0000502)
#define STATUS_CALLBACK_BYPASS cpu_to_le32(0xC0000503)
+#define STATUS_SERVER_UNAVAILABLE cpu_to_le32(0xC0000466)
+#define STATUS_FILE_NOT_AVAILABLE cpu_to_le32(0xC0000467)
#define STATUS_PORT_CLOSED cpu_to_le32(0xC0000700)
#define STATUS_MESSAGE_LOST cpu_to_le32(0xC0000701)
#define STATUS_INVALID_MESSAGE cpu_to_le32(0xC0000702)
diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c
index 94df9eec3d8d..d74e829de51c 100644
--- a/fs/smb/client/smbdirect.c
+++ b/fs/smb/client/smbdirect.c
@@ -2136,7 +2136,7 @@ static int allocate_mr_list(struct smbd_connection *info)
for (i = 0; i < info->responder_resources * 2; i++) {
smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL);
if (!smbdirect_mr)
- goto out;
+ goto cleanup_entries;
smbdirect_mr->mr = ib_alloc_mr(info->pd, info->mr_type,
info->max_frmr_depth);
if (IS_ERR(smbdirect_mr->mr)) {
@@ -2162,7 +2162,7 @@ static int allocate_mr_list(struct smbd_connection *info)
out:
kfree(smbdirect_mr);
-
+cleanup_entries:
list_for_each_entry_safe(smbdirect_mr, tmp, &info->mr_list, list) {
list_del(&smbdirect_mr->list);
ib_dereg_mr(smbdirect_mr->mr);
diff --git a/fs/smb/client/smbencrypt.c b/fs/smb/client/smbencrypt.c
index f0ce26414f17..1d1ee9f18f37 100644
--- a/fs/smb/client/smbencrypt.c
+++ b/fs/smb/client/smbencrypt.c
@@ -26,13 +26,6 @@
#include "cifsproto.h"
#include "../common/md4.h"
-#ifndef false
-#define false 0
-#endif
-#ifndef true
-#define true 1
-#endif
-
/* following came from the other byteorder.h to avoid include conflicts */
#define CVAL(buf,pos) (((unsigned char *)(buf))[pos])
#define SSVALX(buf,pos,val) (CVAL(buf,pos)=(val)&0xFF,CVAL(buf,pos+1)=(val)>>8)
diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h
index de199ec9f726..522fa387fcfd 100644
--- a/fs/smb/client/trace.h
+++ b/fs/smb/client/trace.h
@@ -370,11 +370,12 @@ DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rename_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(rmdir_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_eof_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_info_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(set_reparse_compound_enter);
+DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(get_reparse_compound_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(delete_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(mkdir_enter);
DEFINE_SMB3_INF_COMPOUND_ENTER_EVENT(tdis_enter);
-
DECLARE_EVENT_CLASS(smb3_inf_compound_done_class,
TP_PROTO(unsigned int xid,
__u32 tid,
@@ -408,6 +409,8 @@ DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rename_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(rmdir_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_eof_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_info_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(set_reparse_compound_done);
+DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(get_reparse_compound_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(delete_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(mkdir_done);
DEFINE_SMB3_INF_COMPOUND_DONE_EVENT(tdis_done);
@@ -451,6 +454,8 @@ DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rename_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(rmdir_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_eof_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_info_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(set_reparse_compound_err);
+DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(get_reparse_compound_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(mkdir_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(delete_err);
DEFINE_SMB3_INF_COMPOUND_ERR_EVENT(tdis_err);
diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c
index 4f717ad7c21b..994d70193432 100644
--- a/fs/smb/client/transport.c
+++ b/fs/smb/client/transport.c
@@ -400,10 +400,17 @@ unmask:
server->conn_id, server->hostname);
}
smbd_done:
- if (rc < 0 && rc != -EINTR)
+ /*
+ * there's hardly any use for the layers above to know the
+ * actual error code here. All they should do at this point is
+ * to retry the connection and hope it goes away.
+ */
+ if (rc < 0 && rc != -EINTR && rc != -EAGAIN) {
cifs_server_dbg(VFS, "Error %d sending data on socket to server\n",
rc);
- else if (rc > 0)
+ rc = -ECONNABORTED;
+ cifs_signal_cifsd_for_reconnect(server, false);
+ } else if (rc > 0)
rc = 0;
out:
cifs_in_send_dec(server);
@@ -428,8 +435,8 @@ smb_send_rqst(struct TCP_Server_Info *server, int num_rqst,
if (!(flags & CIFS_TRANSFORM_REQ))
return __smb_send_rqst(server, num_rqst, rqst);
- if (num_rqst > MAX_COMPOUND - 1)
- return -ENOMEM;
+ if (WARN_ON_ONCE(num_rqst > MAX_COMPOUND - 1))
+ return -EIO;
if (!server->ops->init_transform_rq) {
cifs_server_dbg(VFS, "Encryption requested but transform callback is missing\n");
@@ -1026,6 +1033,9 @@ struct TCP_Server_Info *cifs_pick_channel(struct cifs_ses *ses)
if (!server || server->terminate)
continue;
+ if (CIFS_CHAN_NEEDS_RECONNECT(ses, i))
+ continue;
+
/*
* strictly speaking, we should pick up req_lock to read
* server->in_flight. But it shouldn't matter much here if we
diff --git a/fs/smb/server/asn1.c b/fs/smb/server/asn1.c
index 4a4b2b03ff33..b931a99ab9c8 100644
--- a/fs/smb/server/asn1.c
+++ b/fs/smb/server/asn1.c
@@ -214,10 +214,15 @@ static int ksmbd_neg_token_alloc(void *context, size_t hdrlen,
{
struct ksmbd_conn *conn = context;
+ if (!vlen)
+ return -EINVAL;
+
conn->mechToken = kmemdup_nul(value, vlen, GFP_KERNEL);
if (!conn->mechToken)
return -ENOMEM;
+ conn->mechTokenLen = (unsigned int)vlen;
+
return 0;
}
diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c
index 229a6527870d..09b20039636e 100644
--- a/fs/smb/server/auth.c
+++ b/fs/smb/server/auth.c
@@ -208,10 +208,12 @@ out:
/**
* ksmbd_auth_ntlmv2() - NTLMv2 authentication handler
- * @sess: session of connection
+ * @conn: connection
+ * @sess: session of connection
* @ntlmv2: NTLMv2 challenge response
* @blen: NTLMv2 blob length
* @domain_name: domain name
+ * @cryptkey: session crypto key
*
* Return: 0 on success, error number on error
*/
@@ -294,7 +296,8 @@ out:
* ksmbd_decode_ntlmssp_auth_blob() - helper function to construct
* authenticate blob
* @authblob: authenticate blob source pointer
- * @usr: user details
+ * @blob_len: length of the @authblob message
+ * @conn: connection
* @sess: session of connection
*
* Return: 0 on success, error number on error
@@ -376,8 +379,8 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob,
* ksmbd_decode_ntlmssp_neg_blob() - helper function to construct
* negotiate blob
* @negblob: negotiate blob source pointer
- * @rsp: response header pointer to be updated
- * @sess: session of connection
+ * @blob_len: length of the @authblob message
+ * @conn: connection
*
*/
int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob,
@@ -403,8 +406,7 @@ int ksmbd_decode_ntlmssp_neg_blob(struct negotiate_message *negblob,
* ksmbd_build_ntlmssp_challenge_blob() - helper function to construct
* challenge blob
* @chgblob: challenge blob source pointer to initialize
- * @rsp: response header pointer to be updated
- * @sess: session of connection
+ * @conn: connection
*
*/
unsigned int
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index b6fa1e285c40..09e1e7771592 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -284,6 +284,7 @@ int ksmbd_conn_handler_loop(void *p)
goto out;
conn->last_active = jiffies;
+ set_freezable();
while (ksmbd_conn_alive(conn)) {
if (try_to_freeze())
continue;
@@ -415,13 +416,7 @@ static void stop_sessions(void)
again:
down_read(&conn_list_lock);
list_for_each_entry(conn, &conn_list, conns_list) {
- struct task_struct *task;
-
t = conn->transport;
- task = t->handler;
- if (task)
- ksmbd_debug(CONN, "Stop session handler %s/%d\n",
- task->comm, task_pid_nr(task));
ksmbd_conn_set_exiting(conn);
if (t->ops->shutdown) {
up_read(&conn_list_lock);
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index 3c005246a32e..0e04cf8b1d89 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -88,6 +88,7 @@ struct ksmbd_conn {
__u16 dialect;
char *mechToken;
+ unsigned int mechTokenLen;
struct ksmbd_conn_ops *conn_ops;
@@ -134,7 +135,6 @@ struct ksmbd_transport_ops {
struct ksmbd_transport {
struct ksmbd_conn *conn;
struct ksmbd_transport_ops *ops;
- struct task_struct *handler;
};
#define KSMBD_TCP_RECV_TIMEOUT (7 * HZ)
diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h
index b7521e41402e..0ebf91ffa236 100644
--- a/fs/smb/server/ksmbd_netlink.h
+++ b/fs/smb/server/ksmbd_netlink.h
@@ -304,7 +304,8 @@ enum ksmbd_event {
KSMBD_EVENT_SPNEGO_AUTHEN_REQUEST,
KSMBD_EVENT_SPNEGO_AUTHEN_RESPONSE = 15,
- KSMBD_EVENT_MAX
+ __KSMBD_EVENT_MAX,
+ KSMBD_EVENT_MAX = __KSMBD_EVENT_MAX - 1
};
/*
diff --git a/fs/smb/server/mgmt/ksmbd_ida.c b/fs/smb/server/mgmt/ksmbd_ida.c
index 54194d959a5e..a18e27e9e0cd 100644
--- a/fs/smb/server/mgmt/ksmbd_ida.c
+++ b/fs/smb/server/mgmt/ksmbd_ida.c
@@ -5,42 +5,33 @@
#include "ksmbd_ida.h"
-static inline int __acquire_id(struct ida *ida, int from, int to)
-{
- return ida_simple_get(ida, from, to, GFP_KERNEL);
-}
-
int ksmbd_acquire_smb2_tid(struct ida *ida)
{
- int id;
-
- id = __acquire_id(ida, 1, 0xFFFFFFFF);
-
- return id;
+ return ida_alloc_range(ida, 1, 0xFFFFFFFE, GFP_KERNEL);
}
int ksmbd_acquire_smb2_uid(struct ida *ida)
{
int id;
- id = __acquire_id(ida, 1, 0);
+ id = ida_alloc_min(ida, 1, GFP_KERNEL);
if (id == 0xFFFE)
- id = __acquire_id(ida, 1, 0);
+ id = ida_alloc_min(ida, 1, GFP_KERNEL);
return id;
}
int ksmbd_acquire_async_msg_id(struct ida *ida)
{
- return __acquire_id(ida, 1, 0);
+ return ida_alloc_min(ida, 1, GFP_KERNEL);
}
int ksmbd_acquire_id(struct ida *ida)
{
- return __acquire_id(ida, 0, 0);
+ return ida_alloc(ida, GFP_KERNEL);
}
void ksmbd_release_id(struct ida *ida, int id)
{
- ida_simple_remove(ida, id);
+ ida_free(ida, id);
}
diff --git a/fs/smb/server/misc.c b/fs/smb/server/misc.c
index 9e8afaa686e3..1a5faa6f6e7b 100644
--- a/fs/smb/server/misc.c
+++ b/fs/smb/server/misc.c
@@ -261,6 +261,7 @@ out_ascii:
/**
* ksmbd_extract_sharename() - get share name from tree connect request
+ * @um: pointer to a unicode_map structure for character encoding handling
* @treename: buffer containing tree name and share name
*
* Return: share name on success, otherwise error
diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c
index 562b180459a1..53dfaac425c6 100644
--- a/fs/smb/server/oplock.c
+++ b/fs/smb/server/oplock.c
@@ -105,7 +105,7 @@ static int alloc_lease(struct oplock_info *opinfo, struct lease_ctx_info *lctx)
lease->is_dir = lctx->is_dir;
memcpy(lease->parent_lease_key, lctx->parent_lease_key, SMB2_LEASE_KEY_SIZE);
lease->version = lctx->version;
- lease->epoch = le16_to_cpu(lctx->epoch);
+ lease->epoch = le16_to_cpu(lctx->epoch) + 1;
INIT_LIST_HEAD(&opinfo->lease_entry);
opinfo->o_lease = lease;
@@ -546,6 +546,7 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci,
atomic_read(&ci->sop_count)) == 1) {
if (lease->state != SMB2_LEASE_NONE_LE &&
lease->state == (lctx->req_state & lease->state)) {
+ lease->epoch++;
lease->state |= lctx->req_state;
if (lctx->req_state &
SMB2_LEASE_WRITE_CACHING_LE)
@@ -556,13 +557,17 @@ static struct oplock_info *same_client_has_lease(struct ksmbd_inode *ci,
atomic_read(&ci->sop_count)) > 1) {
if (lctx->req_state ==
(SMB2_LEASE_READ_CACHING_LE |
- SMB2_LEASE_HANDLE_CACHING_LE))
+ SMB2_LEASE_HANDLE_CACHING_LE)) {
+ lease->epoch++;
lease->state = lctx->req_state;
+ }
}
if (lctx->req_state && lease->state ==
- SMB2_LEASE_NONE_LE)
+ SMB2_LEASE_NONE_LE) {
+ lease->epoch++;
lease_none_upgrade(opinfo, lctx->req_state);
+ }
}
read_lock(&ci->m_lock);
}
@@ -1035,7 +1040,8 @@ static void copy_lease(struct oplock_info *op1, struct oplock_info *op2)
SMB2_LEASE_KEY_SIZE);
lease2->duration = lease1->duration;
lease2->flags = lease1->flags;
- lease2->epoch = lease1->epoch++;
+ lease2->epoch = lease1->epoch;
+ lease2->version = lease1->version;
}
static int add_lease_global_list(struct oplock_info *opinfo)
@@ -1191,6 +1197,12 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid,
bool prev_op_has_lease;
__le32 prev_op_state = 0;
+ /* Only v2 leases handle the directory */
+ if (S_ISDIR(file_inode(fp->filp)->i_mode)) {
+ if (!lctx || lctx->version != 2)
+ return 0;
+ }
+
opinfo = alloc_opinfo(work, pid, tid);
if (!opinfo)
return -ENOMEM;
@@ -1447,7 +1459,7 @@ void create_lease_buf(u8 *rbuf, struct lease *lease)
memcpy(buf->lcontext.LeaseKey, lease->lease_key,
SMB2_LEASE_KEY_SIZE);
buf->lcontext.LeaseFlags = lease->flags;
- buf->lcontext.Epoch = cpu_to_le16(++lease->epoch);
+ buf->lcontext.Epoch = cpu_to_le16(lease->epoch);
buf->lcontext.LeaseState = lease->state;
memcpy(buf->lcontext.ParentLeaseKey, lease->parent_lease_key,
SMB2_LEASE_KEY_SIZE);
diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c
index 652ab429bf2e..0c97d3c86072 100644
--- a/fs/smb/server/smb2pdu.c
+++ b/fs/smb/server/smb2pdu.c
@@ -1414,7 +1414,10 @@ static struct ksmbd_user *session_user(struct ksmbd_conn *conn,
char *name;
unsigned int name_off, name_len, secbuf_len;
- secbuf_len = le16_to_cpu(req->SecurityBufferLength);
+ if (conn->use_spnego && conn->mechToken)
+ secbuf_len = conn->mechTokenLen;
+ else
+ secbuf_len = le16_to_cpu(req->SecurityBufferLength);
if (secbuf_len < sizeof(struct authenticate_message)) {
ksmbd_debug(SMB, "blob len %d too small\n", secbuf_len);
return NULL;
@@ -1505,7 +1508,10 @@ static int ntlm_authenticate(struct ksmbd_work *work,
struct authenticate_message *authblob;
authblob = user_authblob(conn, req);
- sz = le16_to_cpu(req->SecurityBufferLength);
+ if (conn->use_spnego && conn->mechToken)
+ sz = conn->mechTokenLen;
+ else
+ sz = le16_to_cpu(req->SecurityBufferLength);
rc = ksmbd_decode_ntlmssp_auth_blob(authblob, sz, conn, sess);
if (rc) {
set_user_flag(sess->user, KSMBD_USER_FLAG_BAD_PASSWORD);
@@ -1778,8 +1784,7 @@ int smb2_sess_setup(struct ksmbd_work *work)
negblob_off = le16_to_cpu(req->SecurityBufferOffset);
negblob_len = le16_to_cpu(req->SecurityBufferLength);
- if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer) ||
- negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) {
+ if (negblob_off < offsetof(struct smb2_sess_setup_req, Buffer)) {
rc = -EINVAL;
goto out_err;
}
@@ -1788,8 +1793,15 @@ int smb2_sess_setup(struct ksmbd_work *work)
negblob_off);
if (decode_negotiation_token(conn, negblob, negblob_len) == 0) {
- if (conn->mechToken)
+ if (conn->mechToken) {
negblob = (struct negotiate_message *)conn->mechToken;
+ negblob_len = conn->mechTokenLen;
+ }
+ }
+
+ if (negblob_len < offsetof(struct negotiate_message, NegotiateFlags)) {
+ rc = -EINVAL;
+ goto out_err;
}
if (server_conf.auth_mechs & conn->auth_mechs) {
@@ -2311,11 +2323,12 @@ out:
* @eabuf: set info command buffer
* @buf_len: set info command buffer length
* @path: dentry path for get ea
+ * @get_write: get write access to a mount
*
* Return: 0 on success, otherwise error
*/
static int smb2_set_ea(struct smb2_ea_info *eabuf, unsigned int buf_len,
- const struct path *path)
+ const struct path *path, bool get_write)
{
struct mnt_idmap *idmap = mnt_idmap(path->mnt);
char *attr_name = NULL, *value;
@@ -2971,7 +2984,7 @@ int smb2_open(struct ksmbd_work *work)
&may_flags);
if (!test_tree_conn_flag(tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
- if (open_flags & O_CREAT) {
+ if (open_flags & (O_CREAT | O_TRUNC)) {
ksmbd_debug(SMB,
"User does not have write permission\n");
rc = -EACCES;
@@ -3003,7 +3016,7 @@ int smb2_open(struct ksmbd_work *work)
rc = smb2_set_ea(&ea_buf->ea,
le32_to_cpu(ea_buf->ccontext.DataLength),
- &path);
+ &path, false);
if (rc == -EOPNOTSUPP)
rc = 0;
else if (rc)
@@ -5568,6 +5581,7 @@ static int smb2_rename(struct ksmbd_work *work,
if (!file_info->ReplaceIfExists)
flags = RENAME_NOREPLACE;
+ smb_break_all_levII_oplock(work, fp, 0);
rc = ksmbd_vfs_rename(work, &fp->filp->f_path, new_name, flags);
out:
kfree(new_name);
@@ -5943,12 +5957,6 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
}
case FILE_RENAME_INFORMATION:
{
- if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
- ksmbd_debug(SMB,
- "User does not have write permission\n");
- return -EACCES;
- }
-
if (buf_len < sizeof(struct smb2_file_rename_info))
return -EINVAL;
@@ -5968,12 +5976,6 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
}
case FILE_DISPOSITION_INFORMATION:
{
- if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
- ksmbd_debug(SMB,
- "User does not have write permission\n");
- return -EACCES;
- }
-
if (buf_len < sizeof(struct smb2_file_disposition_info))
return -EINVAL;
@@ -5992,7 +5994,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp,
return -EINVAL;
return smb2_set_ea((struct smb2_ea_info *)req->Buffer,
- buf_len, &fp->filp->f_path);
+ buf_len, &fp->filp->f_path, true);
}
case FILE_POSITION_INFORMATION:
{
@@ -6035,7 +6037,7 @@ int smb2_set_info(struct ksmbd_work *work)
{
struct smb2_set_info_req *req;
struct smb2_set_info_rsp *rsp;
- struct ksmbd_file *fp;
+ struct ksmbd_file *fp = NULL;
int rc = 0;
unsigned int id = KSMBD_NO_FID, pid = KSMBD_NO_FID;
@@ -6055,6 +6057,13 @@ int smb2_set_info(struct ksmbd_work *work)
rsp = smb2_get_msg(work->response_buf);
}
+ if (!test_tree_conn_flag(work->tcon, KSMBD_TREE_CONN_FLAG_WRITABLE)) {
+ ksmbd_debug(SMB, "User does not have write permission\n");
+ pr_err("User does not have write permission\n");
+ rc = -EACCES;
+ goto err_out;
+ }
+
if (!has_file_id(id)) {
id = req->VolatileFileId;
pid = req->PersistentFileId;
@@ -6164,8 +6173,10 @@ static noinline int smb2_read_pipe(struct ksmbd_work *work)
err = ksmbd_iov_pin_rsp_read(work, (void *)rsp,
offsetof(struct smb2_read_rsp, Buffer),
aux_payload_buf, nbytes);
- if (err)
+ if (err) {
+ kvfree(aux_payload_buf);
goto out;
+ }
kvfree(rpc_resp);
} else {
err = ksmbd_iov_pin_rsp(work, (void *)rsp,
@@ -6375,8 +6386,10 @@ int smb2_read(struct ksmbd_work *work)
err = ksmbd_iov_pin_rsp_read(work, (void *)rsp,
offsetof(struct smb2_read_rsp, Buffer),
aux_payload_buf, nbytes);
- if (err)
+ if (err) {
+ kvfree(aux_payload_buf);
goto out;
+ }
ksmbd_fd_put(work, fp);
return 0;
diff --git a/fs/smb/server/smb_common.c b/fs/smb/server/smb_common.c
index 6691ae68af0c..7c98bf699772 100644
--- a/fs/smb/server/smb_common.c
+++ b/fs/smb/server/smb_common.c
@@ -158,8 +158,12 @@ int ksmbd_verify_smb_message(struct ksmbd_work *work)
*/
bool ksmbd_smb_request(struct ksmbd_conn *conn)
{
- __le32 *proto = (__le32 *)smb2_get_msg(conn->request_buf);
+ __le32 *proto;
+ if (conn->request_buf[0] != 0)
+ return false;
+
+ proto = (__le32 *)smb2_get_msg(conn->request_buf);
if (*proto == SMB2_COMPRESSION_TRANSFORM_ID) {
pr_err_ratelimited("smb2 compression not support yet");
return false;
diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c
index 1164365533f0..1c9775f1efa5 100644
--- a/fs/smb/server/smbacl.c
+++ b/fs/smb/server/smbacl.c
@@ -401,10 +401,6 @@ static void parse_dacl(struct mnt_idmap *idmap,
if (num_aces > ULONG_MAX / sizeof(struct smb_ace *))
return;
- ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL);
- if (!ppace)
- return;
-
ret = init_acl_state(&acl_state, num_aces);
if (ret)
return;
@@ -414,6 +410,13 @@ static void parse_dacl(struct mnt_idmap *idmap,
return;
}
+ ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL);
+ if (!ppace) {
+ free_acl_state(&default_acl_state);
+ free_acl_state(&acl_state);
+ return;
+ }
+
/*
* reset rwx permissions for user/group/other.
* Also, if num_aces is 0 i.e. DACL has no ACEs,
diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c
index b49d47bdafc9..f29bb03f0dc4 100644
--- a/fs/smb/server/transport_ipc.c
+++ b/fs/smb/server/transport_ipc.c
@@ -74,7 +74,7 @@ static int handle_unsupported_event(struct sk_buff *skb, struct genl_info *info)
static int handle_generic_event(struct sk_buff *skb, struct genl_info *info);
static int ksmbd_ipc_heartbeat_request(void);
-static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX] = {
+static const struct nla_policy ksmbd_nl_policy[KSMBD_EVENT_MAX + 1] = {
[KSMBD_EVENT_UNSPEC] = {
.len = 0,
},
@@ -403,7 +403,7 @@ static int handle_generic_event(struct sk_buff *skb, struct genl_info *info)
return -EPERM;
#endif
- if (type >= KSMBD_EVENT_MAX) {
+ if (type > KSMBD_EVENT_MAX) {
WARN_ON(1);
return -EINVAL;
}
diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c
index c5629a68c8b7..8faa25c6e129 100644
--- a/fs/smb/server/transport_rdma.c
+++ b/fs/smb/server/transport_rdma.c
@@ -2039,6 +2039,7 @@ static bool rdma_frwr_is_supported(struct ib_device_attr *attrs)
static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
{
struct smb_direct_transport *t;
+ struct task_struct *handler;
int ret;
if (!rdma_frwr_is_supported(&new_cm_id->device->attrs)) {
@@ -2056,11 +2057,11 @@ static int smb_direct_handle_connect_request(struct rdma_cm_id *new_cm_id)
if (ret)
goto out_err;
- KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop,
- KSMBD_TRANS(t)->conn, "ksmbd:r%u",
- smb_direct_port);
- if (IS_ERR(KSMBD_TRANS(t)->handler)) {
- ret = PTR_ERR(KSMBD_TRANS(t)->handler);
+ handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn, "ksmbd:r%u",
+ smb_direct_port);
+ if (IS_ERR(handler)) {
+ ret = PTR_ERR(handler);
pr_err("Can't start thread\n");
goto out_err;
}
diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c
index eff7a1d793f0..002a3f0dc7c5 100644
--- a/fs/smb/server/transport_tcp.c
+++ b/fs/smb/server/transport_tcp.c
@@ -185,6 +185,7 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk)
struct sockaddr *csin;
int rc = 0;
struct tcp_transport *t;
+ struct task_struct *handler;
t = alloc_transport(client_sk);
if (!t) {
@@ -199,13 +200,13 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk)
goto out_error;
}
- KSMBD_TRANS(t)->handler = kthread_run(ksmbd_conn_handler_loop,
- KSMBD_TRANS(t)->conn,
- "ksmbd:%u",
- ksmbd_tcp_get_port(csin));
- if (IS_ERR(KSMBD_TRANS(t)->handler)) {
+ handler = kthread_run(ksmbd_conn_handler_loop,
+ KSMBD_TRANS(t)->conn,
+ "ksmbd:%u",
+ ksmbd_tcp_get_port(csin));
+ if (IS_ERR(handler)) {
pr_err("cannot start conn thread\n");
- rc = PTR_ERR(KSMBD_TRANS(t)->handler);
+ rc = PTR_ERR(handler);
free_transport(t);
}
return rc;
@@ -364,6 +365,7 @@ static int ksmbd_tcp_readv(struct tcp_transport *t, struct kvec *iov_orig,
* @t: TCP transport instance
* @buf: buffer to store read data from socket
* @to_read: number of bytes to read from socket
+ * @max_retries: number of retries if reading from socket fails
*
* Return: on success return number of bytes read from socket,
* otherwise return error number
@@ -415,6 +417,7 @@ static void tcp_destroy_socket(struct socket *ksmbd_socket)
/**
* create_socket - create socket for ksmbd/0
+ * @iface: interface to bind the created socket to
*
* Return: 0 on success, error number otherwise
*/
diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c
index 4277750a6da1..a6961bfe3e13 100644
--- a/fs/smb/server/vfs.c
+++ b/fs/smb/server/vfs.c
@@ -49,6 +49,10 @@ static void ksmbd_vfs_inherit_owner(struct ksmbd_work *work,
/**
* ksmbd_vfs_lock_parent() - lock parent dentry if it is stable
+ * @parent: parent dentry
+ * @child: child dentry
+ *
+ * Returns: %0 on success, %-ENOENT if the parent dentry is not stable
*/
int ksmbd_vfs_lock_parent(struct dentry *parent, struct dentry *child)
{
@@ -360,7 +364,7 @@ out:
/**
* ksmbd_vfs_read() - vfs helper for smb file read
* @work: smb work
- * @fid: file id of open file
+ * @fp: ksmbd file pointer
* @count: read byte count
* @pos: file pos
* @rbuf: read data buffer
@@ -474,7 +478,7 @@ out:
/**
* ksmbd_vfs_write() - vfs helper for smb file write
* @work: work
- * @fid: file id of open file
+ * @fp: ksmbd file pointer
* @buf: buf containing data for writing
* @count: read byte count
* @pos: file pos
@@ -545,10 +549,8 @@ out:
/**
* ksmbd_vfs_getattr() - vfs helper for smb getattr
- * @work: work
- * @fid: file id of open file
- * @attrs: inode attributes
- *
+ * @path: path of dentry
+ * @stat: pointer to returned kernel stat structure
* Return: 0 on success, otherwise error
*/
int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat)
@@ -565,6 +567,7 @@ int ksmbd_vfs_getattr(const struct path *path, struct kstat *stat)
* ksmbd_vfs_fsync() - vfs helper for smb fsync
* @work: work
* @fid: file id of open file
+ * @p_id: persistent file id
*
* Return: 0 on success, otherwise error
*/
@@ -587,7 +590,8 @@ int ksmbd_vfs_fsync(struct ksmbd_work *work, u64 fid, u64 p_id)
/**
* ksmbd_vfs_remove_file() - vfs helper for smb rmdir or unlink
- * @name: directory or file name that is relative to share
+ * @work: work
+ * @path: path of dentry
*
* Return: 0 on success, otherwise error
*/
@@ -623,6 +627,7 @@ out_err:
/**
* ksmbd_vfs_link() - vfs helper for creating smb hardlink
+ * @work: work
* @oldname: source file name
* @newname: hardlink name that is relative to share
*
@@ -715,6 +720,10 @@ retry:
goto out2;
trap = lock_rename_child(old_child, new_path.dentry);
+ if (IS_ERR(trap)) {
+ err = PTR_ERR(trap);
+ goto out_drop_write;
+ }
old_parent = dget(old_child->d_parent);
if (d_unhashed(old_child)) {
@@ -777,6 +786,7 @@ out4:
out3:
dput(old_parent);
unlock_rename(old_parent, new_path.dentry);
+out_drop_write:
mnt_drop_write(old_path->mnt);
out2:
path_put(&new_path);
@@ -795,7 +805,7 @@ revert_fsids:
/**
* ksmbd_vfs_truncate() - vfs helper for smb file truncate
* @work: work
- * @fid: file id of old file
+ * @fp: ksmbd file pointer
* @size: truncate to given size
*
* Return: 0 on success, otherwise error
@@ -838,7 +848,6 @@ int ksmbd_vfs_truncate(struct ksmbd_work *work,
* ksmbd_vfs_listxattr() - vfs helper for smb list extended attributes
* @dentry: dentry of file for listing xattrs
* @list: destination buffer
- * @size: destination buffer length
*
* Return: xattr list length on success, otherwise error
*/
@@ -947,7 +956,7 @@ int ksmbd_vfs_setxattr(struct mnt_idmap *idmap,
/**
* ksmbd_vfs_set_fadvise() - convert smb IO caching options to linux options
* @filp: file pointer for IO
- * @options: smb IO options
+ * @option: smb IO options
*/
void ksmbd_vfs_set_fadvise(struct file *filp, __le32 option)
{
@@ -1159,6 +1168,7 @@ static bool __caseless_lookup(struct dir_context *ctx, const char *name,
* @dir: path info
* @name: filename to lookup
* @namelen: filename length
+ * @um: &struct unicode_map to use
*
* Return: 0 on success, otherwise error
*/
@@ -1189,6 +1199,7 @@ static int ksmbd_vfs_lookup_in_dir(const struct path *dir, char *name,
/**
* ksmbd_vfs_kern_path_locked() - lookup a file and get path info
+ * @work: work
* @name: file path that is relative to share
* @flags: lookup flags
* @parent_path: if lookup succeed, return parent_path info
@@ -1636,6 +1647,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap,
* ksmbd_vfs_init_kstat() - convert unix stat information to smb stat format
* @p: destination buffer
* @ksmbd_kstat: ksmbd kstat wrapper
+ *
+ * Returns: pointer to the converted &struct file_directory_info
*/
void *ksmbd_vfs_init_kstat(char **p, struct ksmbd_kstat *ksmbd_kstat)
{
diff --git a/fs/splice.c b/fs/splice.c
index d983d375ff11..218e24b1ac40 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -201,7 +201,8 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
unsigned int tail = pipe->tail;
unsigned int head = pipe->head;
unsigned int mask = pipe->ring_size - 1;
- int ret = 0, page_nr = 0;
+ ssize_t ret = 0;
+ int page_nr = 0;
if (!spd_pages)
return 0;
@@ -673,10 +674,13 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
.u.file = out,
};
int nbufs = pipe->max_usage;
- struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
- GFP_KERNEL);
+ struct bio_vec *array;
ssize_t ret;
+ if (!out->f_op->write_iter)
+ return -EINVAL;
+
+ array = kcalloc(nbufs, sizeof(struct bio_vec), GFP_KERNEL);
if (unlikely(!array))
return -ENOMEM;
@@ -684,6 +688,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
splice_from_pipe_begin(&sd);
while (sd.total_len) {
+ struct kiocb kiocb;
struct iov_iter from;
unsigned int head, tail, mask;
size_t left;
@@ -733,7 +738,10 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
}
iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left);
- ret = vfs_iter_write(out, &from, &sd.pos, 0);
+ init_sync_kiocb(&kiocb, out);
+ kiocb.ki_pos = sd.pos;
+ ret = call_write_iter(out, &kiocb, &from);
+ sd.pos = kiocb.ki_pos;
if (ret <= 0)
break;
@@ -925,8 +933,8 @@ static int warn_unsupported(struct file *file, const char *op)
/*
* Attempt to initiate a splice from pipe to file.
*/
-static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
- loff_t *ppos, size_t len, unsigned int flags)
+static ssize_t do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+ loff_t *ppos, size_t len, unsigned int flags)
{
if (unlikely(!out->f_op->splice_write))
return warn_unsupported(out, "write");
@@ -944,27 +952,15 @@ static void do_splice_eof(struct splice_desc *sd)
sd->splice_eof(sd);
}
-/**
- * vfs_splice_read - Read data from a file and splice it into a pipe
- * @in: File to splice from
- * @ppos: Input file offset
- * @pipe: Pipe to splice to
- * @len: Number of bytes to splice
- * @flags: Splice modifier flags (SPLICE_F_*)
- *
- * Splice the requested amount of data from the input file to the pipe. This
- * is synchronous as the caller must hold the pipe lock across the entire
- * operation.
- *
- * If successful, it returns the amount of data spliced, 0 if it hit the EOF or
- * a hole and a negative error code otherwise.
+/*
+ * Callers already called rw_verify_area() on the entire range.
+ * No need to call it for sub ranges.
*/
-long vfs_splice_read(struct file *in, loff_t *ppos,
- struct pipe_inode_info *pipe, size_t len,
- unsigned int flags)
+static ssize_t do_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
{
unsigned int p_space;
- int ret;
if (unlikely(!(in->f_mode & FMODE_READ)))
return -EBADF;
@@ -975,10 +971,6 @@ long vfs_splice_read(struct file *in, loff_t *ppos,
p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail);
len = min_t(size_t, len, p_space << PAGE_SHIFT);
- ret = rw_verify_area(READ, in, ppos, len);
- if (unlikely(ret < 0))
- return ret;
-
if (unlikely(len > MAX_RW_COUNT))
len = MAX_RW_COUNT;
@@ -992,6 +984,34 @@ long vfs_splice_read(struct file *in, loff_t *ppos,
return copy_splice_read(in, ppos, pipe, len, flags);
return in->f_op->splice_read(in, ppos, pipe, len, flags);
}
+
+/**
+ * vfs_splice_read - Read data from a file and splice it into a pipe
+ * @in: File to splice from
+ * @ppos: Input file offset
+ * @pipe: Pipe to splice to
+ * @len: Number of bytes to splice
+ * @flags: Splice modifier flags (SPLICE_F_*)
+ *
+ * Splice the requested amount of data from the input file to the pipe. This
+ * is synchronous as the caller must hold the pipe lock across the entire
+ * operation.
+ *
+ * If successful, it returns the amount of data spliced, 0 if it hit the EOF or
+ * a hole and a negative error code otherwise.
+ */
+ssize_t vfs_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ ssize_t ret;
+
+ ret = rw_verify_area(READ, in, ppos, len);
+ if (unlikely(ret < 0))
+ return ret;
+
+ return do_splice_read(in, ppos, pipe, len, flags);
+}
EXPORT_SYMBOL_GPL(vfs_splice_read);
/**
@@ -1011,7 +1031,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
splice_direct_actor *actor)
{
struct pipe_inode_info *pipe;
- long ret, bytes;
+ ssize_t ret, bytes;
size_t len;
int i, flags, more;
@@ -1066,7 +1086,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
size_t read_len;
loff_t pos = sd->pos, prev_pos = pos;
- ret = vfs_splice_read(in, &pos, pipe, len, flags);
+ ret = do_splice_read(in, &pos, pipe, len, flags);
if (unlikely(ret <= 0))
goto read_failure;
@@ -1138,9 +1158,20 @@ static int direct_splice_actor(struct pipe_inode_info *pipe,
struct splice_desc *sd)
{
struct file *file = sd->u.file;
+ long ret;
+
+ file_start_write(file);
+ ret = do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
+ file_end_write(file);
+ return ret;
+}
+
+static int splice_file_range_actor(struct pipe_inode_info *pipe,
+ struct splice_desc *sd)
+{
+ struct file *file = sd->u.file;
- return do_splice_from(pipe, file, sd->opos, sd->total_len,
- sd->flags);
+ return do_splice_from(pipe, file, sd->opos, sd->total_len, sd->flags);
}
static void direct_file_splice_eof(struct splice_desc *sd)
@@ -1151,24 +1182,10 @@ static void direct_file_splice_eof(struct splice_desc *sd)
file->f_op->splice_eof(file);
}
-/**
- * do_splice_direct - splices data directly between two files
- * @in: file to splice from
- * @ppos: input file offset
- * @out: file to splice to
- * @opos: output file offset
- * @len: number of bytes to splice
- * @flags: splice modifier flags
- *
- * Description:
- * For use by do_sendfile(). splice can easily emulate sendfile, but
- * doing it in the application would incur an extra system call
- * (splice in + splice out, as compared to just sendfile()). So this helper
- * can splice directly through a process-private pipe.
- *
- */
-long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
- loff_t *opos, size_t len, unsigned int flags)
+static ssize_t do_splice_direct_actor(struct file *in, loff_t *ppos,
+ struct file *out, loff_t *opos,
+ size_t len, unsigned int flags,
+ splice_direct_actor *actor)
{
struct splice_desc sd = {
.len = len,
@@ -1179,7 +1196,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
.splice_eof = direct_file_splice_eof,
.opos = opos,
};
- long ret;
+ ssize_t ret;
if (unlikely(!(out->f_mode & FMODE_WRITE)))
return -EBADF;
@@ -1187,18 +1204,63 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
if (unlikely(out->f_flags & O_APPEND))
return -EINVAL;
- ret = rw_verify_area(WRITE, out, opos, len);
- if (unlikely(ret < 0))
- return ret;
-
- ret = splice_direct_to_actor(in, &sd, direct_splice_actor);
+ ret = splice_direct_to_actor(in, &sd, actor);
if (ret > 0)
*ppos = sd.pos;
return ret;
}
+/**
+ * do_splice_direct - splices data directly between two files
+ * @in: file to splice from
+ * @ppos: input file offset
+ * @out: file to splice to
+ * @opos: output file offset
+ * @len: number of bytes to splice
+ * @flags: splice modifier flags
+ *
+ * Description:
+ * For use by do_sendfile(). splice can easily emulate sendfile, but
+ * doing it in the application would incur an extra system call
+ * (splice in + splice out, as compared to just sendfile()). So this helper
+ * can splice directly through a process-private pipe.
+ *
+ * Callers already called rw_verify_area() on the entire range.
+ */
+ssize_t do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
+ loff_t *opos, size_t len, unsigned int flags)
+{
+ return do_splice_direct_actor(in, ppos, out, opos, len, flags,
+ direct_splice_actor);
+}
EXPORT_SYMBOL(do_splice_direct);
+/**
+ * splice_file_range - splices data between two files for copy_file_range()
+ * @in: file to splice from
+ * @ppos: input file offset
+ * @out: file to splice to
+ * @opos: output file offset
+ * @len: number of bytes to splice
+ *
+ * Description:
+ * For use by ->copy_file_range() methods.
+ * Like do_splice_direct(), but vfs_copy_file_range() already holds
+ * start_file_write() on @out file.
+ *
+ * Callers already called rw_verify_area() on the entire range.
+ */
+ssize_t splice_file_range(struct file *in, loff_t *ppos, struct file *out,
+ loff_t *opos, size_t len)
+{
+ lockdep_assert(file_write_started(out));
+
+ return do_splice_direct_actor(in, ppos, out, opos,
+ min_t(size_t, len, MAX_RW_COUNT),
+ 0, splice_file_range_actor);
+}
+EXPORT_SYMBOL(splice_file_range);
+
static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags)
{
for (;;) {
@@ -1220,17 +1282,17 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
struct pipe_inode_info *opipe,
size_t len, unsigned int flags);
-long splice_file_to_pipe(struct file *in,
- struct pipe_inode_info *opipe,
- loff_t *offset,
- size_t len, unsigned int flags)
+ssize_t splice_file_to_pipe(struct file *in,
+ struct pipe_inode_info *opipe,
+ loff_t *offset,
+ size_t len, unsigned int flags)
{
- long ret;
+ ssize_t ret;
pipe_lock(opipe);
ret = wait_for_space(opipe, flags);
if (!ret)
- ret = vfs_splice_read(in, offset, opipe, len, flags);
+ ret = do_splice_read(in, offset, opipe, len, flags);
pipe_unlock(opipe);
if (ret > 0)
wakeup_pipe_readers(opipe);
@@ -1240,13 +1302,13 @@ long splice_file_to_pipe(struct file *in,
/*
* Determine where to splice to/from.
*/
-long do_splice(struct file *in, loff_t *off_in, struct file *out,
- loff_t *off_out, size_t len, unsigned int flags)
+ssize_t do_splice(struct file *in, loff_t *off_in, struct file *out,
+ loff_t *off_out, size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
loff_t offset;
- long ret;
+ ssize_t ret;
if (unlikely(!(in->f_mode & FMODE_READ) ||
!(out->f_mode & FMODE_WRITE)))
@@ -1307,6 +1369,10 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out,
offset = in->f_pos;
}
+ ret = rw_verify_area(READ, in, &offset, len);
+ if (unlikely(ret < 0))
+ return ret;
+
if (out->f_flags & O_NONBLOCK)
flags |= SPLICE_F_NONBLOCK;
@@ -1333,14 +1399,14 @@ long do_splice(struct file *in, loff_t *off_in, struct file *out,
return ret;
}
-static long __do_splice(struct file *in, loff_t __user *off_in,
- struct file *out, loff_t __user *off_out,
- size_t len, unsigned int flags)
+static ssize_t __do_splice(struct file *in, loff_t __user *off_in,
+ struct file *out, loff_t __user *off_out,
+ size_t len, unsigned int flags)
{
struct pipe_inode_info *ipipe;
struct pipe_inode_info *opipe;
loff_t offset, *__off_in = NULL, *__off_out = NULL;
- long ret;
+ ssize_t ret;
ipipe = get_pipe_info(in, true);
opipe = get_pipe_info(out, true);
@@ -1379,16 +1445,16 @@ static long __do_splice(struct file *in, loff_t __user *off_in,
return ret;
}
-static int iter_to_pipe(struct iov_iter *from,
- struct pipe_inode_info *pipe,
- unsigned flags)
+static ssize_t iter_to_pipe(struct iov_iter *from,
+ struct pipe_inode_info *pipe,
+ unsigned int flags)
{
struct pipe_buffer buf = {
.ops = &user_page_pipe_buf_ops,
.flags = flags
};
size_t total = 0;
- int ret = 0;
+ ssize_t ret = 0;
while (iov_iter_count(from)) {
struct page *pages[16];
@@ -1437,8 +1503,8 @@ static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
* For lack of a better implementation, implement vmsplice() to userspace
* as a simple copy of the pipes pages to the user iov.
*/
-static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
- unsigned int flags)
+static ssize_t vmsplice_to_user(struct file *file, struct iov_iter *iter,
+ unsigned int flags)
{
struct pipe_inode_info *pipe = get_pipe_info(file, true);
struct splice_desc sd = {
@@ -1446,7 +1512,7 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
.flags = flags,
.u.data = iter
};
- long ret = 0;
+ ssize_t ret = 0;
if (!pipe)
return -EBADF;
@@ -1470,11 +1536,11 @@ static long vmsplice_to_user(struct file *file, struct iov_iter *iter,
* as splice-from-memory, where the regular splice is splice-from-file (or
* to file). In both cases the output is a pipe, naturally.
*/
-static long vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
- unsigned int flags)
+static ssize_t vmsplice_to_pipe(struct file *file, struct iov_iter *iter,
+ unsigned int flags)
{
struct pipe_inode_info *pipe;
- long ret = 0;
+ ssize_t ret = 0;
unsigned buf_flag = 0;
if (flags & SPLICE_F_GIFT)
@@ -1570,7 +1636,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
size_t, len, unsigned int, flags)
{
struct fd in, out;
- long error;
+ ssize_t error;
if (unlikely(!len))
return 0;
@@ -1584,7 +1650,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
out = fdget(fd_out);
if (out.file) {
error = __do_splice(in.file, off_in, out.file, off_out,
- len, flags);
+ len, flags);
fdput(out);
}
fdput(in);
@@ -1807,15 +1873,15 @@ retry:
/*
* Link contents of ipipe to opipe.
*/
-static int link_pipe(struct pipe_inode_info *ipipe,
- struct pipe_inode_info *opipe,
- size_t len, unsigned int flags)
+static ssize_t link_pipe(struct pipe_inode_info *ipipe,
+ struct pipe_inode_info *opipe,
+ size_t len, unsigned int flags)
{
struct pipe_buffer *ibuf, *obuf;
unsigned int i_head, o_head;
unsigned int i_tail, o_tail;
unsigned int i_mask, o_mask;
- int ret = 0;
+ ssize_t ret = 0;
/*
* Potential ABBA deadlock, work around it by ordering lock
@@ -1898,11 +1964,12 @@ static int link_pipe(struct pipe_inode_info *ipipe,
* The 'flags' used are the SPLICE_F_* variants, currently the only
* applicable one is SPLICE_F_NONBLOCK.
*/
-long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
+ssize_t do_tee(struct file *in, struct file *out, size_t len,
+ unsigned int flags)
{
struct pipe_inode_info *ipipe = get_pipe_info(in, true);
struct pipe_inode_info *opipe = get_pipe_info(out, true);
- int ret = -EINVAL;
+ ssize_t ret = -EINVAL;
if (unlikely(!(in->f_mode & FMODE_READ) ||
!(out->f_mode & FMODE_WRITE)))
@@ -1939,7 +2006,7 @@ long do_tee(struct file *in, struct file *out, size_t len, unsigned int flags)
SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags)
{
struct fd in, out;
- int error;
+ ssize_t error;
if (unlikely(flags & ~SPLICE_F_ALL))
return -EINVAL;
diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c
index 8ba8c4c50770..e8df6430444b 100644
--- a/fs/squashfs/file.c
+++ b/fs/squashfs/file.c
@@ -544,7 +544,8 @@ static void squashfs_readahead(struct readahead_control *ractl)
struct squashfs_page_actor *actor;
unsigned int nr_pages = 0;
struct page **pages;
- int i, file_end = i_size_read(inode) >> msblk->block_log;
+ int i;
+ loff_t file_end = i_size_read(inode) >> msblk->block_log;
unsigned int max_pages = 1UL << shift;
readahead_expand(ractl, start, (len | mask) + 1);
diff --git a/fs/squashfs/file_direct.c b/fs/squashfs/file_direct.c
index f1ccad519e28..763a3f7a75f6 100644
--- a/fs/squashfs/file_direct.c
+++ b/fs/squashfs/file_direct.c
@@ -26,10 +26,10 @@ int squashfs_readpage_block(struct page *target_page, u64 block, int bsize,
struct inode *inode = target_page->mapping->host;
struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
- int file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+ loff_t file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT;
int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
- int start_index = target_page->index & ~mask;
- int end_index = start_index | mask;
+ loff_t start_index = target_page->index & ~mask;
+ loff_t end_index = start_index | mask;
int i, n, pages, bytes, res = -ENOMEM;
struct page **page;
struct squashfs_page_actor *actor;
diff --git a/fs/stat.c b/fs/stat.c
index f721d26ec3f7..77cdc69eb422 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -41,7 +41,7 @@
* the vfsmount must be passed through @idmap. This function will then
* take care to map the inode according to @idmap before filling in the
* uid and gid filds. On non-idmapped mounts or if permission checking is to be
- * performed on the raw inode simply passs @nop_mnt_idmap.
+ * performed on the raw inode simply pass @nop_mnt_idmap.
*/
void generic_fillattr(struct mnt_idmap *idmap, u32 request_mask,
struct inode *inode, struct kstat *stat)
@@ -247,8 +247,13 @@ retry:
error = vfs_getattr(&path, stat, request_mask, flags);
- stat->mnt_id = real_mount(path.mnt)->mnt_id;
- stat->result_mask |= STATX_MNT_ID;
+ if (request_mask & STATX_MNT_ID_UNIQUE) {
+ stat->mnt_id = real_mount(path.mnt)->mnt_id_unique;
+ stat->result_mask |= STATX_MNT_ID_UNIQUE;
+ } else {
+ stat->mnt_id = real_mount(path.mnt)->mnt_id;
+ stat->result_mask |= STATX_MNT_ID;
+ }
if (path.mnt->mnt_root == path.dentry)
stat->attributes |= STATX_ATTR_MOUNT_ROOT;
diff --git a/fs/super.c b/fs/super.c
index 076392396e72..d6efeba0d0ce 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -81,16 +81,13 @@ static inline void super_unlock_shared(struct super_block *sb)
super_unlock(sb, false);
}
-static inline bool wait_born(struct super_block *sb)
+static bool super_flags(const struct super_block *sb, unsigned int flags)
{
- unsigned int flags;
-
/*
* Pairs with smp_store_release() in super_wake() and ensures
- * that we see SB_BORN or SB_DYING after we're woken.
+ * that we see @flags after we're woken.
*/
- flags = smp_load_acquire(&sb->s_flags);
- return flags & (SB_BORN | SB_DYING);
+ return smp_load_acquire(&sb->s_flags) & flags;
}
/**
@@ -105,15 +102,21 @@ static inline bool wait_born(struct super_block *sb)
*
* The caller must have acquired a temporary reference on @sb->s_count.
*
- * Return: This returns true if SB_BORN was set, false if SB_DYING was
- * set. The function acquires s_umount and returns with it held.
+ * Return: The function returns true if SB_BORN was set and with
+ * s_umount held. The function returns false if SB_DYING was
+ * set and without s_umount held.
*/
static __must_check bool super_lock(struct super_block *sb, bool excl)
{
-
lockdep_assert_not_held(&sb->s_umount);
-relock:
+ /* wait until the superblock is ready or dying */
+ wait_var_event(&sb->s_flags, super_flags(sb, SB_BORN | SB_DYING));
+
+ /* Don't pointlessly acquire s_umount. */
+ if (super_flags(sb, SB_DYING))
+ return false;
+
__super_lock(sb, excl);
/*
@@ -121,32 +124,22 @@ relock:
* @sb->s_root is NULL and @sb->s_active is 0. No one needs to
* grab a reference to this. Tell them so.
*/
- if (sb->s_flags & SB_DYING)
+ if (sb->s_flags & SB_DYING) {
+ super_unlock(sb, excl);
return false;
+ }
- /* Has called ->get_tree() successfully. */
- if (sb->s_flags & SB_BORN)
- return true;
-
- super_unlock(sb, excl);
-
- /* wait until the superblock is ready or dying */
- wait_var_event(&sb->s_flags, wait_born(sb));
-
- /*
- * Neither SB_BORN nor SB_DYING are ever unset so we never loop.
- * Just reacquire @sb->s_umount for the caller.
- */
- goto relock;
+ WARN_ON_ONCE(!(sb->s_flags & SB_BORN));
+ return true;
}
-/* wait and acquire read-side of @sb->s_umount */
+/* wait and try to acquire read-side of @sb->s_umount */
static inline bool super_lock_shared(struct super_block *sb)
{
return super_lock(sb, false);
}
-/* wait and acquire write-side of @sb->s_umount */
+/* wait and try to acquire write-side of @sb->s_umount */
static inline bool super_lock_excl(struct super_block *sb)
{
return super_lock(sb, true);
@@ -281,9 +274,10 @@ static void destroy_super_work(struct work_struct *work)
{
struct super_block *s = container_of(work, struct super_block,
destroy_work);
- int i;
-
- for (i = 0; i < SB_FREEZE_LEVELS; i++)
+ security_sb_free(s);
+ put_user_ns(s->s_user_ns);
+ kfree(s->s_subtype);
+ for (int i = 0; i < SB_FREEZE_LEVELS; i++)
percpu_free_rwsem(&s->s_writers.rw_sem[i]);
kfree(s);
}
@@ -303,9 +297,6 @@ static void destroy_unused_super(struct super_block *s)
super_unlock_excl(s);
list_lru_destroy(&s->s_dentry_lru);
list_lru_destroy(&s->s_inode_lru);
- security_sb_free(s);
- put_user_ns(s->s_user_ns);
- kfree(s->s_subtype);
shrinker_free(s->s_shrink);
/* no delays needed */
destroy_super_work(&s->destroy_work);
@@ -323,7 +314,7 @@ static void destroy_unused_super(struct super_block *s)
static struct super_block *alloc_super(struct file_system_type *type, int flags,
struct user_namespace *user_ns)
{
- struct super_block *s = kzalloc(sizeof(struct super_block), GFP_USER);
+ struct super_block *s = kzalloc(sizeof(struct super_block), GFP_KERNEL);
static const struct super_operations default_op;
int i;
@@ -416,9 +407,6 @@ static void __put_super(struct super_block *s)
WARN_ON(s->s_dentry_lru.node);
WARN_ON(s->s_inode_lru.node);
WARN_ON(!list_empty(&s->s_mounts));
- security_sb_free(s);
- put_user_ns(s->s_user_ns);
- kfree(s->s_subtype);
call_rcu(&s->rcu, destroy_super_rcu);
}
}
@@ -521,48 +509,7 @@ void deactivate_super(struct super_block *s)
EXPORT_SYMBOL(deactivate_super);
/**
- * grab_super - acquire an active reference
- * @s: reference we are trying to make active
- *
- * Tries to acquire an active reference. grab_super() is used when we
- * had just found a superblock in super_blocks or fs_type->fs_supers
- * and want to turn it into a full-blown active reference. grab_super()
- * is called with sb_lock held and drops it. Returns 1 in case of
- * success, 0 if we had failed (superblock contents was already dead or
- * dying when grab_super() had been called). Note that this is only
- * called for superblocks not in rundown mode (== ones still on ->fs_supers
- * of their type), so increment of ->s_count is OK here.
- */
-static int grab_super(struct super_block *s) __releases(sb_lock)
-{
- bool born;
-
- s->s_count++;
- spin_unlock(&sb_lock);
- born = super_lock_excl(s);
- if (born && atomic_inc_not_zero(&s->s_active)) {
- put_super(s);
- return 1;
- }
- super_unlock_excl(s);
- put_super(s);
- return 0;
-}
-
-static inline bool wait_dead(struct super_block *sb)
-{
- unsigned int flags;
-
- /*
- * Pairs with memory barrier in super_wake() and ensures
- * that we see SB_DEAD after we're woken.
- */
- flags = smp_load_acquire(&sb->s_flags);
- return flags & SB_DEAD;
-}
-
-/**
- * grab_super_dead - acquire an active reference to a superblock
+ * grab_super - acquire an active reference to a superblock
* @sb: superblock to acquire
*
* Acquire a temporary reference on a superblock and try to trade it for
@@ -573,17 +520,21 @@ static inline bool wait_dead(struct super_block *sb)
* Return: This returns true if an active reference could be acquired,
* false if not.
*/
-static bool grab_super_dead(struct super_block *sb)
+static bool grab_super(struct super_block *sb)
{
+ bool locked;
sb->s_count++;
- if (grab_super(sb)) {
- put_super(sb);
- lockdep_assert_held(&sb->s_umount);
- return true;
+ spin_unlock(&sb_lock);
+ locked = super_lock_excl(sb);
+ if (locked) {
+ if (atomic_inc_not_zero(&sb->s_active)) {
+ put_super(sb);
+ return true;
+ }
+ super_unlock_excl(sb);
}
- wait_var_event(&sb->s_flags, wait_dead(sb));
- lockdep_assert_not_held(&sb->s_umount);
+ wait_var_event(&sb->s_flags, super_flags(sb, SB_DEAD));
put_super(sb);
return false;
}
@@ -681,12 +632,6 @@ void generic_shutdown_super(struct super_block *sb)
fsnotify_sb_delete(sb);
security_sb_delete(sb);
- /*
- * Now that all potentially-encrypted inodes have been evicted,
- * the fscrypt keyring can be destroyed.
- */
- fscrypt_destroy_keyring(sb);
-
if (sb->s_dio_done_wq) {
destroy_workqueue(sb->s_dio_done_wq);
sb->s_dio_done_wq = NULL;
@@ -695,6 +640,12 @@ void generic_shutdown_super(struct super_block *sb)
if (sop->put_super)
sop->put_super(sb);
+ /*
+ * Now that all potentially-encrypted inodes have been evicted,
+ * the fscrypt keyring can be destroyed.
+ */
+ fscrypt_destroy_keyring(sb);
+
if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes),
"VFS: Busy inodes after unmount of %s (%s)",
sb->s_id, sb->s_type->name)) {
@@ -834,7 +785,7 @@ share_extant_sb:
warnfc(fc, "reusing existing filesystem in another namespace not allowed");
return ERR_PTR(-EBUSY);
}
- if (!grab_super_dead(old))
+ if (!grab_super(old))
goto retry;
destroy_unused_super(s);
return old;
@@ -878,7 +829,7 @@ retry:
destroy_unused_super(s);
return ERR_PTR(-EBUSY);
}
- if (!grab_super_dead(old))
+ if (!grab_super(old))
goto retry;
destroy_unused_super(s);
return old;
@@ -930,8 +881,7 @@ static void __iterate_supers(void (*f)(struct super_block *))
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- /* Pairs with memory marrier in super_wake(). */
- if (smp_load_acquire(&sb->s_flags) & SB_DYING)
+ if (super_flags(sb, SB_DYING))
continue;
sb->s_count++;
spin_unlock(&sb_lock);
@@ -961,15 +911,17 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
- bool born;
+ bool locked;
sb->s_count++;
spin_unlock(&sb_lock);
- born = super_lock_shared(sb);
- if (born && sb->s_root)
- f(sb, arg);
- super_unlock_shared(sb);
+ locked = super_lock_shared(sb);
+ if (locked) {
+ if (sb->s_root)
+ f(sb, arg);
+ super_unlock_shared(sb);
+ }
spin_lock(&sb_lock);
if (p)
@@ -997,15 +949,17 @@ void iterate_supers_type(struct file_system_type *type,
spin_lock(&sb_lock);
hlist_for_each_entry(sb, &type->fs_supers, s_instances) {
- bool born;
+ bool locked;
sb->s_count++;
spin_unlock(&sb_lock);
- born = super_lock_shared(sb);
- if (born && sb->s_root)
- f(sb, arg);
- super_unlock_shared(sb);
+ locked = super_lock_shared(sb);
+ if (locked) {
+ if (sb->s_root)
+ f(sb, arg);
+ super_unlock_shared(sb);
+ }
spin_lock(&sb_lock);
if (p)
@@ -1019,34 +973,6 @@ void iterate_supers_type(struct file_system_type *type,
EXPORT_SYMBOL(iterate_supers_type);
-/**
- * get_active_super - get an active reference to the superblock of a device
- * @bdev: device to get the superblock for
- *
- * Scans the superblock list and finds the superblock of the file system
- * mounted on the device given. Returns the superblock with an active
- * reference or %NULL if none was found.
- */
-struct super_block *get_active_super(struct block_device *bdev)
-{
- struct super_block *sb;
-
- if (!bdev)
- return NULL;
-
- spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
- if (sb->s_bdev == bdev) {
- if (!grab_super(sb))
- return NULL;
- super_unlock_excl(sb);
- return sb;
- }
- }
- spin_unlock(&sb_lock);
- return NULL;
-}
-
struct super_block *user_get_super(dev_t dev, bool excl)
{
struct super_block *sb;
@@ -1054,15 +980,17 @@ struct super_block *user_get_super(dev_t dev, bool excl)
spin_lock(&sb_lock);
list_for_each_entry(sb, &super_blocks, s_list) {
if (sb->s_dev == dev) {
- bool born;
+ bool locked;
sb->s_count++;
spin_unlock(&sb_lock);
/* still alive? */
- born = super_lock(sb, excl);
- if (born && sb->s_root)
- return sb;
- super_unlock(sb, excl);
+ locked = super_lock(sb, excl);
+ if (locked) {
+ if (sb->s_root)
+ return sb;
+ super_unlock(sb, excl);
+ }
/* nope, got unmounted */
spin_lock(&sb_lock);
__put_super(sb);
@@ -1173,9 +1101,9 @@ cancel_readonly:
static void do_emergency_remount_callback(struct super_block *sb)
{
- bool born = super_lock_excl(sb);
+ bool locked = super_lock_excl(sb);
- if (born && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) {
+ if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) {
struct fs_context *fc;
fc = fs_context_for_reconfigure(sb->s_root,
@@ -1186,7 +1114,8 @@ static void do_emergency_remount_callback(struct super_block *sb)
put_fs_context(fc);
}
}
- super_unlock_excl(sb);
+ if (locked)
+ super_unlock_excl(sb);
}
static void do_emergency_remount(struct work_struct *work)
@@ -1209,16 +1138,17 @@ void emergency_remount(void)
static void do_thaw_all_callback(struct super_block *sb)
{
- bool born = super_lock_excl(sb);
+ bool locked = super_lock_excl(sb);
- if (born && sb->s_root) {
+ if (locked && sb->s_root) {
if (IS_ENABLED(CONFIG_BLOCK))
- while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
+ while (sb->s_bdev && !bdev_thaw(sb->s_bdev))
pr_warn("Emergency Thaw on %pg\n", sb->s_bdev);
thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE);
- } else {
- super_unlock_excl(sb);
+ return;
}
+ if (locked)
+ super_unlock_excl(sb);
}
static void do_thaw_all(struct work_struct *work)
@@ -1428,11 +1358,11 @@ EXPORT_SYMBOL(sget_dev);
*
* The function must be called with bdev->bd_holder_lock and releases it.
*/
-static struct super_block *bdev_super_lock_shared(struct block_device *bdev)
+static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
__releases(&bdev->bd_holder_lock)
{
struct super_block *sb = bdev->bd_holder;
- bool born;
+ bool locked;
lockdep_assert_held(&bdev->bd_holder_lock);
lockdep_assert_not_held(&sb->s_umount);
@@ -1442,19 +1372,25 @@ static struct super_block *bdev_super_lock_shared(struct block_device *bdev)
spin_lock(&sb_lock);
sb->s_count++;
spin_unlock(&sb_lock);
+
mutex_unlock(&bdev->bd_holder_lock);
- born = super_lock_shared(sb);
- if (!born || !sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
- super_unlock_shared(sb);
- put_super(sb);
- return NULL;
- }
+ locked = super_lock(sb, excl);
+
/*
- * The superblock is active and we hold s_umount, we can drop our
- * temporary reference now.
- */
+ * If the superblock wasn't already SB_DYING then we hold
+ * s_umount and can safely drop our temporary reference.
+ */
put_super(sb);
+
+ if (!locked)
+ return NULL;
+
+ if (!sb->s_root || !(sb->s_flags & SB_ACTIVE)) {
+ super_unlock(sb, excl);
+ return NULL;
+ }
+
return sb;
}
@@ -1462,7 +1398,7 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
{
struct super_block *sb;
- sb = bdev_super_lock_shared(bdev);
+ sb = bdev_super_lock(bdev, false);
if (!sb)
return;
@@ -1480,16 +1416,110 @@ static void fs_bdev_sync(struct block_device *bdev)
{
struct super_block *sb;
- sb = bdev_super_lock_shared(bdev);
+ sb = bdev_super_lock(bdev, false);
if (!sb)
return;
+
sync_filesystem(sb);
super_unlock_shared(sb);
}
+static struct super_block *get_bdev_super(struct block_device *bdev)
+{
+ bool active = false;
+ struct super_block *sb;
+
+ sb = bdev_super_lock(bdev, true);
+ if (sb) {
+ active = atomic_inc_not_zero(&sb->s_active);
+ super_unlock_excl(sb);
+ }
+ if (!active)
+ return NULL;
+ return sb;
+}
+
+/**
+ * fs_bdev_freeze - freeze owning filesystem of block device
+ * @bdev: block device
+ *
+ * Freeze the filesystem that owns this block device if it is still
+ * active.
+ *
+ * A filesystem that owns multiple block devices may be frozen from each
+ * block device and won't be unfrozen until all block devices are
+ * unfrozen. Each block device can only freeze the filesystem once as we
+ * nest freezes for block devices in the block layer.
+ *
+ * Return: If the freeze was successful zero is returned. If the freeze
+ * failed a negative error code is returned.
+ */
+static int fs_bdev_freeze(struct block_device *bdev)
+{
+ struct super_block *sb;
+ int error = 0;
+
+ lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
+
+ sb = get_bdev_super(bdev);
+ if (!sb)
+ return -EINVAL;
+
+ if (sb->s_op->freeze_super)
+ error = sb->s_op->freeze_super(sb,
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ else
+ error = freeze_super(sb,
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ if (!error)
+ error = sync_blockdev(bdev);
+ deactivate_super(sb);
+ return error;
+}
+
+/**
+ * fs_bdev_thaw - thaw owning filesystem of block device
+ * @bdev: block device
+ *
+ * Thaw the filesystem that owns this block device.
+ *
+ * A filesystem that owns multiple block devices may be frozen from each
+ * block device and won't be unfrozen until all block devices are
+ * unfrozen. Each block device can only freeze the filesystem once as we
+ * nest freezes for block devices in the block layer.
+ *
+ * Return: If the thaw was successful zero is returned. If the thaw
+ * failed a negative error code is returned. If this function
+ * returns zero it doesn't mean that the filesystem is unfrozen
+ * as it may have been frozen multiple times (kernel may hold a
+ * freeze or might be frozen from other block devices).
+ */
+static int fs_bdev_thaw(struct block_device *bdev)
+{
+ struct super_block *sb;
+ int error;
+
+ lockdep_assert_held(&bdev->bd_fsfreeze_mutex);
+
+ sb = get_bdev_super(bdev);
+ if (WARN_ON_ONCE(!sb))
+ return -EINVAL;
+
+ if (sb->s_op->thaw_super)
+ error = sb->s_op->thaw_super(sb,
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ else
+ error = thaw_super(sb,
+ FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE);
+ deactivate_super(sb);
+ return error;
+}
+
const struct blk_holder_ops fs_holder_ops = {
.mark_dead = fs_bdev_mark_dead,
.sync = fs_bdev_sync,
+ .freeze = fs_bdev_freeze,
+ .thaw = fs_bdev_thaw,
};
EXPORT_SYMBOL_GPL(fs_holder_ops);
@@ -1519,15 +1549,10 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
}
/*
- * Until SB_BORN flag is set, there can be no active superblock
- * references and thus no filesystem freezing. get_active_super() will
- * just loop waiting for SB_BORN so even freeze_bdev() cannot proceed.
- *
- * It is enough to check bdev was not frozen before we set s_bdev.
+ * It is enough to check bdev was not frozen before we set
+ * s_bdev as freezing will wait until SB_BORN is set.
*/
- mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (bdev->bd_fsfreeze_count > 0) {
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ if (atomic_read(&bdev->bd_fsfreeze_count) > 0) {
if (fc)
warnf(fc, "%pg: Can't mount, blockdev is frozen", bdev);
bdev_release(bdev_handle);
@@ -1540,7 +1565,6 @@ int setup_bdev_super(struct super_block *sb, int sb_flags,
if (bdev_stable_writes(bdev))
sb->s_iflags |= SB_I_STABLE_WRITES;
spin_unlock(&sb_lock);
- mutex_unlock(&bdev->bd_fsfreeze_mutex);
snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev);
shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name,
@@ -1585,15 +1609,7 @@ int get_tree_bdev(struct fs_context *fc,
return -EBUSY;
}
} else {
- /*
- * We drop s_umount here because we need to open the bdev and
- * bdev->open_mutex ranks above s_umount (blkdev_put() ->
- * bdev_mark_dead()). It is safe because we have active sb
- * reference and SB_BORN is not set yet.
- */
- super_unlock_excl(s);
error = setup_bdev_super(s, fc->sb_flags, fc);
- __super_lock_excl(s);
if (!error)
error = fill_super(s, fc);
if (error) {
@@ -1637,15 +1653,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
return ERR_PTR(-EBUSY);
}
} else {
- /*
- * We drop s_umount here because we need to open the bdev and
- * bdev->open_mutex ranks above s_umount (blkdev_put() ->
- * bdev_mark_dead()). It is safe because we have active sb
- * reference and SB_BORN is not set yet.
- */
- super_unlock_excl(s);
error = setup_bdev_super(s, flags, NULL);
- __super_lock_excl(s);
if (!error)
error = fill_super(s, data, flags & SB_SILENT ? 1 : 0);
if (error) {
@@ -1914,6 +1922,47 @@ static int wait_for_partially_frozen(struct super_block *sb)
return ret;
}
+#define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE)
+#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST)
+
+static inline int freeze_inc(struct super_block *sb, enum freeze_holder who)
+{
+ WARN_ON_ONCE((who & ~FREEZE_FLAGS));
+ WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
+
+ if (who & FREEZE_HOLDER_KERNEL)
+ ++sb->s_writers.freeze_kcount;
+ if (who & FREEZE_HOLDER_USERSPACE)
+ ++sb->s_writers.freeze_ucount;
+ return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
+}
+
+static inline int freeze_dec(struct super_block *sb, enum freeze_holder who)
+{
+ WARN_ON_ONCE((who & ~FREEZE_FLAGS));
+ WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
+
+ if ((who & FREEZE_HOLDER_KERNEL) && sb->s_writers.freeze_kcount)
+ --sb->s_writers.freeze_kcount;
+ if ((who & FREEZE_HOLDER_USERSPACE) && sb->s_writers.freeze_ucount)
+ --sb->s_writers.freeze_ucount;
+ return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount;
+}
+
+static inline bool may_freeze(struct super_block *sb, enum freeze_holder who)
+{
+ WARN_ON_ONCE((who & ~FREEZE_FLAGS));
+ WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1);
+
+ if (who & FREEZE_HOLDER_KERNEL)
+ return (who & FREEZE_MAY_NEST) ||
+ sb->s_writers.freeze_kcount == 0;
+ if (who & FREEZE_HOLDER_USERSPACE)
+ return (who & FREEZE_MAY_NEST) ||
+ sb->s_writers.freeze_ucount == 0;
+ return false;
+}
+
/**
* freeze_super - lock the filesystem and force it into a consistent state
* @sb: the super to lock
@@ -1926,6 +1975,7 @@ static int wait_for_partially_frozen(struct super_block *sb)
* @who should be:
* * %FREEZE_HOLDER_USERSPACE if userspace wants to freeze the fs;
* * %FREEZE_HOLDER_KERNEL if the kernel wants to freeze the fs.
+ * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed.
*
* The @who argument distinguishes between the kernel and userspace trying to
* freeze the filesystem. Although there cannot be multiple kernel freezes or
@@ -1933,6 +1983,13 @@ static int wait_for_partially_frozen(struct super_block *sb)
* userspace can both hold a filesystem frozen. The filesystem remains frozen
* until there are no kernel or userspace freezes in effect.
*
+ * A filesystem may hold multiple devices and thus a filesystems may be
+ * frozen through the block layer via multiple block devices. In this
+ * case the request is marked as being allowed to nest by passing
+ * FREEZE_MAY_NEST. The filesystem remains frozen until all block
+ * devices are unfrozen. If multiple freezes are attempted without
+ * FREEZE_MAY_NEST -EBUSY will be returned.
+ *
* During this function, sb->s_writers.frozen goes through these values:
*
* SB_UNFROZEN: File system is normal, all writes progress as usual.
@@ -1957,31 +2014,29 @@ static int wait_for_partially_frozen(struct super_block *sb)
* mostly auxiliary for filesystems to verify they do not modify frozen fs.
*
* sb->s_writers.frozen is protected by sb->s_umount.
+ *
+ * Return: If the freeze was successful zero is returned. If the freeze
+ * failed a negative error code is returned.
*/
int freeze_super(struct super_block *sb, enum freeze_holder who)
{
int ret;
+ if (!super_lock_excl(sb)) {
+ WARN_ON_ONCE("Dying superblock while freezing!");
+ return -EINVAL;
+ }
atomic_inc(&sb->s_active);
- if (!super_lock_excl(sb))
- WARN(1, "Dying superblock while freezing!");
retry:
if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
- if (sb->s_writers.freeze_holders & who) {
- deactivate_locked_super(sb);
- return -EBUSY;
- }
-
- WARN_ON(sb->s_writers.freeze_holders == 0);
-
- /*
- * Someone else already holds this type of freeze; share the
- * freeze and assign the active ref to the freeze.
- */
- sb->s_writers.freeze_holders |= who;
- super_unlock_excl(sb);
- return 0;
+ if (may_freeze(sb, who))
+ ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1);
+ else
+ ret = -EBUSY;
+ /* All freezers share a single active reference. */
+ deactivate_locked_super(sb);
+ return ret;
}
if (sb->s_writers.frozen != SB_UNFROZEN) {
@@ -1994,14 +2049,9 @@ retry:
goto retry;
}
- if (!(sb->s_flags & SB_BORN)) {
- super_unlock_excl(sb);
- return 0; /* sic - it's "nothing to do" */
- }
-
if (sb_rdonly(sb)) {
/* Nothing to do really... */
- sb->s_writers.freeze_holders |= who;
+ WARN_ON_ONCE(freeze_inc(sb, who) > 1);
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
wake_up_var(&sb->s_writers.frozen);
super_unlock_excl(sb);
@@ -2012,8 +2062,7 @@ retry:
/* Release s_umount to preserve sb_start_write -> s_umount ordering */
super_unlock_excl(sb);
sb_wait_write(sb, SB_FREEZE_WRITE);
- if (!super_lock_excl(sb))
- WARN(1, "Dying superblock while freezing!");
+ __super_lock_excl(sb);
/* Now we go and block page faults... */
sb->s_writers.frozen = SB_FREEZE_PAGEFAULT;
@@ -2049,7 +2098,7 @@ retry:
* For debugging purposes so that fs can warn if it sees write activity
* when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super().
*/
- sb->s_writers.freeze_holders |= who;
+ WARN_ON_ONCE(freeze_inc(sb, who) > 1);
sb->s_writers.frozen = SB_FREEZE_COMPLETE;
wake_up_var(&sb->s_writers.frozen);
lockdep_sb_freeze_release(sb);
@@ -2066,34 +2115,22 @@ EXPORT_SYMBOL(freeze_super);
*/
static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
{
- int error;
+ int error = -EINVAL;
- if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) {
- if (!(sb->s_writers.freeze_holders & who)) {
- super_unlock_excl(sb);
- return -EINVAL;
- }
+ if (sb->s_writers.frozen != SB_FREEZE_COMPLETE)
+ goto out_unlock;
- /*
- * Freeze is shared with someone else. Release our hold and
- * drop the active ref that freeze_super assigned to the
- * freezer.
- */
- if (sb->s_writers.freeze_holders & ~who) {
- sb->s_writers.freeze_holders &= ~who;
- deactivate_locked_super(sb);
- return 0;
- }
- } else {
- super_unlock_excl(sb);
- return -EINVAL;
- }
+ /*
+ * All freezers share a single active reference.
+ * So just unlock in case there are any left.
+ */
+ if (freeze_dec(sb, who))
+ goto out_unlock;
if (sb_rdonly(sb)) {
- sb->s_writers.freeze_holders &= ~who;
sb->s_writers.frozen = SB_UNFROZEN;
wake_up_var(&sb->s_writers.frozen);
- goto out;
+ goto out_deactivate;
}
lockdep_sb_freeze_acquire(sb);
@@ -2101,20 +2138,23 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who)
if (sb->s_op->unfreeze_fs) {
error = sb->s_op->unfreeze_fs(sb);
if (error) {
- printk(KERN_ERR "VFS:Filesystem thaw failed\n");
+ pr_err("VFS: Filesystem thaw failed\n");
+ freeze_inc(sb, who);
lockdep_sb_freeze_release(sb);
- super_unlock_excl(sb);
- return error;
+ goto out_unlock;
}
}
- sb->s_writers.freeze_holders &= ~who;
sb->s_writers.frozen = SB_UNFROZEN;
wake_up_var(&sb->s_writers.frozen);
sb_freeze_unlock(sb, SB_FREEZE_FS);
-out:
+out_deactivate:
deactivate_locked_super(sb);
return 0;
+
+out_unlock:
+ super_unlock_excl(sb);
+ return error;
}
/**
@@ -2128,11 +2168,18 @@ out:
* @who should be:
* * %FREEZE_HOLDER_USERSPACE if userspace wants to thaw the fs;
* * %FREEZE_HOLDER_KERNEL if the kernel wants to thaw the fs.
+ * * %FREEZE_MAY_NEST whether nesting freeze and thaw requests is allowed
+ *
+ * A filesystem may hold multiple devices and thus a filesystems may
+ * have been frozen through the block layer via multiple block devices.
+ * The filesystem remains frozen until all block devices are unfrozen.
*/
int thaw_super(struct super_block *sb, enum freeze_holder who)
{
- if (!super_lock_excl(sb))
- WARN(1, "Dying superblock while thawing!");
+ if (!super_lock_excl(sb)) {
+ WARN_ON_ONCE("Dying superblock while thawing!");
+ return -EINVAL;
+ }
return thaw_super_locked(sb, who);
}
EXPORT_SYMBOL(thaw_super);
diff --git a/fs/sysctls.c b/fs/sysctls.c
index 76a0aee8c229..8dbde9a802fa 100644
--- a/fs/sysctls.c
+++ b/fs/sysctls.c
@@ -26,7 +26,6 @@ static struct ctl_table fs_shared_sysctls[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_MAXOLDUID,
},
- { }
};
static int __init init_fs_sysctls(void)
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index b6b6796e1616..4df2afa551dc 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -81,7 +81,7 @@ void sysfs_remove_dir(struct kobject *kobj)
struct kernfs_node *kn = kobj->sd;
/*
- * In general, kboject owner is responsible for ensuring removal
+ * In general, kobject owner is responsible for ensuring removal
* doesn't race with other operations and sysfs doesn't provide any
* protection; however, when @kobj is used as a symlink target, the
* symlinking entity usually doesn't own @kobj and thus has no
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index 725981474e5f..410ab2a44d2f 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -8,6 +8,7 @@
#include <linux/buffer_head.h>
#include <linux/mount.h>
+#include <linux/mpage.h>
#include <linux/string.h>
#include "sysv.h"
@@ -456,9 +457,10 @@ int sysv_getattr(struct mnt_idmap *idmap, const struct path *path,
return 0;
}
-static int sysv_writepage(struct page *page, struct writeback_control *wbc)
+static int sysv_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page,get_block,wbc);
+ return mpage_writepages(mapping, wbc, get_block);
}
static int sysv_read_folio(struct file *file, struct folio *folio)
@@ -503,8 +505,9 @@ const struct address_space_operations sysv_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = sysv_read_folio,
- .writepage = sysv_writepage,
+ .writepages = sysv_writepages,
.write_begin = sysv_write_begin,
.write_end = generic_write_end,
+ .migrate_folio = buffer_migrate_folio,
.bmap = sysv_bmap
};
diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c
index f0677ea0ec24..110e8a272189 100644
--- a/fs/tracefs/event_inode.c
+++ b/fs/tracefs/event_inode.c
@@ -32,6 +32,18 @@
*/
static DEFINE_MUTEX(eventfs_mutex);
+/* Choose something "unique" ;-) */
+#define EVENTFS_FILE_INODE_INO 0x12c4e37
+
+/* Just try to make something consistent and unique */
+static int eventfs_dir_ino(struct eventfs_inode *ei)
+{
+ if (!ei->ino)
+ ei->ino = get_next_ino();
+
+ return ei->ino;
+}
+
/*
* The eventfs_inode (ei) itself is protected by SRCU. It is released from
* its parent's list and will have is_freed set (under eventfs_mutex).
@@ -45,16 +57,55 @@ enum {
EVENTFS_SAVE_MODE = BIT(16),
EVENTFS_SAVE_UID = BIT(17),
EVENTFS_SAVE_GID = BIT(18),
+ EVENTFS_TOPLEVEL = BIT(19),
};
#define EVENTFS_MODE_MASK (EVENTFS_SAVE_MODE - 1)
+/*
+ * eventfs_inode reference count management.
+ *
+ * NOTE! We count only references from dentries, in the
+ * form 'dentry->d_fsdata'. There are also references from
+ * directory inodes ('ti->private'), but the dentry reference
+ * count is always a superset of the inode reference count.
+ */
+static void release_ei(struct kref *ref)
+{
+ struct eventfs_inode *ei = container_of(ref, struct eventfs_inode, kref);
+
+ WARN_ON_ONCE(!ei->is_freed);
+
+ kfree(ei->entry_attrs);
+ kfree_const(ei->name);
+ kfree_rcu(ei, rcu);
+}
+
+static inline void put_ei(struct eventfs_inode *ei)
+{
+ if (ei)
+ kref_put(&ei->kref, release_ei);
+}
+
+static inline void free_ei(struct eventfs_inode *ei)
+{
+ if (ei) {
+ ei->is_freed = 1;
+ put_ei(ei);
+ }
+}
+
+static inline struct eventfs_inode *get_ei(struct eventfs_inode *ei)
+{
+ if (ei)
+ kref_get(&ei->kref);
+ return ei;
+}
+
static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags);
-static int dcache_dir_open_wrapper(struct inode *inode, struct file *file);
-static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx);
-static int eventfs_release(struct inode *inode, struct file *file);
+static int eventfs_iterate(struct file *file, struct dir_context *ctx);
static void update_attr(struct eventfs_attr *attr, struct iattr *iattr)
{
@@ -94,7 +145,7 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
/* Preallocate the children mode array if necessary */
if (!(dentry->d_inode->i_mode & S_IFDIR)) {
if (!ei->entry_attrs) {
- ei->entry_attrs = kzalloc(sizeof(*ei->entry_attrs) * ei->nr_entries,
+ ei->entry_attrs = kcalloc(ei->nr_entries, sizeof(*ei->entry_attrs),
GFP_NOFS);
if (!ei->entry_attrs) {
ret = -ENOMEM;
@@ -117,10 +168,17 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
* The events directory dentry is never freed, unless its
* part of an instance that is deleted. It's attr is the
* default for its child files and directories.
- * Do not update it. It's not used for its own mode or ownership
+ * Do not update it. It's not used for its own mode or ownership.
*/
- if (!ei->is_events)
+ if (ei->is_events) {
+ /* But it still needs to know if it was modified */
+ if (iattr->ia_valid & ATTR_UID)
+ ei->attr.mode |= EVENTFS_SAVE_UID;
+ if (iattr->ia_valid & ATTR_GID)
+ ei->attr.mode |= EVENTFS_SAVE_GID;
+ } else {
update_attr(&ei->attr, iattr);
+ }
} else {
name = dentry->d_name.name;
@@ -138,9 +196,63 @@ static int eventfs_set_attr(struct mnt_idmap *idmap, struct dentry *dentry,
return ret;
}
+static void update_top_events_attr(struct eventfs_inode *ei, struct super_block *sb)
+{
+ struct inode *root;
+
+ /* Only update if the "events" was on the top level */
+ if (!ei || !(ei->attr.mode & EVENTFS_TOPLEVEL))
+ return;
+
+ /* Get the tracefs root inode. */
+ root = d_inode(sb->s_root);
+ ei->attr.uid = root->i_uid;
+ ei->attr.gid = root->i_gid;
+}
+
+static void set_top_events_ownership(struct inode *inode)
+{
+ struct tracefs_inode *ti = get_tracefs(inode);
+ struct eventfs_inode *ei = ti->private;
+
+ /* The top events directory doesn't get automatically updated */
+ if (!ei || !ei->is_events || !(ei->attr.mode & EVENTFS_TOPLEVEL))
+ return;
+
+ update_top_events_attr(ei, inode->i_sb);
+
+ if (!(ei->attr.mode & EVENTFS_SAVE_UID))
+ inode->i_uid = ei->attr.uid;
+
+ if (!(ei->attr.mode & EVENTFS_SAVE_GID))
+ inode->i_gid = ei->attr.gid;
+}
+
+static int eventfs_get_attr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ struct dentry *dentry = path->dentry;
+ struct inode *inode = d_backing_inode(dentry);
+
+ set_top_events_ownership(inode);
+
+ generic_fillattr(idmap, request_mask, inode, stat);
+ return 0;
+}
+
+static int eventfs_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
+{
+ set_top_events_ownership(inode);
+ return generic_permission(idmap, inode, mask);
+}
+
static const struct inode_operations eventfs_root_dir_inode_operations = {
.lookup = eventfs_root_lookup,
.setattr = eventfs_set_attr,
+ .getattr = eventfs_get_attr,
+ .permission = eventfs_permission,
};
static const struct inode_operations eventfs_file_inode_operations = {
@@ -148,11 +260,9 @@ static const struct inode_operations eventfs_file_inode_operations = {
};
static const struct file_operations eventfs_file_operations = {
- .open = dcache_dir_open_wrapper,
.read = generic_read_dir,
- .iterate_shared = dcache_readdir_wrapper,
+ .iterate_shared = eventfs_iterate,
.llseek = generic_file_llseek,
- .release = eventfs_release,
};
/* Return the evenfs_inode of the "events" directory */
@@ -160,10 +270,11 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
{
struct eventfs_inode *ei;
- mutex_lock(&eventfs_mutex);
do {
- /* The parent always has an ei, except for events itself */
- ei = dentry->d_parent->d_fsdata;
+ // The parent is stable because we do not do renames
+ dentry = dentry->d_parent;
+ // ... and directories always have d_fsdata
+ ei = dentry->d_fsdata;
/*
* If the ei is being freed, the ownership of the children
@@ -173,10 +284,10 @@ static struct eventfs_inode *eventfs_find_events(struct dentry *dentry)
ei = NULL;
break;
}
-
- dentry = ei->dentry;
+ // Walk upwards until you find the events inode
} while (!ei->is_events);
- mutex_unlock(&eventfs_mutex);
+
+ update_top_events_attr(ei, dentry->d_sb);
return ei;
}
@@ -206,50 +317,11 @@ static void update_inode_attr(struct dentry *dentry, struct inode *inode,
inode->i_gid = attr->gid;
}
-static void update_gid(struct eventfs_inode *ei, kgid_t gid, int level)
-{
- struct eventfs_inode *ei_child;
-
- /* at most we have events/system/event */
- if (WARN_ON_ONCE(level > 3))
- return;
-
- ei->attr.gid = gid;
-
- if (ei->entry_attrs) {
- for (int i = 0; i < ei->nr_entries; i++) {
- ei->entry_attrs[i].gid = gid;
- }
- }
-
- /*
- * Only eventfs_inode with dentries are updated, make sure
- * all eventfs_inodes are updated. If one of the children
- * do not have a dentry, this function must traverse it.
- */
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- srcu_read_lock_held(&eventfs_srcu)) {
- if (!ei_child->dentry)
- update_gid(ei_child, gid, level + 1);
- }
-}
-
-void eventfs_update_gid(struct dentry *dentry, kgid_t gid)
-{
- struct eventfs_inode *ei = dentry->d_fsdata;
- int idx;
-
- idx = srcu_read_lock(&eventfs_srcu);
- update_gid(ei, gid, 0);
- srcu_read_unlock(&eventfs_srcu, idx);
-}
-
/**
- * create_file - create a file in the tracefs filesystem
- * @name: the name of the file to create.
+ * lookup_file - look up a file in the tracefs filesystem
+ * @dentry: the dentry to look up
* @mode: the permission that the file should have.
* @attr: saved attributes changed by user
- * @parent: parent dentry for this file.
* @data: something that the caller will want to get to later on.
* @fop: struct file_operations that should be used for this file.
*
@@ -257,30 +329,25 @@ void eventfs_update_gid(struct dentry *dentry, kgid_t gid)
* directory. The inode.i_private pointer will point to @data in the open()
* call.
*/
-static struct dentry *create_file(const char *name, umode_t mode,
+static struct dentry *lookup_file(struct eventfs_inode *parent_ei,
+ struct dentry *dentry,
+ umode_t mode,
struct eventfs_attr *attr,
- struct dentry *parent, void *data,
+ void *data,
const struct file_operations *fop)
{
struct tracefs_inode *ti;
- struct dentry *dentry;
struct inode *inode;
if (!(mode & S_IFMT))
mode |= S_IFREG;
if (WARN_ON_ONCE(!S_ISREG(mode)))
- return NULL;
-
- WARN_ON_ONCE(!parent);
- dentry = eventfs_start_creating(name, parent);
-
- if (IS_ERR(dentry))
- return dentry;
+ return ERR_PTR(-EIO);
inode = tracefs_get_inode(dentry->d_sb);
if (unlikely(!inode))
- return eventfs_failed_creating(dentry);
+ return ERR_PTR(-ENOMEM);
/* If the user updated the directory's attributes, use them */
update_inode_attr(dentry, inode, attr, mode);
@@ -289,34 +356,36 @@ static struct dentry *create_file(const char *name, umode_t mode,
inode->i_fop = fop;
inode->i_private = data;
+ /* All files will have the same inode number */
+ inode->i_ino = EVENTFS_FILE_INODE_INO;
+
ti = get_tracefs(inode);
ti->flags |= TRACEFS_EVENT_INODE;
- d_instantiate(dentry, inode);
- fsnotify_create(dentry->d_parent->d_inode, dentry);
- return eventfs_end_creating(dentry);
+
+ // Files have their parent's ei as their fsdata
+ dentry->d_fsdata = get_ei(parent_ei);
+
+ d_add(dentry, inode);
+ return NULL;
};
/**
- * create_dir - create a dir in the tracefs filesystem
+ * lookup_dir_entry - look up a dir in the tracefs filesystem
+ * @dentry: the directory to look up
* @ei: the eventfs_inode that represents the directory to create
- * @parent: parent dentry for this file.
*
- * This function will create a dentry for a directory represented by
+ * This function will look up a dentry for a directory represented by
* a eventfs_inode.
*/
-static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent)
+static struct dentry *lookup_dir_entry(struct dentry *dentry,
+ struct eventfs_inode *pei, struct eventfs_inode *ei)
{
struct tracefs_inode *ti;
- struct dentry *dentry;
struct inode *inode;
- dentry = eventfs_start_creating(ei->name, parent);
- if (IS_ERR(dentry))
- return dentry;
-
inode = tracefs_get_inode(dentry->d_sb);
if (unlikely(!inode))
- return eventfs_failed_creating(dentry);
+ return ERR_PTR(-ENOMEM);
/* If the user updated the directory's attributes, use them */
update_inode_attr(dentry, inode, &ei->attr,
@@ -325,247 +394,72 @@ static struct dentry *create_dir(struct eventfs_inode *ei, struct dentry *parent
inode->i_op = &eventfs_root_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
+ /* All directories will have the same inode number */
+ inode->i_ino = eventfs_dir_ino(ei);
+
ti = get_tracefs(inode);
ti->flags |= TRACEFS_EVENT_INODE;
+ /* Only directories have ti->private set to an ei, not files */
+ ti->private = ei;
- inc_nlink(inode);
- d_instantiate(dentry, inode);
- inc_nlink(dentry->d_parent->d_inode);
- fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
- return eventfs_end_creating(dentry);
+ dentry->d_fsdata = get_ei(ei);
+
+ d_add(dentry, inode);
+ return NULL;
}
-static void free_ei(struct eventfs_inode *ei)
+static inline struct eventfs_inode *alloc_ei(const char *name)
{
- kfree_const(ei->name);
- kfree(ei->d_children);
- kfree(ei->entry_attrs);
- kfree(ei);
+ struct eventfs_inode *ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+
+ if (!ei)
+ return NULL;
+
+ ei->name = kstrdup_const(name, GFP_KERNEL);
+ if (!ei->name) {
+ kfree(ei);
+ return NULL;
+ }
+ kref_init(&ei->kref);
+ return ei;
}
/**
- * eventfs_set_ei_status_free - remove the dentry reference from an eventfs_inode
- * @ti: the tracefs_inode of the dentry
+ * eventfs_d_release - dentry is going away
* @dentry: dentry which has the reference to remove.
*
* Remove the association between a dentry from an eventfs_inode.
*/
-void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry)
+void eventfs_d_release(struct dentry *dentry)
{
- struct eventfs_inode *ei;
- int i;
-
- mutex_lock(&eventfs_mutex);
-
- ei = dentry->d_fsdata;
- if (!ei)
- goto out;
-
- /* This could belong to one of the files of the ei */
- if (ei->dentry != dentry) {
- for (i = 0; i < ei->nr_entries; i++) {
- if (ei->d_children[i] == dentry)
- break;
- }
- if (WARN_ON_ONCE(i == ei->nr_entries))
- goto out;
- ei->d_children[i] = NULL;
- } else if (ei->is_freed) {
- free_ei(ei);
- } else {
- ei->dentry = NULL;
- }
-
- dentry->d_fsdata = NULL;
- out:
- mutex_unlock(&eventfs_mutex);
+ put_ei(dentry->d_fsdata);
}
/**
- * create_file_dentry - create a dentry for a file of an eventfs_inode
+ * lookup_file_dentry - create a dentry for a file of an eventfs_inode
* @ei: the eventfs_inode that the file will be created under
- * @idx: the index into the d_children[] of the @ei
+ * @idx: the index into the entry_attrs[] of the @ei
* @parent: The parent dentry of the created file.
* @name: The name of the file to create
* @mode: The mode of the file.
* @data: The data to use to set the inode of the file with on open()
* @fops: The fops of the file to be created.
- * @lookup: If called by the lookup routine, in which case, dput() the created dentry.
*
* Create a dentry for a file of an eventfs_inode @ei and place it into the
- * address located at @e_dentry. If the @e_dentry already has a dentry, then
- * just do a dget() on it and return. Otherwise create the dentry and attach it.
+ * address located at @e_dentry.
*/
static struct dentry *
-create_file_dentry(struct eventfs_inode *ei, int idx,
- struct dentry *parent, const char *name, umode_t mode, void *data,
- const struct file_operations *fops, bool lookup)
+lookup_file_dentry(struct dentry *dentry,
+ struct eventfs_inode *ei, int idx,
+ umode_t mode, void *data,
+ const struct file_operations *fops)
{
struct eventfs_attr *attr = NULL;
- struct dentry **e_dentry = &ei->d_children[idx];
- struct dentry *dentry;
-
- WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
- mutex_lock(&eventfs_mutex);
- if (ei->is_freed) {
- mutex_unlock(&eventfs_mutex);
- return NULL;
- }
- /* If the e_dentry already has a dentry, use it */
- if (*e_dentry) {
- /* lookup does not need to up the ref count */
- if (!lookup)
- dget(*e_dentry);
- mutex_unlock(&eventfs_mutex);
- return *e_dentry;
- }
-
- /* ei->entry_attrs are protected by SRCU */
if (ei->entry_attrs)
attr = &ei->entry_attrs[idx];
- mutex_unlock(&eventfs_mutex);
-
- dentry = create_file(name, mode, attr, parent, data, fops);
-
- mutex_lock(&eventfs_mutex);
-
- if (IS_ERR_OR_NULL(dentry)) {
- /*
- * When the mutex was released, something else could have
- * created the dentry for this e_dentry. In which case
- * use that one.
- *
- * If ei->is_freed is set, the e_dentry is currently on its
- * way to being freed, don't return it. If e_dentry is NULL
- * it means it was already freed.
- */
- if (ei->is_freed)
- dentry = NULL;
- else
- dentry = *e_dentry;
- /* The lookup does not need to up the dentry refcount */
- if (dentry && !lookup)
- dget(dentry);
- mutex_unlock(&eventfs_mutex);
- return dentry;
- }
-
- if (!*e_dentry && !ei->is_freed) {
- *e_dentry = dentry;
- dentry->d_fsdata = ei;
- } else {
- /*
- * Should never happen unless we get here due to being freed.
- * Otherwise it means two dentries exist with the same name.
- */
- WARN_ON_ONCE(!ei->is_freed);
- dentry = NULL;
- }
- mutex_unlock(&eventfs_mutex);
-
- if (lookup)
- dput(dentry);
-
- return dentry;
-}
-
-/**
- * eventfs_post_create_dir - post create dir routine
- * @ei: eventfs_inode of recently created dir
- *
- * Map the meta-data of files within an eventfs dir to their parent dentry
- */
-static void eventfs_post_create_dir(struct eventfs_inode *ei)
-{
- struct eventfs_inode *ei_child;
- struct tracefs_inode *ti;
-
- lockdep_assert_held(&eventfs_mutex);
-
- /* srcu lock already held */
- /* fill parent-child relation */
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- srcu_read_lock_held(&eventfs_srcu)) {
- ei_child->d_parent = ei->dentry;
- }
-
- ti = get_tracefs(ei->dentry->d_inode);
- ti->private = ei;
-}
-
-/**
- * create_dir_dentry - Create a directory dentry for the eventfs_inode
- * @pei: The eventfs_inode parent of ei.
- * @ei: The eventfs_inode to create the directory for
- * @parent: The dentry of the parent of this directory
- * @lookup: True if this is called by the lookup code
- *
- * This creates and attaches a directory dentry to the eventfs_inode @ei.
- */
-static struct dentry *
-create_dir_dentry(struct eventfs_inode *pei, struct eventfs_inode *ei,
- struct dentry *parent, bool lookup)
-{
- struct dentry *dentry = NULL;
-
- WARN_ON_ONCE(!inode_is_locked(parent->d_inode));
-
- mutex_lock(&eventfs_mutex);
- if (pei->is_freed || ei->is_freed) {
- mutex_unlock(&eventfs_mutex);
- return NULL;
- }
- if (ei->dentry) {
- /* If the dentry already has a dentry, use it */
- dentry = ei->dentry;
- /* lookup does not need to up the ref count */
- if (!lookup)
- dget(dentry);
- mutex_unlock(&eventfs_mutex);
- return dentry;
- }
- mutex_unlock(&eventfs_mutex);
-
- dentry = create_dir(ei, parent);
-
- mutex_lock(&eventfs_mutex);
-
- if (IS_ERR_OR_NULL(dentry) && !ei->is_freed) {
- /*
- * When the mutex was released, something else could have
- * created the dentry for this e_dentry. In which case
- * use that one.
- *
- * If ei->is_freed is set, the e_dentry is currently on its
- * way to being freed.
- */
- dentry = ei->dentry;
- if (dentry && !lookup)
- dget(dentry);
- mutex_unlock(&eventfs_mutex);
- return dentry;
- }
-
- if (!ei->dentry && !ei->is_freed) {
- ei->dentry = dentry;
- eventfs_post_create_dir(ei);
- dentry->d_fsdata = ei;
- } else {
- /*
- * Should never happen unless we get here due to being freed.
- * Otherwise it means two dentries exist with the same name.
- */
- WARN_ON_ONCE(!ei->is_freed);
- dentry = NULL;
- }
- mutex_unlock(&eventfs_mutex);
-
- if (lookup)
- dput(dentry);
-
- return dentry;
+ return lookup_file(ei, dentry, mode, attr, data, fops);
}
/**
@@ -582,250 +476,153 @@ static struct dentry *eventfs_root_lookup(struct inode *dir,
struct dentry *dentry,
unsigned int flags)
{
- const struct file_operations *fops;
- const struct eventfs_entry *entry;
struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
struct eventfs_inode *ei;
- struct dentry *ei_dentry = NULL;
- struct dentry *ret = NULL;
const char *name = dentry->d_name.name;
- bool created = false;
- umode_t mode;
- void *data;
- int idx;
- int i;
- int r;
+ struct dentry *result = NULL;
ti = get_tracefs(dir);
if (!(ti->flags & TRACEFS_EVENT_INODE))
- return NULL;
-
- /* Grab srcu to prevent the ei from going away */
- idx = srcu_read_lock(&eventfs_srcu);
+ return ERR_PTR(-EIO);
- /*
- * Grab the eventfs_mutex to consistent value from ti->private.
- * This s
- */
mutex_lock(&eventfs_mutex);
- ei = READ_ONCE(ti->private);
- if (ei && !ei->is_freed)
- ei_dentry = READ_ONCE(ei->dentry);
- mutex_unlock(&eventfs_mutex);
- if (!ei || !ei_dentry)
+ ei = ti->private;
+ if (!ei || ei->is_freed)
goto out;
- data = ei->data;
-
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- srcu_read_lock_held(&eventfs_srcu)) {
+ list_for_each_entry(ei_child, &ei->children, list) {
if (strcmp(ei_child->name, name) != 0)
continue;
- ret = simple_lookup(dir, dentry, flags);
- if (IS_ERR(ret))
+ if (ei_child->is_freed)
goto out;
- create_dir_dentry(ei, ei_child, ei_dentry, true);
- created = true;
- break;
- }
-
- if (created)
+ result = lookup_dir_entry(dentry, ei, ei_child);
goto out;
-
- for (i = 0; i < ei->nr_entries; i++) {
- entry = &ei->entries[i];
- if (strcmp(name, entry->name) == 0) {
- void *cdata = data;
- mutex_lock(&eventfs_mutex);
- /* If ei->is_freed, then the event itself may be too */
- if (!ei->is_freed)
- r = entry->callback(name, &mode, &cdata, &fops);
- else
- r = -1;
- mutex_unlock(&eventfs_mutex);
- if (r <= 0)
- continue;
- ret = simple_lookup(dir, dentry, flags);
- if (IS_ERR(ret))
- goto out;
- create_file_dentry(ei, i, ei_dentry, name, mode, cdata,
- fops, true);
- break;
- }
}
- out:
- srcu_read_unlock(&eventfs_srcu, idx);
- return ret;
-}
-
-struct dentry_list {
- void *cursor;
- struct dentry **dentries;
-};
-/**
- * eventfs_release - called to release eventfs file/dir
- * @inode: inode to be released
- * @file: file to be released (not used)
- */
-static int eventfs_release(struct inode *inode, struct file *file)
-{
- struct tracefs_inode *ti;
- struct dentry_list *dlist = file->private_data;
- void *cursor;
- int i;
+ for (int i = 0; i < ei->nr_entries; i++) {
+ void *data;
+ umode_t mode;
+ const struct file_operations *fops;
+ const struct eventfs_entry *entry = &ei->entries[i];
- ti = get_tracefs(inode);
- if (!(ti->flags & TRACEFS_EVENT_INODE))
- return -EINVAL;
+ if (strcmp(name, entry->name) != 0)
+ continue;
- if (WARN_ON_ONCE(!dlist))
- return -EINVAL;
+ data = ei->data;
+ if (entry->callback(name, &mode, &data, &fops) <= 0)
+ goto out;
- for (i = 0; dlist->dentries && dlist->dentries[i]; i++) {
- dput(dlist->dentries[i]);
+ result = lookup_file_dentry(dentry, ei, i, mode, data, fops);
+ goto out;
}
-
- cursor = dlist->cursor;
- kfree(dlist->dentries);
- kfree(dlist);
- file->private_data = cursor;
- return dcache_dir_close(inode, file);
-}
-
-static int add_dentries(struct dentry ***dentries, struct dentry *d, int cnt)
-{
- struct dentry **tmp;
-
- tmp = krealloc(*dentries, sizeof(d) * (cnt + 2), GFP_NOFS);
- if (!tmp)
- return -1;
- tmp[cnt] = d;
- tmp[cnt + 1] = NULL;
- *dentries = tmp;
- return 0;
+ out:
+ mutex_unlock(&eventfs_mutex);
+ return result;
}
-/**
- * dcache_dir_open_wrapper - eventfs open wrapper
- * @inode: not used
- * @file: dir to be opened (to create it's children)
- *
- * Used to dynamic create file/dir with-in @file, all the
- * file/dir will be created. If already created then references
- * will be increased
+/*
+ * Walk the children of a eventfs_inode to fill in getdents().
*/
-static int dcache_dir_open_wrapper(struct inode *inode, struct file *file)
+static int eventfs_iterate(struct file *file, struct dir_context *ctx)
{
const struct file_operations *fops;
+ struct inode *f_inode = file_inode(file);
const struct eventfs_entry *entry;
struct eventfs_inode *ei_child;
struct tracefs_inode *ti;
struct eventfs_inode *ei;
- struct dentry_list *dlist;
- struct dentry **dentries = NULL;
- struct dentry *parent = file_dentry(file);
- struct dentry *d;
- struct inode *f_inode = file_inode(file);
- const char *name = parent->d_name.name;
+ const char *name;
umode_t mode;
- void *data;
- int cnt = 0;
int idx;
- int ret;
- int i;
- int r;
+ int ret = -EINVAL;
+ int ino;
+ int i, r, c;
+
+ if (!dir_emit_dots(file, ctx))
+ return 0;
ti = get_tracefs(f_inode);
if (!(ti->flags & TRACEFS_EVENT_INODE))
return -EINVAL;
- if (WARN_ON_ONCE(file->private_data))
- return -EINVAL;
+ c = ctx->pos - 2;
idx = srcu_read_lock(&eventfs_srcu);
mutex_lock(&eventfs_mutex);
ei = READ_ONCE(ti->private);
+ if (ei && ei->is_freed)
+ ei = NULL;
mutex_unlock(&eventfs_mutex);
- if (!ei) {
- srcu_read_unlock(&eventfs_srcu, idx);
- return -EINVAL;
- }
-
+ if (!ei)
+ goto out;
- data = ei->data;
+ /*
+ * Need to create the dentries and inodes to have a consistent
+ * inode number.
+ */
+ ret = 0;
- dlist = kmalloc(sizeof(*dlist), GFP_KERNEL);
- if (!dlist) {
- srcu_read_unlock(&eventfs_srcu, idx);
- return -ENOMEM;
- }
+ /* Start at 'c' to jump over already read entries */
+ for (i = c; i < ei->nr_entries; i++, ctx->pos++) {
+ void *cdata = ei->data;
- inode_lock(parent->d_inode);
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- srcu_read_lock_held(&eventfs_srcu)) {
- d = create_dir_dentry(ei, ei_child, parent, false);
- if (d) {
- ret = add_dentries(&dentries, d, cnt);
- if (ret < 0)
- break;
- cnt++;
- }
- }
-
- for (i = 0; i < ei->nr_entries; i++) {
- void *cdata = data;
entry = &ei->entries[i];
name = entry->name;
+
mutex_lock(&eventfs_mutex);
- /* If ei->is_freed, then the event itself may be too */
- if (!ei->is_freed)
- r = entry->callback(name, &mode, &cdata, &fops);
- else
- r = -1;
+ /* If ei->is_freed then just bail here, nothing more to do */
+ if (ei->is_freed) {
+ mutex_unlock(&eventfs_mutex);
+ goto out;
+ }
+ r = entry->callback(name, &mode, &cdata, &fops);
mutex_unlock(&eventfs_mutex);
if (r <= 0)
continue;
- d = create_file_dentry(ei, i, parent, name, mode, cdata, fops, false);
- if (d) {
- ret = add_dentries(&dentries, d, cnt);
- if (ret < 0)
- break;
- cnt++;
+
+ ino = EVENTFS_FILE_INODE_INO;
+
+ if (!dir_emit(ctx, name, strlen(name), ino, DT_REG))
+ goto out;
+ }
+
+ /* Subtract the skipped entries above */
+ c -= min((unsigned int)c, (unsigned int)ei->nr_entries);
+
+ list_for_each_entry_srcu(ei_child, &ei->children, list,
+ srcu_read_lock_held(&eventfs_srcu)) {
+
+ if (c > 0) {
+ c--;
+ continue;
}
+
+ ctx->pos++;
+
+ if (ei_child->is_freed)
+ continue;
+
+ name = ei_child->name;
+
+ ino = eventfs_dir_ino(ei_child);
+
+ if (!dir_emit(ctx, name, strlen(name), ino, DT_DIR))
+ goto out_dec;
}
- inode_unlock(parent->d_inode);
+ ret = 1;
+ out:
srcu_read_unlock(&eventfs_srcu, idx);
- ret = dcache_dir_open(inode, file);
- /*
- * dcache_dir_open() sets file->private_data to a dentry cursor.
- * Need to save that but also save all the dentries that were
- * opened by this function.
- */
- dlist->cursor = file->private_data;
- dlist->dentries = dentries;
- file->private_data = dlist;
return ret;
-}
-
-/*
- * This just sets the file->private_data back to the cursor and back.
- */
-static int dcache_readdir_wrapper(struct file *file, struct dir_context *ctx)
-{
- struct dentry_list *dlist = file->private_data;
- int ret;
- file->private_data = dlist->cursor;
- ret = dcache_readdir(file, ctx);
- dlist->cursor = file->private_data;
- file->private_data = dlist;
- return ret;
+ out_dec:
+ /* Incremented ctx->pos without adding something, reset it */
+ ctx->pos--;
+ goto out;
}
/**
@@ -872,25 +669,10 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
if (!parent)
return ERR_PTR(-EINVAL);
- ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ ei = alloc_ei(name);
if (!ei)
return ERR_PTR(-ENOMEM);
- ei->name = kstrdup_const(name, GFP_KERNEL);
- if (!ei->name) {
- kfree(ei);
- return ERR_PTR(-ENOMEM);
- }
-
- if (size) {
- ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
- if (!ei->d_children) {
- kfree_const(ei->name);
- kfree(ei);
- return ERR_PTR(-ENOMEM);
- }
- }
-
ei->entries = entries;
ei->nr_entries = size;
ei->data = data;
@@ -898,10 +680,8 @@ struct eventfs_inode *eventfs_create_dir(const char *name, struct eventfs_inode
INIT_LIST_HEAD(&ei->list);
mutex_lock(&eventfs_mutex);
- if (!parent->is_freed) {
+ if (!parent->is_freed)
list_add_tail(&ei->list, &parent->children);
- ei->d_parent = parent->dentry;
- }
mutex_unlock(&eventfs_mutex);
/* Was the parent freed? */
@@ -941,33 +721,33 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
if (IS_ERR(dentry))
return ERR_CAST(dentry);
- ei = kzalloc(sizeof(*ei), GFP_KERNEL);
+ ei = alloc_ei(name);
if (!ei)
- goto fail_ei;
+ goto fail;
inode = tracefs_get_inode(dentry->d_sb);
if (unlikely(!inode))
goto fail;
- if (size) {
- ei->d_children = kzalloc(sizeof(*ei->d_children) * size, GFP_KERNEL);
- if (!ei->d_children)
- goto fail;
- }
-
- ei->dentry = dentry;
+ // Note: we have a ref to the dentry from tracefs_start_creating()
+ ei->events_dir = dentry;
ei->entries = entries;
ei->nr_entries = size;
ei->is_events = 1;
ei->data = data;
- ei->name = kstrdup_const(name, GFP_KERNEL);
- if (!ei->name)
- goto fail;
/* Save the ownership of this directory */
uid = d_inode(dentry->d_parent)->i_uid;
gid = d_inode(dentry->d_parent)->i_gid;
+ /*
+ * If the events directory is of the top instance, then parent
+ * is NULL. Set the attr.mode to reflect this and its permissions will
+ * default to the tracefs root dentry.
+ */
+ if (!parent)
+ ei->attr.mode = EVENTFS_TOPLEVEL;
+
/* This is used as the default ownership of the files and directories */
ei->attr.uid = uid;
ei->attr.gid = gid;
@@ -985,11 +765,19 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
inode->i_op = &eventfs_root_dir_inode_operations;
inode->i_fop = &eventfs_file_operations;
- dentry->d_fsdata = ei;
+ dentry->d_fsdata = get_ei(ei);
- /* directory inodes start off with i_nlink == 2 (for "." entry) */
- inc_nlink(inode);
+ /*
+ * Keep all eventfs directories with i_nlink == 1.
+ * Due to the dynamic nature of the dentry creations and not
+ * wanting to add a pointer to the parent eventfs_inode in the
+ * eventfs_inode structure, keeping the i_nlink in sync with the
+ * number of directories would cause too much complexity for
+ * something not worth much. Keeping directory links at 1
+ * tells userspace not to trust the link number.
+ */
d_instantiate(dentry, inode);
+ /* The dentry of the "events" parent does keep track though */
inc_nlink(dentry->d_parent->d_inode);
fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
tracefs_end_creating(dentry);
@@ -997,72 +785,11 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry
return ei;
fail:
- kfree(ei->d_children);
- kfree(ei);
- fail_ei:
+ free_ei(ei);
tracefs_failed_creating(dentry);
return ERR_PTR(-ENOMEM);
}
-static LLIST_HEAD(free_list);
-
-static void eventfs_workfn(struct work_struct *work)
-{
- struct eventfs_inode *ei, *tmp;
- struct llist_node *llnode;
-
- llnode = llist_del_all(&free_list);
- llist_for_each_entry_safe(ei, tmp, llnode, llist) {
- /* This dput() matches the dget() from unhook_dentry() */
- for (int i = 0; i < ei->nr_entries; i++) {
- if (ei->d_children[i])
- dput(ei->d_children[i]);
- }
- /* This should only get here if it had a dentry */
- if (!WARN_ON_ONCE(!ei->dentry))
- dput(ei->dentry);
- }
-}
-
-static DECLARE_WORK(eventfs_work, eventfs_workfn);
-
-static void free_rcu_ei(struct rcu_head *head)
-{
- struct eventfs_inode *ei = container_of(head, struct eventfs_inode, rcu);
-
- if (ei->dentry) {
- /* Do not free the ei until all references of dentry are gone */
- if (llist_add(&ei->llist, &free_list))
- queue_work(system_unbound_wq, &eventfs_work);
- return;
- }
-
- /* If the ei doesn't have a dentry, neither should its children */
- for (int i = 0; i < ei->nr_entries; i++) {
- WARN_ON_ONCE(ei->d_children[i]);
- }
-
- free_ei(ei);
-}
-
-static void unhook_dentry(struct dentry *dentry)
-{
- if (!dentry)
- return;
- /*
- * Need to add a reference to the dentry that is expected by
- * simple_recursive_removal(), which will include a dput().
- */
- dget(dentry);
-
- /*
- * Also add a reference for the dput() in eventfs_workfn().
- * That is required as that dput() will free the ei after
- * the SRCU grace period is over.
- */
- dget(dentry);
-}
-
/**
* eventfs_remove_rec - remove eventfs dir or file from list
* @ei: eventfs_inode to be removed.
@@ -1075,8 +802,6 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
{
struct eventfs_inode *ei_child;
- if (!ei)
- return;
/*
* Check recursion depth. It should never be greater than 3:
* 0 - events/
@@ -1088,28 +813,11 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
return;
/* search for nested folders or files */
- list_for_each_entry_srcu(ei_child, &ei->children, list,
- lockdep_is_held(&eventfs_mutex)) {
- /* Children only have dentry if parent does */
- WARN_ON_ONCE(ei_child->dentry && !ei->dentry);
+ list_for_each_entry(ei_child, &ei->children, list)
eventfs_remove_rec(ei_child, level + 1);
- }
-
-
- ei->is_freed = 1;
- for (int i = 0; i < ei->nr_entries; i++) {
- if (ei->d_children[i]) {
- /* Children only have dentry if parent does */
- WARN_ON_ONCE(!ei->dentry);
- unhook_dentry(ei->d_children[i]);
- }
- }
-
- unhook_dentry(ei->dentry);
-
- list_del_rcu(&ei->list);
- call_srcu(&eventfs_srcu, &ei->rcu, free_rcu_ei);
+ list_del(&ei->list);
+ free_ei(ei);
}
/**
@@ -1120,22 +828,12 @@ static void eventfs_remove_rec(struct eventfs_inode *ei, int level)
*/
void eventfs_remove_dir(struct eventfs_inode *ei)
{
- struct dentry *dentry;
-
if (!ei)
return;
mutex_lock(&eventfs_mutex);
- dentry = ei->dentry;
eventfs_remove_rec(ei, 0);
mutex_unlock(&eventfs_mutex);
-
- /*
- * If any of the ei children has a dentry, then the ei itself
- * must have a dentry.
- */
- if (dentry)
- simple_recursive_removal(dentry, NULL);
}
/**
@@ -1148,7 +846,11 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei)
{
struct dentry *dentry;
- dentry = ei->dentry;
+ dentry = ei->events_dir;
+ if (!dentry)
+ return;
+
+ ei->events_dir = NULL;
eventfs_remove_dir(ei);
/*
@@ -1158,5 +860,6 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei)
* sticks around while the other ei->dentry are created
* and destroyed dynamically.
*/
+ d_invalidate(dentry);
dput(dentry);
}
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index bc86ffdb103b..d65ffad4c327 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -38,8 +38,6 @@ static struct inode *tracefs_alloc_inode(struct super_block *sb)
if (!ti)
return NULL;
- ti->flags = 0;
-
return &ti->vfs_inode;
}
@@ -91,6 +89,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
struct inode *inode, struct dentry *dentry,
umode_t mode)
{
+ struct tracefs_inode *ti;
char *name;
int ret;
@@ -99,6 +98,15 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap,
return -ENOMEM;
/*
+ * This is a new directory that does not take the default of
+ * the rootfs. It becomes the default permissions for all the
+ * files and directories underneath it.
+ */
+ ti = get_tracefs(inode);
+ ti->flags |= TRACEFS_INSTANCE_INODE;
+ ti->private = inode;
+
+ /*
* The mkdir call can call the generic functions that create
* the files within the tracefs system. It is up to the individual
* mkdir routine to handle races.
@@ -141,10 +149,76 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
return ret;
}
-static const struct inode_operations tracefs_dir_inode_operations = {
+static void set_tracefs_inode_owner(struct inode *inode)
+{
+ struct tracefs_inode *ti = get_tracefs(inode);
+ struct inode *root_inode = ti->private;
+
+ /*
+ * If this inode has never been referenced, then update
+ * the permissions to the superblock.
+ */
+ if (!(ti->flags & TRACEFS_UID_PERM_SET))
+ inode->i_uid = root_inode->i_uid;
+
+ if (!(ti->flags & TRACEFS_GID_PERM_SET))
+ inode->i_gid = root_inode->i_gid;
+}
+
+static int tracefs_permission(struct mnt_idmap *idmap,
+ struct inode *inode, int mask)
+{
+ set_tracefs_inode_owner(inode);
+ return generic_permission(idmap, inode, mask);
+}
+
+static int tracefs_getattr(struct mnt_idmap *idmap,
+ const struct path *path, struct kstat *stat,
+ u32 request_mask, unsigned int flags)
+{
+ struct inode *inode = d_backing_inode(path->dentry);
+
+ set_tracefs_inode_owner(inode);
+ generic_fillattr(idmap, request_mask, inode, stat);
+ return 0;
+}
+
+static int tracefs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
+ struct iattr *attr)
+{
+ unsigned int ia_valid = attr->ia_valid;
+ struct inode *inode = d_inode(dentry);
+ struct tracefs_inode *ti = get_tracefs(inode);
+
+ if (ia_valid & ATTR_UID)
+ ti->flags |= TRACEFS_UID_PERM_SET;
+
+ if (ia_valid & ATTR_GID)
+ ti->flags |= TRACEFS_GID_PERM_SET;
+
+ return simple_setattr(idmap, dentry, attr);
+}
+
+static const struct inode_operations tracefs_instance_dir_inode_operations = {
.lookup = simple_lookup,
.mkdir = tracefs_syscall_mkdir,
.rmdir = tracefs_syscall_rmdir,
+ .permission = tracefs_permission,
+ .getattr = tracefs_getattr,
+ .setattr = tracefs_setattr,
+};
+
+static const struct inode_operations tracefs_dir_inode_operations = {
+ .lookup = simple_lookup,
+ .permission = tracefs_permission,
+ .getattr = tracefs_getattr,
+ .setattr = tracefs_setattr,
+};
+
+static const struct inode_operations tracefs_file_inode_operations = {
+ .permission = tracefs_permission,
+ .getattr = tracefs_getattr,
+ .setattr = tracefs_setattr,
};
struct inode *tracefs_get_inode(struct super_block *sb)
@@ -183,87 +257,6 @@ struct tracefs_fs_info {
struct tracefs_mount_opts mount_opts;
};
-static void change_gid(struct dentry *dentry, kgid_t gid)
-{
- if (!dentry->d_inode)
- return;
- dentry->d_inode->i_gid = gid;
-}
-
-/*
- * Taken from d_walk, but without he need for handling renames.
- * Nothing can be renamed while walking the list, as tracefs
- * does not support renames. This is only called when mounting
- * or remounting the file system, to set all the files to
- * the given gid.
- */
-static void set_gid(struct dentry *parent, kgid_t gid)
-{
- struct dentry *this_parent;
- struct list_head *next;
-
- this_parent = parent;
- spin_lock(&this_parent->d_lock);
-
- change_gid(this_parent, gid);
-repeat:
- next = this_parent->d_subdirs.next;
-resume:
- while (next != &this_parent->d_subdirs) {
- struct tracefs_inode *ti;
- struct list_head *tmp = next;
- struct dentry *dentry = list_entry(tmp, struct dentry, d_child);
- next = tmp->next;
-
- /* Note, getdents() can add a cursor dentry with no inode */
- if (!dentry->d_inode)
- continue;
-
- spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-
- change_gid(dentry, gid);
-
- /* If this is the events directory, update that too */
- ti = get_tracefs(dentry->d_inode);
- if (ti && (ti->flags & TRACEFS_EVENT_INODE))
- eventfs_update_gid(dentry, gid);
-
- if (!list_empty(&dentry->d_subdirs)) {
- spin_unlock(&this_parent->d_lock);
- spin_release(&dentry->d_lock.dep_map, _RET_IP_);
- this_parent = dentry;
- spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_);
- goto repeat;
- }
- spin_unlock(&dentry->d_lock);
- }
- /*
- * All done at this level ... ascend and resume the search.
- */
- rcu_read_lock();
-ascend:
- if (this_parent != parent) {
- struct dentry *child = this_parent;
- this_parent = child->d_parent;
-
- spin_unlock(&child->d_lock);
- spin_lock(&this_parent->d_lock);
-
- /* go into the first sibling still alive */
- do {
- next = child->d_child.next;
- if (next == &this_parent->d_subdirs)
- goto ascend;
- child = list_entry(next, struct dentry, d_child);
- } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED));
- rcu_read_unlock();
- goto resume;
- }
- rcu_read_unlock();
- spin_unlock(&this_parent->d_lock);
- return;
-}
-
static int tracefs_parse_options(char *data, struct tracefs_mount_opts *opts)
{
substring_t args[MAX_OPT_ARGS];
@@ -336,10 +329,8 @@ static int tracefs_apply_options(struct super_block *sb, bool remount)
if (!remount || opts->opts & BIT(Opt_uid))
inode->i_uid = opts->uid;
- if (!remount || opts->opts & BIT(Opt_gid)) {
- /* Set all the group ids to the mount option */
- set_gid(sb->s_root, opts->gid);
- }
+ if (!remount || opts->opts & BIT(Opt_gid))
+ inode->i_gid = opts->gid;
return 0;
}
@@ -386,21 +377,30 @@ static const struct super_operations tracefs_super_operations = {
.show_options = tracefs_show_options,
};
-static void tracefs_dentry_iput(struct dentry *dentry, struct inode *inode)
+/*
+ * It would be cleaner if eventfs had its own dentry ops.
+ *
+ * Note that d_revalidate is called potentially under RCU,
+ * so it can't take the eventfs mutex etc. It's fine - if
+ * we open a file just as it's marked dead, things will
+ * still work just fine, and just see the old stale case.
+ */
+static void tracefs_d_release(struct dentry *dentry)
{
- struct tracefs_inode *ti;
+ if (dentry->d_fsdata)
+ eventfs_d_release(dentry);
+}
- if (!dentry || !inode)
- return;
+static int tracefs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+ struct eventfs_inode *ei = dentry->d_fsdata;
- ti = get_tracefs(inode);
- if (ti && ti->flags & TRACEFS_EVENT_INODE)
- eventfs_set_ei_status_free(ti, dentry);
- iput(inode);
+ return !(ei && ei->is_freed);
}
static const struct dentry_operations tracefs_dentry_operations = {
- .d_iput = tracefs_dentry_iput,
+ .d_revalidate = tracefs_d_revalidate,
+ .d_release = tracefs_d_release,
};
static int trace_fill_super(struct super_block *sb, void *data, int silent)
@@ -504,73 +504,24 @@ struct dentry *tracefs_end_creating(struct dentry *dentry)
return dentry;
}
-/**
- * eventfs_start_creating - start the process of creating a dentry
- * @name: Name of the file created for the dentry
- * @parent: The parent dentry where this dentry will be created
- *
- * This is a simple helper function for the dynamically created eventfs
- * files. When the directory of the eventfs files are accessed, their
- * dentries are created on the fly. This function is used to start that
- * process.
- */
-struct dentry *eventfs_start_creating(const char *name, struct dentry *parent)
+/* Find the inode that this will use for default */
+static struct inode *instance_inode(struct dentry *parent, struct inode *inode)
{
- struct dentry *dentry;
- int error;
-
- /* Must always have a parent. */
- if (WARN_ON_ONCE(!parent))
- return ERR_PTR(-EINVAL);
-
- error = simple_pin_fs(&trace_fs_type, &tracefs_mount,
- &tracefs_mount_count);
- if (error)
- return ERR_PTR(error);
+ struct tracefs_inode *ti;
- if (unlikely(IS_DEADDIR(parent->d_inode)))
- dentry = ERR_PTR(-ENOENT);
- else
- dentry = lookup_one_len(name, parent, strlen(name));
+ /* If parent is NULL then use root inode */
+ if (!parent)
+ return d_inode(inode->i_sb->s_root);
- if (!IS_ERR(dentry) && dentry->d_inode) {
- dput(dentry);
- dentry = ERR_PTR(-EEXIST);
+ /* Find the inode that is flagged as an instance or the root inode */
+ while (!IS_ROOT(parent)) {
+ ti = get_tracefs(d_inode(parent));
+ if (ti->flags & TRACEFS_INSTANCE_INODE)
+ break;
+ parent = parent->d_parent;
}
- if (IS_ERR(dentry))
- simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-
- return dentry;
-}
-
-/**
- * eventfs_failed_creating - clean up a failed eventfs dentry creation
- * @dentry: The dentry to clean up
- *
- * If after calling eventfs_start_creating(), a failure is detected, the
- * resources created by eventfs_start_creating() needs to be cleaned up. In
- * that case, this function should be called to perform that clean up.
- */
-struct dentry *eventfs_failed_creating(struct dentry *dentry)
-{
- dput(dentry);
- simple_release_fs(&tracefs_mount, &tracefs_mount_count);
- return NULL;
-}
-
-/**
- * eventfs_end_creating - Finish the process of creating a eventfs dentry
- * @dentry: The dentry that has successfully been created.
- *
- * This function is currently just a place holder to match
- * eventfs_start_creating(). In case any synchronization needs to be added,
- * this function will be used to implement that without having to modify
- * the callers of eventfs_start_creating().
- */
-struct dentry *eventfs_end_creating(struct dentry *dentry)
-{
- return dentry;
+ return d_inode(parent);
}
/**
@@ -603,6 +554,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
struct dentry *parent, void *data,
const struct file_operations *fops)
{
+ struct tracefs_inode *ti;
struct dentry *dentry;
struct inode *inode;
@@ -621,7 +573,11 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
if (unlikely(!inode))
return tracefs_failed_creating(dentry);
+ ti = get_tracefs(inode);
+ ti->private = instance_inode(parent, inode);
+
inode->i_mode = mode;
+ inode->i_op = &tracefs_file_inode_operations;
inode->i_fop = fops ? fops : &tracefs_file_operations;
inode->i_private = data;
inode->i_uid = d_inode(dentry->d_parent)->i_uid;
@@ -634,6 +590,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode,
static struct dentry *__create_dir(const char *name, struct dentry *parent,
const struct inode_operations *ops)
{
+ struct tracefs_inode *ti;
struct dentry *dentry = tracefs_start_creating(name, parent);
struct inode *inode;
@@ -651,6 +608,9 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent,
inode->i_uid = d_inode(dentry->d_parent)->i_uid;
inode->i_gid = d_inode(dentry->d_parent)->i_gid;
+ ti = get_tracefs(inode);
+ ti->private = instance_inode(parent, inode);
+
/* directory inodes start off with i_nlink == 2 (for "." entry) */
inc_nlink(inode);
d_instantiate(dentry, inode);
@@ -681,7 +641,7 @@ struct dentry *tracefs_create_dir(const char *name, struct dentry *parent)
if (security_locked_down(LOCKDOWN_TRACEFS))
return NULL;
- return __create_dir(name, parent, &simple_dir_inode_operations);
+ return __create_dir(name, parent, &tracefs_dir_inode_operations);
}
/**
@@ -712,7 +672,7 @@ __init struct dentry *tracefs_create_instance_dir(const char *name,
if (WARN_ON(tracefs_ops.mkdir || tracefs_ops.rmdir))
return NULL;
- dentry = __create_dir(name, parent, &tracefs_dir_inode_operations);
+ dentry = __create_dir(name, parent, &tracefs_instance_dir_inode_operations);
if (!dentry)
return NULL;
@@ -757,7 +717,11 @@ static void init_once(void *foo)
{
struct tracefs_inode *ti = (struct tracefs_inode *) foo;
+ /* inode_init_once() calls memset() on the vfs_inode portion */
inode_init_once(&ti->vfs_inode);
+
+ /* Zero out the rest */
+ memset_after(ti, 0, vfs_inode);
}
static int __init tracefs_init(void)
diff --git a/fs/tracefs/internal.h b/fs/tracefs/internal.h
index 42bdeb471a07..beb3dcd0e434 100644
--- a/fs/tracefs/internal.h
+++ b/fs/tracefs/internal.h
@@ -5,12 +5,16 @@
enum {
TRACEFS_EVENT_INODE = BIT(1),
TRACEFS_EVENT_TOP_INODE = BIT(2),
+ TRACEFS_GID_PERM_SET = BIT(3),
+ TRACEFS_UID_PERM_SET = BIT(4),
+ TRACEFS_INSTANCE_INODE = BIT(5),
};
struct tracefs_inode {
+ struct inode vfs_inode;
+ /* The below gets initialized with memset_after(ti, 0, vfs_inode) */
unsigned long flags;
void *private;
- struct inode vfs_inode;
};
/*
@@ -28,42 +32,37 @@ struct eventfs_attr {
/*
* struct eventfs_inode - hold the properties of the eventfs directories.
* @list: link list into the parent directory
+ * @rcu: Union with @list for freeing
+ * @children: link list into the child eventfs_inode
* @entries: the array of entries representing the files in the directory
* @name: the name of the directory to create
- * @children: link list into the child eventfs_inode
- * @dentry: the dentry of the directory
- * @d_parent: pointer to the parent's dentry
- * @d_children: The array of dentries to represent the files when created
+ * @events_dir: the dentry of the events directory
* @entry_attrs: Saved mode and ownership of the @d_children
- * @attr: Saved mode and ownership of eventfs_inode itself
* @data: The private data to pass to the callbacks
+ * @attr: Saved mode and ownership of eventfs_inode itself
* @is_freed: Flag set if the eventfs is on its way to be freed
* Note if is_freed is set, then dentry is corrupted.
+ * @is_events: Flag set for only the top level "events" directory
* @nr_entries: The number of items in @entries
+ * @ino: The saved inode number
*/
struct eventfs_inode {
- struct list_head list;
+ union {
+ struct list_head list;
+ struct rcu_head rcu;
+ };
+ struct list_head children;
const struct eventfs_entry *entries;
const char *name;
- struct list_head children;
- struct dentry *dentry; /* Check is_freed to access */
- struct dentry *d_parent;
- struct dentry **d_children;
+ struct dentry *events_dir;
struct eventfs_attr *entry_attrs;
- struct eventfs_attr attr;
void *data;
- /*
- * Union - used for deletion
- * @llist: for calling dput() if needed after RCU
- * @rcu: eventfs_inode to delete in RCU
- */
- union {
- struct llist_node llist;
- struct rcu_head rcu;
- };
+ struct eventfs_attr attr;
+ struct kref kref;
unsigned int is_freed:1;
unsigned int is_events:1;
unsigned int nr_entries:30;
+ unsigned int ino;
};
static inline struct tracefs_inode *get_tracefs(const struct inode *inode)
@@ -75,10 +74,7 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent);
struct dentry *tracefs_end_creating(struct dentry *dentry);
struct dentry *tracefs_failed_creating(struct dentry *dentry);
struct inode *tracefs_get_inode(struct super_block *sb);
-struct dentry *eventfs_start_creating(const char *name, struct dentry *parent);
-struct dentry *eventfs_failed_creating(struct dentry *dentry);
-struct dentry *eventfs_end_creating(struct dentry *dentry);
-void eventfs_update_gid(struct dentry *dentry, kgid_t gid);
-void eventfs_set_ei_status_free(struct tracefs_inode *ti, struct dentry *dentry);
+
+void eventfs_d_release(struct dentry *dentry);
#endif /* _TRACEFS_INTERNAL_H */
diff --git a/fs/ubifs/auth.c b/fs/ubifs/auth.c
index 0d561ecb6869..a4a0158f712d 100644
--- a/fs/ubifs/auth.c
+++ b/fs/ubifs/auth.c
@@ -18,7 +18,7 @@
#include "ubifs.h"
/**
- * ubifs_node_calc_hash - calculate the hash of a UBIFS node
+ * __ubifs_node_calc_hash - calculate the hash of a UBIFS node
* @c: UBIFS file-system description object
* @node: the node to calculate a hash for
* @hash: the returned hash
@@ -507,28 +507,13 @@ out:
*/
int ubifs_hmac_wkm(struct ubifs_info *c, u8 *hmac)
{
- SHASH_DESC_ON_STACK(shash, c->hmac_tfm);
- int err;
const char well_known_message[] = "UBIFS";
if (!ubifs_authenticated(c))
return 0;
- shash->tfm = c->hmac_tfm;
-
- err = crypto_shash_init(shash);
- if (err)
- return err;
-
- err = crypto_shash_update(shash, well_known_message,
- sizeof(well_known_message) - 1);
- if (err < 0)
- return err;
-
- err = crypto_shash_final(shash, hmac);
- if (err)
- return err;
- return 0;
+ return crypto_shash_tfm_digest(c->hmac_tfm, well_known_message,
+ sizeof(well_known_message) - 1, hmac);
}
/*
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index c4fc1047fc07..5b3a840098b0 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -70,18 +70,29 @@ static int nothing_to_commit(struct ubifs_info *c)
return 0;
/*
+ * Increasing @c->dirty_pn_cnt/@c->dirty_nn_cnt and marking
+ * nnodes/pnodes as dirty in run_gc() could race with following
+ * checking, which leads inconsistent states between @c->nroot
+ * and @c->dirty_pn_cnt/@c->dirty_nn_cnt, holding @c->lp_mutex
+ * to avoid that.
+ */
+ mutex_lock(&c->lp_mutex);
+ /*
* Even though the TNC is clean, the LPT tree may have dirty nodes. For
* example, this may happen if the budgeting subsystem invoked GC to
* make some free space, and the GC found an LEB with only dirty and
* free space. In this case GC would just change the lprops of this
* LEB (by turning all space into free space) and unmap it.
*/
- if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags))
+ if (c->nroot && test_bit(DIRTY_CNODE, &c->nroot->flags)) {
+ mutex_unlock(&c->lp_mutex);
return 0;
+ }
ubifs_assert(c, atomic_long_read(&c->dirty_zn_cnt) == 0);
ubifs_assert(c, c->dirty_pn_cnt == 0);
ubifs_assert(c, c->dirty_nn_cnt == 0);
+ mutex_unlock(&c->lp_mutex);
return 1;
}
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index 3b13c648d490..e413a9cf8ee3 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -1234,6 +1234,8 @@ out_cancel:
dir_ui->ui_size = dir->i_size;
mutex_unlock(&dir_ui->ui_mutex);
out_inode:
+ /* Free inode->i_link before inode is marked as bad. */
+ fscrypt_free_inode(inode);
make_bad_inode(inode);
iput(inode);
out_fname:
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2d2b39f843ce..5029eb3390a5 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -318,8 +318,9 @@ static int write_begin_slow(struct address_space *mapping,
* This is a helper function for 'ubifs_write_begin()' which allocates budget
* for the operation. The budget is allocated differently depending on whether
* this is appending, whether the page is dirty or not, and so on. This
- * function leaves the @ui->ui_mutex locked in case of appending. Returns zero
- * in case of success and %-ENOSPC in case of failure.
+ * function leaves the @ui->ui_mutex locked in case of appending.
+ *
+ * Returns: %0 in case of success and %-ENOSPC in case of failure.
*/
static int allocate_budget(struct ubifs_info *c, struct page *page,
struct ubifs_inode *ui, int appending)
@@ -600,7 +601,7 @@ out:
* @bu: bulk-read information
* @n: next zbranch slot
*
- * This function returns %0 on success and a negative error code on failure.
+ * Returns: %0 on success and a negative error code on failure.
*/
static int populate_page(struct ubifs_info *c, struct page *page,
struct bu_info *bu, int *n)
@@ -711,7 +712,7 @@ out_err:
* @bu: bulk-read information
* @page1: first page to read
*
- * This function returns %1 if the bulk-read is done, otherwise %0 is returned.
+ * Returns: %1 if the bulk-read is done, otherwise %0 is returned.
*/
static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
struct page *page1)
@@ -821,7 +822,9 @@ out_bu_off:
* Some flash media are capable of reading sequentially at faster rates. UBIFS
* bulk-read facility is designed to take advantage of that, by reading in one
* go consecutive data nodes that are also located consecutively in the same
- * LEB. This function returns %1 if a bulk-read is done and %0 otherwise.
+ * LEB.
+ *
+ * Returns: %1 if a bulk-read is done and %0 otherwise.
*/
static int ubifs_bulk_read(struct page *page)
{
@@ -1109,7 +1112,9 @@ static void do_attr_changes(struct inode *inode, const struct iattr *attr)
* @attr: inode attribute changes description
*
* This function implements VFS '->setattr()' call when the inode is truncated
- * to a smaller size. Returns zero in case of success and a negative error code
+ * to a smaller size.
+ *
+ * Returns: %0 in case of success and a negative error code
* in case of failure.
*/
static int do_truncation(struct ubifs_info *c, struct inode *inode,
@@ -1215,7 +1220,9 @@ out_budg:
* @attr: inode attribute changes description
*
* This function implements VFS '->setattr()' call for all cases except
- * truncations to smaller size. Returns zero in case of success and a negative
+ * truncations to smaller size.
+ *
+ * Returns: %0 in case of success and a negative
* error code in case of failure.
*/
static int do_setattr(struct ubifs_info *c, struct inode *inode,
@@ -1360,6 +1367,8 @@ out:
* This helper function checks if the inode mtime/ctime should be updated or
* not. If current values of the time-stamps are within the UBIFS inode time
* granularity, they are not updated. This is an optimization.
+ *
+ * Returns: %1 if time update is needed, %0 if not
*/
static inline int mctime_update_needed(const struct inode *inode,
const struct timespec64 *now)
@@ -1375,11 +1384,12 @@ static inline int mctime_update_needed(const struct inode *inode,
/**
* ubifs_update_time - update time of inode.
* @inode: inode to update
- * @time: timespec structure to hold the current time value
* @flags: time updating control flag determines updating
* which time fields of @inode
*
* This function updates time of the inode.
+ *
+ * Returns: %0 for success or a negative error code otherwise.
*/
int ubifs_update_time(struct inode *inode, int flags)
{
@@ -1413,7 +1423,9 @@ int ubifs_update_time(struct inode *inode, int flags)
* @inode: inode to update
*
* This function updates mtime and ctime of the inode if it is not equivalent to
- * current time. Returns zero in case of success and a negative error code in
+ * current time.
+ *
+ * Returns: %0 in case of success and a negative error code in
* case of failure.
*/
static int update_mctime(struct inode *inode)
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index c59d47fe7939..17da28d6247a 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -365,6 +365,7 @@ static void destroy_replay_list(struct ubifs_info *c)
* @lnum: node logical eraseblock number
* @offs: node offset
* @len: node length
+ * @hash: node hash
* @key: node key
* @sqnum: sequence number
* @deletion: non-zero if this is a deletion
@@ -417,6 +418,7 @@ static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
* @lnum: node logical eraseblock number
* @offs: node offset
* @len: node length
+ * @hash: node hash
* @key: node key
* @name: directory entry name
* @nlen: directory entry name length
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 3508ac484da3..1bb6ed948927 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -125,8 +125,6 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
udf_fiiter_release(&iter);
inode = udf_iget(dir->i_sb, &loc);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
}
return d_splice_alias(inode, dentry);
@@ -230,8 +228,6 @@ static int udf_fiiter_add_entry(struct inode *dir, struct dentry *dentry,
char name[UDF_NAME_LEN_CS0];
if (dentry) {
- if (!dentry->d_name.len)
- return -EINVAL;
namelen = udf_put_filename(dir->i_sb, dentry->d_name.name,
dentry->d_name.len,
name, UDF_NAME_LEN_CS0);
@@ -766,7 +762,7 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
struct inode *old_inode = d_inode(old_dentry);
struct inode *new_inode = d_inode(new_dentry);
struct udf_fileident_iter oiter, niter, diriter;
- bool has_diriter = false;
+ bool has_diriter = false, is_dir = false;
int retval;
struct kernel_lb_addr tloc;
@@ -789,6 +785,9 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
if (!empty_dir(new_inode))
goto out_oiter;
}
+ is_dir = true;
+ }
+ if (is_dir && old_dir != new_dir) {
retval = udf_fiiter_find_entry(old_inode, &dotdot_name,
&diriter);
if (retval == -ENOENT) {
@@ -878,7 +877,9 @@ static int udf_rename(struct mnt_idmap *idmap, struct inode *old_dir,
udf_dir_entry_len(&diriter.fi));
udf_fiiter_write_fi(&diriter, NULL);
udf_fiiter_release(&diriter);
+ }
+ if (is_dir) {
inode_dec_link_count(old_dir);
if (new_inode)
inode_dec_link_count(new_inode);
@@ -899,7 +900,6 @@ out_oiter:
static struct dentry *udf_get_parent(struct dentry *child)
{
struct kernel_lb_addr tloc;
- struct inode *inode = NULL;
struct udf_fileident_iter iter;
int err;
@@ -909,11 +909,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
tloc = lelb_to_cpu(iter.fi.icb.extLocation);
udf_fiiter_release(&iter);
- inode = udf_iget(child->d_sb, &tloc);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
-
- return d_obtain_alias(inode);
+ return d_obtain_alias(udf_iget(child->d_sb, &tloc));
}
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index ebce93b08281..a7bb2e63cdde 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -35,6 +35,7 @@
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/buffer_head.h>
+#include <linux/mpage.h>
#include <linux/writeback.h>
#include <linux/iversion.h>
@@ -390,7 +391,7 @@ out:
/**
* ufs_getfrag_block() - `get_block_t' function, interface between UFS and
- * read_folio, writepage and so on
+ * read_folio, writepages and so on
*/
static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create)
@@ -467,9 +468,10 @@ done:
return 0;
}
-static int ufs_writepage(struct page *page, struct writeback_control *wbc)
+static int ufs_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page,ufs_getfrag_block,wbc);
+ return mpage_writepages(mapping, wbc, ufs_getfrag_block);
}
static int ufs_read_folio(struct file *file, struct folio *folio)
@@ -528,9 +530,10 @@ const struct address_space_operations ufs_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
.read_folio = ufs_read_folio,
- .writepage = ufs_writepage,
+ .writepages = ufs_writepages,
.write_begin = ufs_write_begin,
.write_end = ufs_write_end,
+ .migrate_folio = buffer_migrate_folio,
.bmap = ufs_bmap
};
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index e8af40b05549..959551ff9a95 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -45,7 +45,6 @@ static struct ctl_table vm_userfaultfd_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
- { }
};
#endif
@@ -1033,7 +1032,7 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *new,
{
int fd;
- fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, new,
+ fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, new,
O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS), inode);
if (fd < 0)
return fd;
@@ -2005,6 +2004,75 @@ static inline unsigned int uffd_ctx_features(__u64 user_features)
return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
}
+static int userfaultfd_move(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ __s64 ret;
+ struct uffdio_move uffdio_move;
+ struct uffdio_move __user *user_uffdio_move;
+ struct userfaultfd_wake_range range;
+ struct mm_struct *mm = ctx->mm;
+
+ user_uffdio_move = (struct uffdio_move __user *) arg;
+
+ if (atomic_read(&ctx->mmap_changing))
+ return -EAGAIN;
+
+ if (copy_from_user(&uffdio_move, user_uffdio_move,
+ /* don't copy "move" last field */
+ sizeof(uffdio_move)-sizeof(__s64)))
+ return -EFAULT;
+
+ /* Do not allow cross-mm moves. */
+ if (mm != current->mm)
+ return -EINVAL;
+
+ ret = validate_range(mm, uffdio_move.dst, uffdio_move.len);
+ if (ret)
+ return ret;
+
+ ret = validate_range(mm, uffdio_move.src, uffdio_move.len);
+ if (ret)
+ return ret;
+
+ if (uffdio_move.mode & ~(UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES|
+ UFFDIO_MOVE_MODE_DONTWAKE))
+ return -EINVAL;
+
+ if (mmget_not_zero(mm)) {
+ mmap_read_lock(mm);
+
+ /* Re-check after taking mmap_lock */
+ if (likely(!atomic_read(&ctx->mmap_changing)))
+ ret = move_pages(ctx, mm, uffdio_move.dst, uffdio_move.src,
+ uffdio_move.len, uffdio_move.mode);
+ else
+ ret = -EINVAL;
+
+ mmap_read_unlock(mm);
+ mmput(mm);
+ } else {
+ return -ESRCH;
+ }
+
+ if (unlikely(put_user(ret, &user_uffdio_move->move)))
+ return -EFAULT;
+ if (ret < 0)
+ goto out;
+
+ /* len == 0 would wake all */
+ VM_WARN_ON(!ret);
+ range.len = ret;
+ if (!(uffdio_move.mode & UFFDIO_MOVE_MODE_DONTWAKE)) {
+ range.start = uffdio_move.dst;
+ wake_userfault(ctx, &range);
+ }
+ ret = range.len == uffdio_move.len ? 0 : -EAGAIN;
+
+out:
+ return ret;
+}
+
/*
* userland asks for a certain API version and we return which bits
* and ioctl commands are implemented in this kernel for such API
@@ -2097,6 +2165,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_ZEROPAGE:
ret = userfaultfd_zeropage(ctx, arg);
break;
+ case UFFDIO_MOVE:
+ ret = userfaultfd_move(ctx, arg);
+ break;
case UFFDIO_WRITEPROTECT:
ret = userfaultfd_writeprotect(ctx, arg);
break;
@@ -2189,7 +2260,8 @@ static int new_userfaultfd(int flags)
/* prevent the mm struct to be freed */
mmgrab(ctx->mm);
- fd = anon_inode_getfd_secure("[userfaultfd]", &userfaultfd_fops, ctx,
+ /* Create a new inode so that the LSM can block the creation. */
+ fd = anon_inode_create_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS), NULL);
if (fd < 0) {
mmdrop(ctx->mm);
diff --git a/fs/vboxsf/vboxsf_wrappers.c b/fs/vboxsf/vboxsf_wrappers.c
index bfc78a097dae..5e8d4359e171 100644
--- a/fs/vboxsf/vboxsf_wrappers.c
+++ b/fs/vboxsf/vboxsf_wrappers.c
@@ -114,7 +114,7 @@ int vboxsf_unmap_folder(u32 root)
* vboxsf_create - Create a new file or folder
* @root: Root of the shared folder in which to create the file
* @parsed_path: The path of the file or folder relative to the shared folder
- * @param: create_parms Parameters for file/folder creation.
+ * @create_parms: Parameters for file/folder creation.
*
* Create a new file or folder or open an existing one in a shared folder.
* Note this function always returns 0 / success unless an exceptional condition
diff --git a/fs/verity/fsverity_private.h b/fs/verity/fsverity_private.h
index d071a6e32581..a6a6b2749241 100644
--- a/fs/verity/fsverity_private.h
+++ b/fs/verity/fsverity_private.h
@@ -100,6 +100,16 @@ fsverity_msg(const struct inode *inode, const char *level,
#define fsverity_err(inode, fmt, ...) \
fsverity_msg((inode), KERN_ERR, fmt, ##__VA_ARGS__)
+/* measure.c */
+
+#ifdef CONFIG_BPF_SYSCALL
+void __init fsverity_init_bpf(void);
+#else
+static inline void fsverity_init_bpf(void)
+{
+}
+#endif
+
/* open.c */
int fsverity_init_merkle_tree_params(struct merkle_tree_params *params,
diff --git a/fs/verity/init.c b/fs/verity/init.c
index a29f062f6047..cb2c9aac61ed 100644
--- a/fs/verity/init.c
+++ b/fs/verity/init.c
@@ -24,7 +24,6 @@ static struct ctl_table fsverity_sysctl_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
- { }
};
static void __init fsverity_init_sysctl(void)
@@ -69,6 +68,7 @@ static int __init fsverity_init(void)
fsverity_init_workqueue();
fsverity_init_sysctl();
fsverity_init_signature();
+ fsverity_init_bpf();
return 0;
}
late_initcall(fsverity_init)
diff --git a/fs/verity/measure.c b/fs/verity/measure.c
index eec5956141da..bf7a5f4cccaf 100644
--- a/fs/verity/measure.c
+++ b/fs/verity/measure.c
@@ -7,6 +7,8 @@
#include "fsverity_private.h"
+#include <linux/bpf.h>
+#include <linux/btf.h>
#include <linux/uaccess.h>
/**
@@ -100,3 +102,85 @@ int fsverity_get_digest(struct inode *inode,
return hash_alg->digest_size;
}
EXPORT_SYMBOL_GPL(fsverity_get_digest);
+
+#ifdef CONFIG_BPF_SYSCALL
+
+/* bpf kfuncs */
+__bpf_kfunc_start_defs();
+
+/**
+ * bpf_get_fsverity_digest: read fsverity digest of file
+ * @file: file to get digest from
+ * @digest_ptr: (out) dynptr for struct fsverity_digest
+ *
+ * Read fsverity_digest of *file* into *digest_ptr*.
+ *
+ * Return: 0 on success, a negative value on error.
+ */
+__bpf_kfunc int bpf_get_fsverity_digest(struct file *file, struct bpf_dynptr_kern *digest_ptr)
+{
+ const struct inode *inode = file_inode(file);
+ u32 dynptr_sz = __bpf_dynptr_size(digest_ptr);
+ struct fsverity_digest *arg;
+ const struct fsverity_info *vi;
+ const struct fsverity_hash_alg *hash_alg;
+ int out_digest_sz;
+
+ if (dynptr_sz < sizeof(struct fsverity_digest))
+ return -EINVAL;
+
+ arg = __bpf_dynptr_data_rw(digest_ptr, dynptr_sz);
+ if (!arg)
+ return -EINVAL;
+
+ if (!IS_ALIGNED((uintptr_t)arg, __alignof__(*arg)))
+ return -EINVAL;
+
+ vi = fsverity_get_info(inode);
+ if (!vi)
+ return -ENODATA; /* not a verity file */
+
+ hash_alg = vi->tree_params.hash_alg;
+
+ arg->digest_algorithm = hash_alg - fsverity_hash_algs;
+ arg->digest_size = hash_alg->digest_size;
+
+ out_digest_sz = dynptr_sz - sizeof(struct fsverity_digest);
+
+ /* copy digest */
+ memcpy(arg->digest, vi->file_digest, min_t(int, hash_alg->digest_size, out_digest_sz));
+
+ /* fill the extra buffer with zeros */
+ if (out_digest_sz > hash_alg->digest_size)
+ memset(arg->digest + arg->digest_size, 0, out_digest_sz - hash_alg->digest_size);
+
+ return 0;
+}
+
+__bpf_kfunc_end_defs();
+
+BTF_SET8_START(fsverity_set_ids)
+BTF_ID_FLAGS(func, bpf_get_fsverity_digest, KF_TRUSTED_ARGS)
+BTF_SET8_END(fsverity_set_ids)
+
+static int bpf_get_fsverity_digest_filter(const struct bpf_prog *prog, u32 kfunc_id)
+{
+ if (!btf_id_set8_contains(&fsverity_set_ids, kfunc_id))
+ return 0;
+
+ /* Only allow to attach from LSM hooks, to avoid recursion */
+ return prog->type != BPF_PROG_TYPE_LSM ? -EACCES : 0;
+}
+
+static const struct btf_kfunc_id_set bpf_fsverity_set = {
+ .owner = THIS_MODULE,
+ .set = &fsverity_set_ids,
+ .filter = bpf_get_fsverity_digest_filter,
+};
+
+void __init fsverity_init_bpf(void)
+{
+ register_btf_kfunc_id_set(BPF_PROG_TYPE_LSM, &bpf_fsverity_set);
+}
+
+#endif /* CONFIG_BPF_SYSCALL */
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 7762c01a85cf..fbe3cdc79036 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -145,6 +145,7 @@ ifeq ($(CONFIG_XFS_ONLINE_SCRUB),y)
xfs-y += $(addprefix scrub/, \
trace.o \
+ agb_bitmap.o \
agheader.o \
alloc.o \
attr.o \
@@ -175,14 +176,32 @@ xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
rtsummary.o \
)
-xfs-$(CONFIG_XFS_QUOTA) += scrub/quota.o
+xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
+ dqiterate.o \
+ quota.o \
+ )
# online repair
ifeq ($(CONFIG_XFS_ONLINE_REPAIR),y)
xfs-y += $(addprefix scrub/, \
agheader_repair.o \
+ alloc_repair.o \
+ bmap_repair.o \
+ cow_repair.o \
+ ialloc_repair.o \
+ inode_repair.o \
+ newbt.o \
reap.o \
+ refcount_repair.o \
repair.o \
)
+
+xfs-$(CONFIG_XFS_RT) += $(addprefix scrub/, \
+ rtbitmap_repair.o \
+ )
+
+xfs-$(CONFIG_XFS_QUOTA) += $(addprefix scrub/, \
+ quota_repair.o \
+ )
endif
endif
diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c
index f9f4d694640d..39d9525270b7 100644
--- a/fs/xfs/libxfs/xfs_ag.c
+++ b/fs/xfs/libxfs/xfs_ag.c
@@ -332,6 +332,31 @@ xfs_agino_range(
return __xfs_agino_range(mp, xfs_ag_block_count(mp, agno), first, last);
}
+/*
+ * Free perag within the specified AG range, it is only used to free unused
+ * perags under the error handling path.
+ */
+void
+xfs_free_unused_perag_range(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agstart,
+ xfs_agnumber_t agend)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t index;
+
+ for (index = agstart; index < agend; index++) {
+ spin_lock(&mp->m_perag_lock);
+ pag = radix_tree_delete(&mp->m_perag_tree, index);
+ spin_unlock(&mp->m_perag_lock);
+ if (!pag)
+ break;
+ xfs_buf_hash_destroy(pag);
+ xfs_defer_drain_free(&pag->pag_intents_drain);
+ kmem_free(pag);
+ }
+}
+
int
xfs_initialize_perag(
struct xfs_mount *mp,
@@ -424,19 +449,14 @@ xfs_initialize_perag(
out_remove_pag:
xfs_defer_drain_free(&pag->pag_intents_drain);
+ spin_lock(&mp->m_perag_lock);
radix_tree_delete(&mp->m_perag_tree, index);
+ spin_unlock(&mp->m_perag_lock);
out_free_pag:
kmem_free(pag);
out_unwind_new_pags:
/* unwind any prior newly initialized pags */
- for (index = first_initialised; index < agcount; index++) {
- pag = radix_tree_delete(&mp->m_perag_tree, index);
- if (!pag)
- break;
- xfs_buf_hash_destroy(pag);
- xfs_defer_drain_free(&pag->pag_intents_drain);
- kmem_free(pag);
- }
+ xfs_free_unused_perag_range(mp, first_initialised, agcount);
return error;
}
@@ -984,7 +1004,7 @@ xfs_ag_shrink_space(
if (err2 != -ENOSPC)
goto resv_err;
- err2 = __xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
+ err2 = xfs_free_extent_later(*tpp, args.fsbno, delta, NULL,
XFS_AG_RESV_NONE, true);
if (err2)
goto resv_err;
diff --git a/fs/xfs/libxfs/xfs_ag.h b/fs/xfs/libxfs/xfs_ag.h
index 2e0aef87d633..4b343c4fac28 100644
--- a/fs/xfs/libxfs/xfs_ag.h
+++ b/fs/xfs/libxfs/xfs_ag.h
@@ -80,6 +80,16 @@ struct xfs_perag {
*/
uint16_t pag_checked;
uint16_t pag_sick;
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ /*
+ * Alternate btree heights so that online repair won't trip the write
+ * verifiers while rebuilding the AG btrees.
+ */
+ uint8_t pagf_repair_levels[XFS_BTNUM_AGF];
+ uint8_t pagf_repair_refcount_level;
+#endif
+
spinlock_t pag_state_lock;
spinlock_t pagb_lock; /* lock for pagb_tree */
@@ -133,6 +143,8 @@ __XFS_AG_OPSTATE(prefers_metadata, PREFERS_METADATA)
__XFS_AG_OPSTATE(allows_inodes, ALLOWS_INODES)
__XFS_AG_OPSTATE(agfl_needs_reset, AGFL_NEEDS_RESET)
+void xfs_free_unused_perag_range(struct xfs_mount *mp, xfs_agnumber_t agstart,
+ xfs_agnumber_t agend);
int xfs_initialize_perag(struct xfs_mount *mp, xfs_agnumber_t agcount,
xfs_rfsblock_t dcount, xfs_agnumber_t *maxagi);
int xfs_initialize_perag_data(struct xfs_mount *mp, xfs_agnumber_t agno);
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index 7fd1fea95552..da1057bd0e60 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -411,6 +411,8 @@ xfs_ag_resv_free_extent(
fallthrough;
case XFS_AG_RESV_NONE:
xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
+ fallthrough;
+ case XFS_AG_RESV_IGNORE:
return;
}
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 100ab5931b31..3bd0a33fee0a 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -246,11 +246,9 @@ xfs_alloc_btrec_to_irec(
/* Simple checks for free space records. */
xfs_failaddr_t
xfs_alloc_check_irec(
- struct xfs_btree_cur *cur,
- const struct xfs_alloc_rec_incore *irec)
+ struct xfs_perag *pag,
+ const struct xfs_alloc_rec_incore *irec)
{
- struct xfs_perag *pag = cur->bc_ag.pag;
-
if (irec->ar_blockcount == 0)
return __this_address;
@@ -299,7 +297,7 @@ xfs_alloc_get_rec(
return error;
xfs_alloc_btrec_to_irec(rec, &irec);
- fa = xfs_alloc_check_irec(cur, &irec);
+ fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
if (fa)
return xfs_alloc_complain_bad_rec(cur, fa, &irec);
@@ -2514,7 +2512,7 @@ xfs_defer_agfl_block(
trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
xfs_extent_free_get_group(mp, xefi);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list);
+ xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type);
return 0;
}
@@ -2522,14 +2520,15 @@ xfs_defer_agfl_block(
* Add the extent to the list of extents to be free at transaction end.
* The list is maintained sorted (by block number).
*/
-int
-__xfs_free_extent_later(
+static int
+xfs_defer_extent_free(
struct xfs_trans *tp,
xfs_fsblock_t bno,
xfs_filblks_t len,
const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type,
- bool skip_discard)
+ bool skip_discard,
+ struct xfs_defer_pending **dfpp)
{
struct xfs_extent_free_item *xefi;
struct xfs_mount *mp = tp->t_mountp;
@@ -2577,10 +2576,105 @@ __xfs_free_extent_later(
XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
xfs_extent_free_get_group(mp, xefi);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list);
+ *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type);
return 0;
}
+int
+xfs_free_extent_later(
+ struct xfs_trans *tp,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ const struct xfs_owner_info *oinfo,
+ enum xfs_ag_resv_type type,
+ bool skip_discard)
+{
+ struct xfs_defer_pending *dontcare = NULL;
+
+ return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard,
+ &dontcare);
+}
+
+/*
+ * Set up automatic freeing of unwritten space in the filesystem.
+ *
+ * This function attached a paused deferred extent free item to the
+ * transaction. Pausing means that the EFI will be logged in the next
+ * transaction commit, but the pending EFI will not be finished until the
+ * pending item is unpaused.
+ *
+ * If the system goes down after the EFI has been persisted to the log but
+ * before the pending item is unpaused, log recovery will find the EFI, fail to
+ * find the EFD, and free the space.
+ *
+ * If the pending item is unpaused, the next transaction commit will log an EFD
+ * without freeing the space.
+ *
+ * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the
+ * @args structure are set to the relevant values.
+ */
+int
+xfs_alloc_schedule_autoreap(
+ const struct xfs_alloc_arg *args,
+ bool skip_discard,
+ struct xfs_alloc_autoreap *aarp)
+{
+ int error;
+
+ error = xfs_defer_extent_free(args->tp, args->fsbno, args->len,
+ &args->oinfo, args->resv, skip_discard, &aarp->dfp);
+ if (error)
+ return error;
+
+ xfs_defer_item_pause(args->tp, aarp->dfp);
+ return 0;
+}
+
+/*
+ * Cancel automatic freeing of unwritten space in the filesystem.
+ *
+ * Earlier, we created a paused deferred extent free item and attached it to
+ * this transaction so that we could automatically roll back a new space
+ * allocation if the system went down. Now we want to cancel the paused work
+ * item by marking the EFI stale so we don't actually free the space, unpausing
+ * the pending item and logging an EFD.
+ *
+ * The caller generally should have already mapped the space into the ondisk
+ * filesystem. If the reserved space was partially used, the caller must call
+ * xfs_free_extent_later to create a new EFI to free the unused space.
+ */
+void
+xfs_alloc_cancel_autoreap(
+ struct xfs_trans *tp,
+ struct xfs_alloc_autoreap *aarp)
+{
+ struct xfs_defer_pending *dfp = aarp->dfp;
+ struct xfs_extent_free_item *xefi;
+
+ if (!dfp)
+ return;
+
+ list_for_each_entry(xefi, &dfp->dfp_work, xefi_list)
+ xefi->xefi_flags |= XFS_EFI_CANCELLED;
+
+ xfs_defer_item_unpause(tp, dfp);
+}
+
+/*
+ * Commit automatic freeing of unwritten space in the filesystem.
+ *
+ * This unpauses an earlier _schedule_autoreap and commits to freeing the
+ * allocated space. Call this if none of the reserved space was used.
+ */
+void
+xfs_alloc_commit_autoreap(
+ struct xfs_trans *tp,
+ struct xfs_alloc_autoreap *aarp)
+{
+ if (aarp->dfp)
+ xfs_defer_item_unpause(tp, aarp->dfp);
+}
+
#ifdef DEBUG
/*
* Check if an AGF has a free extent record whose length is equal to
@@ -3848,7 +3942,7 @@ xfs_alloc_query_range_helper(
xfs_failaddr_t fa;
xfs_alloc_btrec_to_irec(rec, &irec);
- fa = xfs_alloc_check_irec(cur, &irec);
+ fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
if (fa)
return xfs_alloc_complain_bad_rec(cur, fa, &irec);
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 6bb8d295c321..0b956f8b9d5a 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -185,7 +185,7 @@ xfs_alloc_get_rec(
union xfs_btree_rec;
void xfs_alloc_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_alloc_rec_incore *irec);
-xfs_failaddr_t xfs_alloc_check_irec(struct xfs_btree_cur *cur,
+xfs_failaddr_t xfs_alloc_check_irec(struct xfs_perag *pag,
const struct xfs_alloc_rec_incore *irec);
int xfs_read_agf(struct xfs_perag *pag, struct xfs_trans *tp, int flags,
@@ -231,7 +231,7 @@ xfs_buf_to_agfl_bno(
return bp->b_addr;
}
-int __xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
+int xfs_free_extent_later(struct xfs_trans *tp, xfs_fsblock_t bno,
xfs_filblks_t len, const struct xfs_owner_info *oinfo,
enum xfs_ag_resv_type type, bool skip_discard);
@@ -255,18 +255,18 @@ void xfs_extent_free_get_group(struct xfs_mount *mp,
#define XFS_EFI_SKIP_DISCARD (1U << 0) /* don't issue discard */
#define XFS_EFI_ATTR_FORK (1U << 1) /* freeing attr fork block */
#define XFS_EFI_BMBT_BLOCK (1U << 2) /* freeing bmap btree block */
+#define XFS_EFI_CANCELLED (1U << 3) /* dont actually free the space */
-static inline int
-xfs_free_extent_later(
- struct xfs_trans *tp,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- const struct xfs_owner_info *oinfo,
- enum xfs_ag_resv_type type)
-{
- return __xfs_free_extent_later(tp, bno, len, oinfo, type, false);
-}
+struct xfs_alloc_autoreap {
+ struct xfs_defer_pending *dfp;
+};
+int xfs_alloc_schedule_autoreap(const struct xfs_alloc_arg *args,
+ bool skip_discard, struct xfs_alloc_autoreap *aarp);
+void xfs_alloc_cancel_autoreap(struct xfs_trans *tp,
+ struct xfs_alloc_autoreap *aarp);
+void xfs_alloc_commit_autoreap(struct xfs_trans *tp,
+ struct xfs_alloc_autoreap *aarp);
extern struct kmem_cache *xfs_extfree_item_cache;
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index c65228efed4a..a7032bf0cd37 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -323,7 +323,18 @@ xfs_allocbt_verify(
if (bp->b_ops->magic[0] == cpu_to_be32(XFS_ABTC_MAGIC))
btnum = XFS_BTNUM_CNTi;
if (pag && xfs_perag_initialised_agf(pag)) {
- if (level >= pag->pagf_levels[btnum])
+ unsigned int maxlevel = pag->pagf_levels[btnum];
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ /*
+ * Online repair could be rewriting the free space btrees, so
+ * we'll validate against the larger of either tree while this
+ * is going on.
+ */
+ maxlevel = max_t(unsigned int, maxlevel,
+ pag->pagf_repair_levels[btnum]);
+#endif
+ if (level >= maxlevel)
return __this_address;
} else if (level >= mp->m_alloc_maxlevels)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index e28d93d232de..e965a48e7db9 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -421,10 +421,10 @@ xfs_attr_complete_op(
bool do_replace = args->op_flags & XFS_DA_OP_REPLACE;
args->op_flags &= ~XFS_DA_OP_REPLACE;
- if (do_replace) {
- args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ args->attr_filter &= ~XFS_ATTR_INCOMPLETE;
+ if (do_replace)
return replace_state;
- }
+
return XFS_DAS_DONE;
}
@@ -862,8 +862,11 @@ xfs_attr_lookup(
if (!xfs_inode_hasattr(dp))
return -ENOATTR;
- if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL)
- return xfs_attr_sf_findname(args, NULL, NULL);
+ if (dp->i_af.if_format == XFS_DINODE_FMT_LOCAL) {
+ if (xfs_attr_sf_findname(args))
+ return -EEXIST;
+ return -ENOATTR;
+ }
if (xfs_attr_is_leaf(dp)) {
error = xfs_attr_leaf_hasname(args, &bp);
@@ -880,11 +883,10 @@ xfs_attr_lookup(
return error;
}
-static int
-xfs_attr_intent_init(
+static void
+xfs_attr_defer_add(
struct xfs_da_args *args,
- unsigned int op_flags, /* op flag (set or remove) */
- struct xfs_attr_intent **attr) /* new xfs_attr_intent */
+ unsigned int op_flags)
{
struct xfs_attr_intent *new;
@@ -893,66 +895,22 @@ xfs_attr_intent_init(
new->xattri_op_flags = op_flags;
new->xattri_da_args = args;
- *attr = new;
- return 0;
-}
-
-/* Sets an attribute for an inode as a deferred operation */
-static int
-xfs_attr_defer_add(
- struct xfs_da_args *args)
-{
- struct xfs_attr_intent *new;
- int error = 0;
-
- error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_SET, &new);
- if (error)
- return error;
+ switch (op_flags) {
+ case XFS_ATTRI_OP_FLAGS_SET:
+ new->xattri_dela_state = xfs_attr_init_add_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_REPLACE:
+ new->xattri_dela_state = xfs_attr_init_replace_state(args);
+ break;
+ case XFS_ATTRI_OP_FLAGS_REMOVE:
+ new->xattri_dela_state = xfs_attr_init_remove_state(args);
+ break;
+ default:
+ ASSERT(0);
+ }
- new->xattri_dela_state = xfs_attr_init_add_state(args);
- xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
+ xfs_defer_add(args->trans, &new->xattri_list, &xfs_attr_defer_type);
trace_xfs_attr_defer_add(new->xattri_dela_state, args->dp);
-
- return 0;
-}
-
-/* Sets an attribute for an inode as a deferred operation */
-static int
-xfs_attr_defer_replace(
- struct xfs_da_args *args)
-{
- struct xfs_attr_intent *new;
- int error = 0;
-
- error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REPLACE, &new);
- if (error)
- return error;
-
- new->xattri_dela_state = xfs_attr_init_replace_state(args);
- xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
- trace_xfs_attr_defer_replace(new->xattri_dela_state, args->dp);
-
- return 0;
-}
-
-/* Removes an attribute for an inode as a deferred operation */
-static int
-xfs_attr_defer_remove(
- struct xfs_da_args *args)
-{
-
- struct xfs_attr_intent *new;
- int error;
-
- error = xfs_attr_intent_init(args, XFS_ATTRI_OP_FLAGS_REMOVE, &new);
- if (error)
- return error;
-
- new->xattri_dela_state = xfs_attr_init_remove_state(args);
- xfs_defer_add(args->trans, XFS_DEFER_OPS_TYPE_ATTR, &new->xattri_list);
- trace_xfs_attr_defer_remove(new->xattri_dela_state, args->dp);
-
- return 0;
}
/*
@@ -1038,16 +996,16 @@ xfs_attr_set(
error = xfs_attr_lookup(args);
switch (error) {
case -EEXIST:
- /* if no value, we are performing a remove operation */
if (!args->value) {
- error = xfs_attr_defer_remove(args);
+ /* if no value, we are performing a remove operation */
+ xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REMOVE);
break;
}
+
/* Pure create fails if the attr already exists */
if (args->attr_flags & XATTR_CREATE)
goto out_trans_cancel;
-
- error = xfs_attr_defer_replace(args);
+ xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_REPLACE);
break;
case -ENOATTR:
/* Can't remove what isn't there. */
@@ -1057,14 +1015,11 @@ xfs_attr_set(
/* Pure replace fails if no existing attr to replace. */
if (args->attr_flags & XATTR_REPLACE)
goto out_trans_cancel;
-
- error = xfs_attr_defer_add(args);
+ xfs_attr_defer_add(args, XFS_ATTRI_OP_FLAGS_SET);
break;
default:
goto out_trans_cancel;
}
- if (error)
- goto out_trans_cancel;
/*
* If this is a synchronous mount, make sure that the
@@ -1097,10 +1052,9 @@ out_trans_cancel:
static inline int xfs_attr_sf_totsize(struct xfs_inode *dp)
{
- struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_hdr *sf = dp->i_af.if_data;
- sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
- return be16_to_cpu(sf->hdr.totsize);
+ return be16_to_cpu(sf->totsize);
}
/*
@@ -1112,19 +1066,13 @@ xfs_attr_shortform_addname(
struct xfs_da_args *args)
{
int newsize, forkoff;
- int error;
trace_xfs_attr_sf_addname(args);
- error = xfs_attr_shortform_lookup(args);
- switch (error) {
- case -ENOATTR:
- if (args->op_flags & XFS_DA_OP_REPLACE)
- return error;
- break;
- case -EEXIST:
- if (!(args->op_flags & XFS_DA_OP_REPLACE))
- return error;
+ if (xfs_attr_sf_findname(args)) {
+ int error;
+
+ ASSERT(args->op_flags & XFS_DA_OP_REPLACE);
error = xfs_attr_sf_removename(args);
if (error)
@@ -1137,11 +1085,8 @@ xfs_attr_shortform_addname(
* around.
*/
args->op_flags &= ~XFS_DA_OP_REPLACE;
- break;
- case 0:
- break;
- default:
- return error;
+ } else {
+ ASSERT(!(args->op_flags & XFS_DA_OP_REPLACE));
}
if (args->namelen >= XFS_ATTR_SF_ENTSIZE_MAX ||
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 2580ae47209a..6374bf107242 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -690,56 +690,32 @@ xfs_attr_shortform_create(
ASSERT(ifp->if_bytes == 0);
if (ifp->if_format == XFS_DINODE_FMT_EXTENTS)
ifp->if_format = XFS_DINODE_FMT_LOCAL;
- xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
- hdr = (struct xfs_attr_sf_hdr *)ifp->if_u1.if_data;
+
+ hdr = xfs_idata_realloc(dp, sizeof(*hdr), XFS_ATTR_FORK);
memset(hdr, 0, sizeof(*hdr));
hdr->totsize = cpu_to_be16(sizeof(*hdr));
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
}
/*
- * Return -EEXIST if attr is found, or -ENOATTR if not
- * args: args containing attribute name and namelen
- * sfep: If not null, pointer will be set to the last attr entry found on
- -EEXIST. On -ENOATTR pointer is left at the last entry in the list
- * basep: If not null, pointer is set to the byte offset of the entry in the
- * list on -EEXIST. On -ENOATTR, pointer is left at the byte offset of
- * the last entry in the list
+ * Return the entry if the attr in args is found, or NULL if not.
*/
-int
+struct xfs_attr_sf_entry *
xfs_attr_sf_findname(
- struct xfs_da_args *args,
- struct xfs_attr_sf_entry **sfep,
- unsigned int *basep)
+ struct xfs_da_args *args)
{
- struct xfs_attr_shortform *sf;
- struct xfs_attr_sf_entry *sfe;
- unsigned int base = sizeof(struct xfs_attr_sf_hdr);
- int size = 0;
- int end;
- int i;
+ struct xfs_attr_sf_hdr *sf = args->dp->i_af.if_data;
+ struct xfs_attr_sf_entry *sfe;
- sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
- sfe = &sf->list[0];
- end = sf->hdr.count;
- for (i = 0; i < end; sfe = xfs_attr_sf_nextentry(sfe),
- base += size, i++) {
- size = xfs_attr_sf_entsize(sfe);
- if (!xfs_attr_match(args, sfe->namelen, sfe->nameval,
- sfe->flags))
- continue;
- break;
+ for (sfe = xfs_attr_sf_firstentry(sf);
+ sfe < xfs_attr_sf_endptr(sf);
+ sfe = xfs_attr_sf_nextentry(sfe)) {
+ if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
+ sfe->flags))
+ return sfe;
}
- if (sfep != NULL)
- *sfep = sfe;
-
- if (basep != NULL)
- *basep = base;
-
- if (i == end)
- return -ENOATTR;
- return -EEXIST;
+ return NULL;
}
/*
@@ -751,38 +727,31 @@ xfs_attr_shortform_add(
struct xfs_da_args *args,
int forkoff)
{
- struct xfs_attr_shortform *sf;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_ifork *ifp = &dp->i_af;
+ struct xfs_attr_sf_hdr *sf = ifp->if_data;
struct xfs_attr_sf_entry *sfe;
- int offset, size;
- struct xfs_mount *mp;
- struct xfs_inode *dp;
- struct xfs_ifork *ifp;
+ int size;
trace_xfs_attr_sf_add(args);
- dp = args->dp;
- mp = dp->i_mount;
dp->i_forkoff = forkoff;
- ifp = &dp->i_af;
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
- sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- if (xfs_attr_sf_findname(args, &sfe, NULL) == -EEXIST)
- ASSERT(0);
+ ASSERT(!xfs_attr_sf_findname(args));
- offset = (char *)sfe - (char *)sf;
size = xfs_attr_sf_entsize_byname(args->namelen, args->valuelen);
- xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
- sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- sfe = (struct xfs_attr_sf_entry *)((char *)sf + offset);
+ sf = xfs_idata_realloc(dp, size, XFS_ATTR_FORK);
+ sfe = xfs_attr_sf_endptr(sf);
sfe->namelen = args->namelen;
sfe->valuelen = args->valuelen;
sfe->flags = args->attr_filter;
memcpy(sfe->nameval, args->name, args->namelen);
memcpy(&sfe->nameval[args->namelen], args->value, args->valuelen);
- sf->hdr.count++;
- be16_add_cpu(&sf->hdr.totsize, size);
+ sf->count++;
+ be16_add_cpu(&sf->totsize, size);
xfs_trans_log_inode(args->trans, dp, XFS_ILOG_CORE | XFS_ILOG_ADATA);
xfs_sbversion_add_attr2(mp, args->trans);
@@ -811,48 +780,43 @@ int
xfs_attr_sf_removename(
struct xfs_da_args *args)
{
- struct xfs_attr_shortform *sf;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_mount *mp = dp->i_mount;
+ struct xfs_attr_sf_hdr *sf = dp->i_af.if_data;
struct xfs_attr_sf_entry *sfe;
- int size = 0, end, totsize;
- unsigned int base;
- struct xfs_mount *mp;
- struct xfs_inode *dp;
- int error;
+ uint16_t totsize = be16_to_cpu(sf->totsize);
+ void *next, *end;
+ int size = 0;
trace_xfs_attr_sf_remove(args);
- dp = args->dp;
- mp = dp->i_mount;
- sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
-
- error = xfs_attr_sf_findname(args, &sfe, &base);
-
- /*
- * If we are recovering an operation, finding nothing to
- * remove is not an error - it just means there was nothing
- * to clean up.
- */
- if (error == -ENOATTR && (args->op_flags & XFS_DA_OP_RECOVERY))
- return 0;
- if (error != -EEXIST)
- return error;
- size = xfs_attr_sf_entsize(sfe);
+ sfe = xfs_attr_sf_findname(args);
+ if (!sfe) {
+ /*
+ * If we are recovering an operation, finding nothing to remove
+ * is not an error, it just means there was nothing to clean up.
+ */
+ if (args->op_flags & XFS_DA_OP_RECOVERY)
+ return 0;
+ return -ENOATTR;
+ }
/*
* Fix up the attribute fork data, covering the hole
*/
- end = base + size;
- totsize = be16_to_cpu(sf->hdr.totsize);
- if (end != totsize)
- memmove(&((char *)sf)[base], &((char *)sf)[end], totsize - end);
- sf->hdr.count--;
- be16_add_cpu(&sf->hdr.totsize, -size);
+ size = xfs_attr_sf_entsize(sfe);
+ next = xfs_attr_sf_nextentry(sfe);
+ end = xfs_attr_sf_endptr(sf);
+ if (next < end)
+ memmove(sfe, next, end - next);
+ sf->count--;
+ totsize -= size;
+ sf->totsize = cpu_to_be16(totsize);
/*
* Fix up the start offset of the attribute fork
*/
- totsize -= size;
- if (totsize == sizeof(xfs_attr_sf_hdr_t) && xfs_has_attr2(mp) &&
+ if (totsize == sizeof(struct xfs_attr_sf_hdr) && xfs_has_attr2(mp) &&
(dp->i_df.if_format != XFS_DINODE_FMT_BTREE) &&
!(args->op_flags & (XFS_DA_OP_ADDNAME | XFS_DA_OP_REPLACE))) {
xfs_attr_fork_remove(dp, args->trans);
@@ -860,7 +824,7 @@ xfs_attr_sf_removename(
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
dp->i_forkoff = xfs_attr_shortform_bytesfit(dp, totsize);
ASSERT(dp->i_forkoff);
- ASSERT(totsize > sizeof(xfs_attr_sf_hdr_t) ||
+ ASSERT(totsize > sizeof(struct xfs_attr_sf_hdr) ||
(args->op_flags & XFS_DA_OP_ADDNAME) ||
!xfs_has_attr2(mp) ||
dp->i_df.if_format == XFS_DINODE_FMT_BTREE);
@@ -874,33 +838,6 @@ xfs_attr_sf_removename(
}
/*
- * Look up a name in a shortform attribute list structure.
- */
-/*ARGSUSED*/
-int
-xfs_attr_shortform_lookup(xfs_da_args_t *args)
-{
- struct xfs_attr_shortform *sf;
- struct xfs_attr_sf_entry *sfe;
- int i;
- struct xfs_ifork *ifp;
-
- trace_xfs_attr_sf_lookup(args);
-
- ifp = &args->dp->i_af;
- ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
- sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- sfe = &sf->list[0];
- for (i = 0; i < sf->hdr.count;
- sfe = xfs_attr_sf_nextentry(sfe), i++) {
- if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
- sfe->flags))
- return -EEXIST;
- }
- return -ENOATTR;
-}
-
-/*
* Retrieve the attribute value and length.
*
* If args->valuelen is zero, only the length needs to be returned. Unlike a
@@ -909,23 +846,19 @@ xfs_attr_shortform_lookup(xfs_da_args_t *args)
*/
int
xfs_attr_shortform_getvalue(
- struct xfs_da_args *args)
+ struct xfs_da_args *args)
{
- struct xfs_attr_shortform *sf;
- struct xfs_attr_sf_entry *sfe;
- int i;
+ struct xfs_attr_sf_entry *sfe;
ASSERT(args->dp->i_af.if_format == XFS_DINODE_FMT_LOCAL);
- sf = (struct xfs_attr_shortform *)args->dp->i_af.if_u1.if_data;
- sfe = &sf->list[0];
- for (i = 0; i < sf->hdr.count;
- sfe = xfs_attr_sf_nextentry(sfe), i++) {
- if (xfs_attr_match(args, sfe->namelen, sfe->nameval,
- sfe->flags))
- return xfs_attr_copy_value(args,
- &sfe->nameval[args->namelen], sfe->valuelen);
- }
- return -ENOATTR;
+
+ trace_xfs_attr_sf_lookup(args);
+
+ sfe = xfs_attr_sf_findname(args);
+ if (!sfe)
+ return -ENOATTR;
+ return xfs_attr_copy_value(args, &sfe->nameval[args->namelen],
+ sfe->valuelen);
}
/* Convert from using the shortform to the leaf format. */
@@ -933,26 +866,23 @@ int
xfs_attr_shortform_to_leaf(
struct xfs_da_args *args)
{
- struct xfs_inode *dp;
- struct xfs_attr_shortform *sf;
+ struct xfs_inode *dp = args->dp;
+ struct xfs_ifork *ifp = &dp->i_af;
+ struct xfs_attr_sf_hdr *sf = ifp->if_data;
struct xfs_attr_sf_entry *sfe;
+ int size = be16_to_cpu(sf->totsize);
struct xfs_da_args nargs;
char *tmpbuffer;
- int error, i, size;
+ int error, i;
xfs_dablk_t blkno;
struct xfs_buf *bp;
- struct xfs_ifork *ifp;
trace_xfs_attr_sf_to_leaf(args);
- dp = args->dp;
- ifp = &dp->i_af;
- sf = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- size = be16_to_cpu(sf->hdr.totsize);
tmpbuffer = kmem_alloc(size, 0);
ASSERT(tmpbuffer != NULL);
- memcpy(tmpbuffer, ifp->if_u1.if_data, size);
- sf = (struct xfs_attr_shortform *)tmpbuffer;
+ memcpy(tmpbuffer, ifp->if_data, size);
+ sf = (struct xfs_attr_sf_hdr *)tmpbuffer;
xfs_idata_realloc(dp, -size, XFS_ATTR_FORK);
xfs_bmap_local_to_extents_empty(args->trans, dp, XFS_ATTR_FORK);
@@ -975,8 +905,8 @@ xfs_attr_shortform_to_leaf(
nargs.trans = args->trans;
nargs.op_flags = XFS_DA_OP_OKNOENT;
- sfe = &sf->list[0];
- for (i = 0; i < sf->hdr.count; i++) {
+ sfe = xfs_attr_sf_firstentry(sf);
+ for (i = 0; i < sf->count; i++) {
nargs.name = sfe->nameval;
nargs.namelen = sfe->namelen;
nargs.value = &sfe->nameval[nargs.namelen];
@@ -1040,23 +970,16 @@ xfs_attr_shortform_allfit(
return xfs_attr_shortform_bytesfit(dp, bytes);
}
-/* Verify the consistency of an inline attribute fork. */
+/* Verify the consistency of a raw inline attribute fork. */
xfs_failaddr_t
xfs_attr_shortform_verify(
- struct xfs_inode *ip)
+ struct xfs_attr_sf_hdr *sfp,
+ size_t size)
{
- struct xfs_attr_shortform *sfp;
- struct xfs_attr_sf_entry *sfep;
+ struct xfs_attr_sf_entry *sfep = xfs_attr_sf_firstentry(sfp);
struct xfs_attr_sf_entry *next_sfep;
char *endp;
- struct xfs_ifork *ifp;
int i;
- int64_t size;
-
- ASSERT(ip->i_af.if_format == XFS_DINODE_FMT_LOCAL);
- ifp = xfs_ifork_ptr(ip, XFS_ATTR_FORK);
- sfp = (struct xfs_attr_shortform *)ifp->if_u1.if_data;
- size = ifp->if_bytes;
/*
* Give up if the attribute is way too short.
@@ -1067,8 +990,7 @@ xfs_attr_shortform_verify(
endp = (char *)sfp + size;
/* Check all reported entries */
- sfep = &sfp->list[0];
- for (i = 0; i < sfp->hdr.count; i++) {
+ for (i = 0; i < sfp->count; i++) {
/*
* struct xfs_attr_sf_entry has a variable length.
* Check the fixed-offset parts of the structure are
@@ -1244,14 +1166,10 @@ xfs_attr3_leaf_to_node(
if (error)
goto out;
- /* copy leaf to new buffer, update identifiers */
- xfs_trans_buf_set_type(args->trans, bp2, XFS_BLFT_ATTR_LEAF_BUF);
- bp2->b_ops = bp1->b_ops;
- memcpy(bp2->b_addr, bp1->b_addr, args->geo->blksize);
- if (xfs_has_crc(mp)) {
- struct xfs_da3_blkinfo *hdr3 = bp2->b_addr;
- hdr3->blkno = cpu_to_be64(xfs_buf_daddr(bp2));
- }
+ /*
+ * Copy leaf to new buffer and log it.
+ */
+ xfs_da_buf_copy(bp2, bp1, args->geo->blksize);
xfs_trans_log_buf(args->trans, bp2, 0, args->geo->blksize - 1);
/*
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h
index 368f4d9fa1d5..9b9948639c0f 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.h
+++ b/fs/xfs/libxfs/xfs_attr_leaf.h
@@ -47,16 +47,14 @@ struct xfs_attr3_icleaf_hdr {
*/
void xfs_attr_shortform_create(struct xfs_da_args *args);
void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff);
-int xfs_attr_shortform_lookup(struct xfs_da_args *args);
int xfs_attr_shortform_getvalue(struct xfs_da_args *args);
int xfs_attr_shortform_to_leaf(struct xfs_da_args *args);
int xfs_attr_sf_removename(struct xfs_da_args *args);
-int xfs_attr_sf_findname(struct xfs_da_args *args,
- struct xfs_attr_sf_entry **sfep,
- unsigned int *basep);
+struct xfs_attr_sf_entry *xfs_attr_sf_findname(struct xfs_da_args *args);
int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp);
int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes);
-xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_inode *ip);
+xfs_failaddr_t xfs_attr_shortform_verify(struct xfs_attr_sf_hdr *sfp,
+ size_t size);
void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp);
/*
diff --git a/fs/xfs/libxfs/xfs_attr_sf.h b/fs/xfs/libxfs/xfs_attr_sf.h
index 37578b369d9b..bc4422223024 100644
--- a/fs/xfs/libxfs/xfs_attr_sf.h
+++ b/fs/xfs/libxfs/xfs_attr_sf.h
@@ -7,14 +7,6 @@
#define __XFS_ATTR_SF_H__
/*
- * Attribute storage when stored inside the inode.
- *
- * Small attribute lists are packed as tightly as possible so as
- * to fit into the literal area of the inode.
- */
-typedef struct xfs_attr_sf_hdr xfs_attr_sf_hdr_t;
-
-/*
* We generate this then sort it, attr_list() must return things in hash-order.
*/
typedef struct xfs_attr_sf_sort {
@@ -41,11 +33,25 @@ static inline int xfs_attr_sf_entsize(struct xfs_attr_sf_entry *sfep)
return struct_size(sfep, nameval, sfep->namelen + sfep->valuelen);
}
-/* next entry in struct */
+/* first entry in the SF attr fork */
+static inline struct xfs_attr_sf_entry *
+xfs_attr_sf_firstentry(struct xfs_attr_sf_hdr *hdr)
+{
+ return (struct xfs_attr_sf_entry *)(hdr + 1);
+}
+
+/* next entry after sfep */
static inline struct xfs_attr_sf_entry *
xfs_attr_sf_nextentry(struct xfs_attr_sf_entry *sfep)
{
return (void *)sfep + xfs_attr_sf_entsize(sfep);
}
+/* pointer to the space after the last entry, e.g. for adding a new one */
+static inline struct xfs_attr_sf_entry *
+xfs_attr_sf_endptr(struct xfs_attr_sf_hdr *sf)
+{
+ return (void *)sf + be16_to_cpu(sf->totsize);
+}
+
#endif /* __XFS_ATTR_SF_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index be62acffad6c..f362345467fa 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -575,7 +575,7 @@ xfs_bmap_btree_to_extents(
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, whichfork);
error = xfs_free_extent_later(cur->bc_tp, cbno, 1, &oinfo,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
return error;
@@ -747,7 +747,7 @@ xfs_bmap_local_to_extents_empty(
ASSERT(ifp->if_nextents == 0);
xfs_bmap_forkoff_reset(ip, whichfork);
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
ifp->if_height = 0;
ifp->if_format = XFS_DINODE_FMT_EXTENTS;
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
@@ -832,7 +832,7 @@ xfs_bmap_local_to_extents(
xfs_bmap_local_to_extents_empty(tp, ip, whichfork);
flags |= XFS_ILOG_CORE;
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
ifp->if_height = 0;
rec.br_startoff = 0;
@@ -3044,7 +3044,8 @@ xfs_bmap_extsize_align(
#define XFS_ALLOC_GAP_UNITS 4
-void
+/* returns true if ap->blkno was modified */
+bool
xfs_bmap_adjacent(
struct xfs_bmalloca *ap) /* bmap alloc argument struct */
{
@@ -3079,13 +3080,14 @@ xfs_bmap_adjacent(
if (adjust &&
ISVALID(ap->blkno + adjust, ap->prev.br_startblock))
ap->blkno += adjust;
+ return true;
}
/*
* If not at eof, then compare the two neighbor blocks.
* Figure out whether either one gives us a good starting point,
* and pick the better one.
*/
- else if (!ap->eof) {
+ if (!ap->eof) {
xfs_fsblock_t gotbno; /* right side block number */
xfs_fsblock_t gotdiff=0; /* right side difference */
xfs_fsblock_t prevbno; /* left side block number */
@@ -3165,14 +3167,21 @@ xfs_bmap_adjacent(
* If both valid, pick the better one, else the only good
* one, else ap->blkno is already set (to 0 or the inode block).
*/
- if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK)
+ if (prevbno != NULLFSBLOCK && gotbno != NULLFSBLOCK) {
ap->blkno = prevdiff <= gotdiff ? prevbno : gotbno;
- else if (prevbno != NULLFSBLOCK)
+ return true;
+ }
+ if (prevbno != NULLFSBLOCK) {
ap->blkno = prevbno;
- else if (gotbno != NULLFSBLOCK)
+ return true;
+ }
+ if (gotbno != NULLFSBLOCK) {
ap->blkno = gotbno;
+ return true;
+ }
}
#undef ISVALID
+ return false;
}
int
@@ -3263,11 +3272,14 @@ xfs_bmap_btalloc_select_lengths(
}
/* Update all inode and quota accounting for the allocation we just did. */
-static void
-xfs_bmap_btalloc_accounting(
- struct xfs_bmalloca *ap,
- struct xfs_alloc_arg *args)
+void
+xfs_bmap_alloc_account(
+ struct xfs_bmalloca *ap)
{
+ bool isrt = XFS_IS_REALTIME_INODE(ap->ip) &&
+ !(ap->flags & XFS_BMAPI_ATTRFORK);
+ uint fld;
+
if (ap->flags & XFS_BMAPI_COWFORK) {
/*
* COW fork blocks are in-core only and thus are treated as
@@ -3279,7 +3291,7 @@ xfs_bmap_btalloc_accounting(
* yet.
*/
if (ap->wasdel) {
- xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
return;
}
@@ -3291,22 +3303,25 @@ xfs_bmap_btalloc_accounting(
* This essentially transfers the transaction quota reservation
* to that of a delalloc extent.
*/
- ap->ip->i_delayed_blks += args->len;
- xfs_trans_mod_dquot_byino(ap->tp, ap->ip, XFS_TRANS_DQ_RES_BLKS,
- -(long)args->len);
+ ap->ip->i_delayed_blks += ap->length;
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip, isrt ?
+ XFS_TRANS_DQ_RES_RTBLKS : XFS_TRANS_DQ_RES_BLKS,
+ -(long)ap->length);
return;
}
/* data/attr fork only */
- ap->ip->i_nblocks += args->len;
+ ap->ip->i_nblocks += ap->length;
xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
if (ap->wasdel) {
- ap->ip->i_delayed_blks -= args->len;
- xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)args->len);
+ ap->ip->i_delayed_blks -= ap->length;
+ xfs_mod_delalloc(ap->ip->i_mount, -(int64_t)ap->length);
+ fld = isrt ? XFS_TRANS_DQ_DELRTBCOUNT : XFS_TRANS_DQ_DELBCOUNT;
+ } else {
+ fld = isrt ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT;
}
- xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
- ap->wasdel ? XFS_TRANS_DQ_DELBCOUNT : XFS_TRANS_DQ_BCOUNT,
- args->len);
+
+ xfs_trans_mod_dquot_byino(ap->tp, ap->ip, fld, ap->length);
}
static int
@@ -3380,7 +3395,7 @@ xfs_bmap_process_allocated_extent(
ap->offset = orig_offset;
else if (ap->offset + ap->length < orig_offset + orig_length)
ap->offset = orig_offset + orig_length - ap->length;
- xfs_bmap_btalloc_accounting(ap, args);
+ xfs_bmap_alloc_account(ap);
}
#ifdef DEBUG
@@ -5010,7 +5025,6 @@ xfs_bmap_del_extent_real(
xfs_fileoff_t del_endoff; /* first offset past del */
int do_fx; /* free extent at end of routine */
int error; /* error return value */
- int flags = 0;/* inode logging flags */
struct xfs_bmbt_irec got; /* current extent entry */
xfs_fileoff_t got_endoff; /* first offset past got */
int i; /* temp state */
@@ -5023,6 +5037,8 @@ xfs_bmap_del_extent_real(
uint32_t state = xfs_bmap_fork_to_state(whichfork);
struct xfs_bmbt_irec old;
+ *logflagsp = 0;
+
mp = ip->i_mount;
XFS_STATS_INC(mp, xs_del_exlist);
@@ -5035,7 +5051,6 @@ xfs_bmap_del_extent_real(
ASSERT(got_endoff >= del_endoff);
ASSERT(!isnullstartblock(got.br_startblock));
qfield = 0;
- error = 0;
/*
* If it's the case where the directory code is running with no block
@@ -5051,13 +5066,13 @@ xfs_bmap_del_extent_real(
del->br_startoff > got.br_startoff && del_endoff < got_endoff)
return -ENOSPC;
- flags = XFS_ILOG_CORE;
+ *logflagsp = XFS_ILOG_CORE;
if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) {
if (!(bflags & XFS_BMAPI_REMAP)) {
error = xfs_rtfree_blocks(tp, del->br_startblock,
del->br_blockcount);
if (error)
- goto done;
+ return error;
}
do_fx = 0;
@@ -5072,11 +5087,9 @@ xfs_bmap_del_extent_real(
if (cur) {
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
}
if (got.br_startoff == del->br_startoff)
@@ -5093,17 +5106,15 @@ xfs_bmap_del_extent_real(
xfs_iext_prev(ifp, icur);
ifp->if_nextents--;
- flags |= XFS_ILOG_CORE;
+ *logflagsp |= XFS_ILOG_CORE;
if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
break;
}
if ((error = xfs_btree_delete(cur, &i)))
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
break;
case BMAP_LEFT_FILLING:
/*
@@ -5114,12 +5125,12 @@ xfs_bmap_del_extent_real(
got.br_blockcount -= del->br_blockcount;
xfs_iext_update_extent(ip, state, icur, &got);
if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
break;
}
error = xfs_bmbt_update(cur, &got);
if (error)
- goto done;
+ return error;
break;
case BMAP_RIGHT_FILLING:
/*
@@ -5128,12 +5139,12 @@ xfs_bmap_del_extent_real(
got.br_blockcount -= del->br_blockcount;
xfs_iext_update_extent(ip, state, icur, &got);
if (!cur) {
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
break;
}
error = xfs_bmbt_update(cur, &got);
if (error)
- goto done;
+ return error;
break;
case 0:
/*
@@ -5150,18 +5161,18 @@ xfs_bmap_del_extent_real(
new.br_state = got.br_state;
new.br_startblock = del_endblock;
- flags |= XFS_ILOG_CORE;
+ *logflagsp |= XFS_ILOG_CORE;
if (cur) {
error = xfs_bmbt_update(cur, &got);
if (error)
- goto done;
+ return error;
error = xfs_btree_increment(cur, 0, &i);
if (error)
- goto done;
+ return error;
cur->bc_rec.b = new;
error = xfs_btree_insert(cur, &i);
if (error && error != -ENOSPC)
- goto done;
+ return error;
/*
* If get no-space back from btree insert, it tried a
* split, and we have a zero block reservation. Fix up
@@ -5174,33 +5185,28 @@ xfs_bmap_del_extent_real(
*/
error = xfs_bmbt_lookup_eq(cur, &got, &i);
if (error)
- goto done;
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
- }
+ return error;
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
/*
* Update the btree record back
* to the original value.
*/
error = xfs_bmbt_update(cur, &old);
if (error)
- goto done;
+ return error;
/*
* Reset the extent record back
* to the original value.
*/
xfs_iext_update_extent(ip, state, icur, &old);
- flags = 0;
- error = -ENOSPC;
- goto done;
- }
- if (XFS_IS_CORRUPT(mp, i != 1)) {
- error = -EFSCORRUPTED;
- goto done;
+ *logflagsp = 0;
+ return -ENOSPC;
}
+ if (XFS_IS_CORRUPT(mp, i != 1))
+ return -EFSCORRUPTED;
} else
- flags |= xfs_ilog_fext(whichfork);
+ *logflagsp |= xfs_ilog_fext(whichfork);
ifp->if_nextents++;
xfs_iext_next(ifp, icur);
@@ -5218,13 +5224,13 @@ xfs_bmap_del_extent_real(
if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
xfs_refcount_decrease_extent(tp, del);
} else {
- error = __xfs_free_extent_later(tp, del->br_startblock,
+ error = xfs_free_extent_later(tp, del->br_startblock,
del->br_blockcount, NULL,
XFS_AG_RESV_NONE,
((bflags & XFS_BMAPI_NODISCARD) ||
del->br_state == XFS_EXT_UNWRITTEN));
if (error)
- goto done;
+ return error;
}
}
@@ -5239,9 +5245,7 @@ xfs_bmap_del_extent_real(
if (qfield && !(bflags & XFS_BMAPI_REMAP))
xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
-done:
- *logflagsp = flags;
- return error;
+ return 0;
}
/*
@@ -5250,7 +5254,7 @@ done:
* that value. If not all extents in the block range can be removed then
* *done is set.
*/
-int /* error */
+static int
__xfs_bunmapi(
struct xfs_trans *tp, /* transaction pointer */
struct xfs_inode *ip, /* incore inode */
@@ -6102,7 +6106,7 @@ __xfs_bmap_add(
bi->bi_bmap = *bmap;
xfs_bmap_update_get_group(tp->t_mountp, bi);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list);
+ xfs_defer_add(tp, &bi->bi_list, &xfs_bmap_update_defer_type);
return 0;
}
@@ -6179,19 +6183,18 @@ xfs_bmap_finish_one(
return error;
}
-/* Check that an inode's extent does not have invalid flags or bad ranges. */
+/* Check that an extent does not have invalid flags or bad ranges. */
xfs_failaddr_t
-xfs_bmap_validate_extent(
- struct xfs_inode *ip,
+xfs_bmap_validate_extent_raw(
+ struct xfs_mount *mp,
+ bool rtfile,
int whichfork,
struct xfs_bmbt_irec *irec)
{
- struct xfs_mount *mp = ip->i_mount;
-
if (!xfs_verify_fileext(mp, irec->br_startoff, irec->br_blockcount))
return __this_address;
- if (XFS_IS_REALTIME_INODE(ip) && whichfork == XFS_DATA_FORK) {
+ if (rtfile && whichfork == XFS_DATA_FORK) {
if (!xfs_verify_rtbext(mp, irec->br_startblock,
irec->br_blockcount))
return __this_address;
@@ -6221,3 +6224,53 @@ xfs_bmap_intent_destroy_cache(void)
kmem_cache_destroy(xfs_bmap_intent_cache);
xfs_bmap_intent_cache = NULL;
}
+
+/* Check that an inode's extent does not have invalid flags or bad ranges. */
+xfs_failaddr_t
+xfs_bmap_validate_extent(
+ struct xfs_inode *ip,
+ int whichfork,
+ struct xfs_bmbt_irec *irec)
+{
+ return xfs_bmap_validate_extent_raw(ip->i_mount,
+ XFS_IS_REALTIME_INODE(ip), whichfork, irec);
+}
+
+/*
+ * Used in xfs_itruncate_extents(). This is the maximum number of extents
+ * freed from a file in a single transaction.
+ */
+#define XFS_ITRUNC_MAX_EXTENTS 2
+
+/*
+ * Unmap every extent in part of an inode's fork. We don't do any higher level
+ * invalidation work at all.
+ */
+int
+xfs_bunmapi_range(
+ struct xfs_trans **tpp,
+ struct xfs_inode *ip,
+ uint32_t flags,
+ xfs_fileoff_t startoff,
+ xfs_fileoff_t endoff)
+{
+ xfs_filblks_t unmap_len = endoff - startoff + 1;
+ int error = 0;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ while (unmap_len > 0) {
+ ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
+ error = __xfs_bunmapi(*tpp, ip, startoff, &unmap_len, flags,
+ XFS_ITRUNC_MAX_EXTENTS);
+ if (error)
+ goto out;
+
+ /* free the just unmapped extents */
+ error = xfs_defer_finish(tpp);
+ if (error)
+ goto out;
+ }
+out:
+ return error;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index e33470e39728..f6b73f1bad5f 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -116,6 +116,8 @@ static inline int xfs_bmapi_whichfork(uint32_t bmapi_flags)
return XFS_DATA_FORK;
}
+void xfs_bmap_alloc_account(struct xfs_bmalloca *ap);
+
/*
* Special values for xfs_bmbt_irec_t br_startblock field.
*/
@@ -190,9 +192,6 @@ int xfs_bmapi_read(struct xfs_inode *ip, xfs_fileoff_t bno,
int xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extlen_t total, struct xfs_bmbt_irec *mval, int *nmap);
-int __xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
- xfs_fileoff_t bno, xfs_filblks_t *rlen, uint32_t flags,
- xfs_extnum_t nexts);
int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, uint32_t flags,
xfs_extnum_t nexts, int *done);
@@ -263,6 +262,8 @@ static inline uint32_t xfs_bmap_fork_to_state(int whichfork)
}
}
+xfs_failaddr_t xfs_bmap_validate_extent_raw(struct xfs_mount *mp, bool rtfile,
+ int whichfork, struct xfs_bmbt_irec *irec);
xfs_failaddr_t xfs_bmap_validate_extent(struct xfs_inode *ip, int whichfork,
struct xfs_bmbt_irec *irec);
int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork,
@@ -271,6 +272,8 @@ int xfs_bmap_complain_bad_rec(struct xfs_inode *ip, int whichfork,
int xfs_bmapi_remap(struct xfs_trans *tp, struct xfs_inode *ip,
xfs_fileoff_t bno, xfs_filblks_t len, xfs_fsblock_t startblock,
uint32_t flags);
+int xfs_bunmapi_range(struct xfs_trans **tpp, struct xfs_inode *ip,
+ uint32_t flags, xfs_fileoff_t startoff, xfs_fileoff_t endoff);
extern struct kmem_cache *xfs_bmap_intent_cache;
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index bf3f1b36fdd2..71f2d50f7823 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -15,6 +15,7 @@
#include "xfs_trans.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
#include "xfs_bmap_btree.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
@@ -272,7 +273,7 @@ xfs_bmbt_free_block(
xfs_rmap_ino_bmbt_owner(&oinfo, ip->i_ino, cur->bc_ino.whichfork);
error = xfs_free_extent_later(cur->bc_tp, fsbno, 1, &oinfo,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
return error;
@@ -288,10 +289,7 @@ xfs_bmbt_get_minrecs(
int level)
{
if (level == cur->bc_nlevels - 1) {
- struct xfs_ifork *ifp;
-
- ifp = xfs_ifork_ptr(cur->bc_ino.ip,
- cur->bc_ino.whichfork);
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
return xfs_bmbt_maxrecs(cur->bc_mp,
ifp->if_broot_bytes, level == 0) / 2;
@@ -306,10 +304,7 @@ xfs_bmbt_get_maxrecs(
int level)
{
if (level == cur->bc_nlevels - 1) {
- struct xfs_ifork *ifp;
-
- ifp = xfs_ifork_ptr(cur->bc_ino.ip,
- cur->bc_ino.whichfork);
+ struct xfs_ifork *ifp = xfs_btree_ifork_ptr(cur);
return xfs_bmbt_maxrecs(cur->bc_mp,
ifp->if_broot_bytes, level == 0);
@@ -543,23 +538,19 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.keys_contiguous = xfs_bmbt_keys_contiguous,
};
-/*
- * Allocate a new bmap btree cursor.
- */
-struct xfs_btree_cur * /* new bmap btree cursor */
-xfs_bmbt_init_cursor(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_inode *ip, /* inode owning the btree */
- int whichfork) /* data or attr fork */
+static struct xfs_btree_cur *
+xfs_bmbt_init_common(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork)
{
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
struct xfs_btree_cur *cur;
+
ASSERT(whichfork != XFS_COW_FORK);
cur = xfs_btree_alloc_cursor(mp, tp, XFS_BTNUM_BMAP,
mp->m_bm_maxlevels[whichfork], xfs_bmbt_cur_cache);
- cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2);
cur->bc_ops = &xfs_bmbt_ops;
@@ -567,10 +558,30 @@ xfs_bmbt_init_cursor(
if (xfs_has_crc(mp))
cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
- cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
cur->bc_ino.ip = ip;
cur->bc_ino.allocated = 0;
cur->bc_ino.flags = 0;
+
+ return cur;
+}
+
+/*
+ * Allocate a new bmap btree cursor.
+ */
+struct xfs_btree_cur *
+xfs_bmbt_init_cursor(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork);
+ struct xfs_btree_cur *cur;
+
+ cur = xfs_bmbt_init_common(mp, tp, ip, whichfork);
+
+ cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+ cur->bc_ino.forksize = xfs_inode_fork_size(ip, whichfork);
cur->bc_ino.whichfork = whichfork;
return cur;
@@ -588,6 +599,76 @@ xfs_bmbt_block_maxrecs(
}
/*
+ * Allocate a new bmap btree cursor for reloading an inode block mapping data
+ * structure. Note that callers can use the staged cursor to reload extents
+ * format inode forks if they rebuild the iext tree and commit the staged
+ * cursor immediately.
+ */
+struct xfs_btree_cur *
+xfs_bmbt_stage_cursor(
+ struct xfs_mount *mp,
+ struct xfs_inode *ip,
+ struct xbtree_ifakeroot *ifake)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_btree_ops *ops;
+
+ /* data fork always has larger maxheight */
+ cur = xfs_bmbt_init_common(mp, NULL, ip, XFS_DATA_FORK);
+ cur->bc_nlevels = ifake->if_levels;
+ cur->bc_ino.forksize = ifake->if_fork_size;
+
+ /* Don't let anyone think we're attached to the real fork yet. */
+ cur->bc_ino.whichfork = -1;
+ xfs_btree_stage_ifakeroot(cur, ifake, &ops);
+ ops->update_cursor = NULL;
+ return cur;
+}
+
+/*
+ * Swap in the new inode fork root. Once we pass this point the newly rebuilt
+ * mappings are in place and we have to kill off any old btree blocks.
+ */
+void
+xfs_bmbt_commit_staged_btree(
+ struct xfs_btree_cur *cur,
+ struct xfs_trans *tp,
+ int whichfork)
+{
+ struct xbtree_ifakeroot *ifake = cur->bc_ino.ifake;
+ struct xfs_ifork *ifp;
+ static const short brootflag[2] = {XFS_ILOG_DBROOT, XFS_ILOG_ABROOT};
+ static const short extflag[2] = {XFS_ILOG_DEXT, XFS_ILOG_AEXT};
+ int flags = XFS_ILOG_CORE;
+
+ ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
+ ASSERT(whichfork != XFS_COW_FORK);
+
+ /*
+ * Free any resources hanging off the real fork, then shallow-copy the
+ * staging fork's contents into the real fork to transfer everything
+ * we just built.
+ */
+ ifp = xfs_ifork_ptr(cur->bc_ino.ip, whichfork);
+ xfs_idestroy_fork(ifp);
+ memcpy(ifp, ifake->if_fork, sizeof(struct xfs_ifork));
+
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ flags |= extflag[whichfork];
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ flags |= brootflag[whichfork];
+ break;
+ default:
+ ASSERT(0);
+ break;
+ }
+ xfs_trans_log_inode(tp, cur->bc_ino.ip, flags);
+ xfs_btree_commit_ifakeroot(cur, tp, whichfork, &xfs_bmbt_ops);
+}
+
+/*
* Calculate number of records in a bmap btree block.
*/
int
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h
index 3e7a40a83835..151b8491f60e 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.h
+++ b/fs/xfs/libxfs/xfs_bmap_btree.h
@@ -11,6 +11,7 @@ struct xfs_btree_block;
struct xfs_mount;
struct xfs_inode;
struct xfs_trans;
+struct xbtree_ifakeroot;
/*
* Btree block header size depends on a superblock flag.
@@ -106,6 +107,10 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
+struct xfs_btree_cur *xfs_bmbt_stage_cursor(struct xfs_mount *mp,
+ struct xfs_inode *ip, struct xbtree_ifakeroot *ifake);
+void xfs_bmbt_commit_staged_btree(struct xfs_btree_cur *cur,
+ struct xfs_trans *tp, int whichfork);
extern unsigned long long xfs_bmbt_calc_size(struct xfs_mount *mp,
unsigned long long len);
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 6a6503ab0cd7..ea8d3659df20 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -1330,7 +1330,7 @@ xfs_btree_get_buf_block(
* Read in the buffer at the given ptr and return the buffer and
* the block pointer within the buffer.
*/
-STATIC int
+int
xfs_btree_read_buf_block(
struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr,
@@ -5212,3 +5212,29 @@ xfs_btree_destroy_cur_caches(void)
xfs_rmapbt_destroy_cur_cache();
xfs_refcountbt_destroy_cur_cache();
}
+
+/* Move the btree cursor before the first record. */
+int
+xfs_btree_goto_left_edge(
+ struct xfs_btree_cur *cur)
+{
+ int stat = 0;
+ int error;
+
+ memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
+ error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
+ if (error)
+ return error;
+ if (!stat)
+ return 0;
+
+ error = xfs_btree_decrement(cur, 0, &stat);
+ if (error)
+ return error;
+ if (stat != 0) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 4d68a58be160..d906324e25c8 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -700,6 +700,9 @@ void xfs_btree_set_ptr_null(struct xfs_btree_cur *cur,
int xfs_btree_get_buf_block(struct xfs_btree_cur *cur,
const union xfs_btree_ptr *ptr, struct xfs_btree_block **block,
struct xfs_buf **bpp);
+int xfs_btree_read_buf_block(struct xfs_btree_cur *cur,
+ const union xfs_btree_ptr *ptr, int flags,
+ struct xfs_btree_block **block, struct xfs_buf **bpp);
void xfs_btree_set_sibling(struct xfs_btree_cur *cur,
struct xfs_btree_block *block, const union xfs_btree_ptr *ptr,
int lr);
@@ -735,4 +738,6 @@ xfs_btree_alloc_cursor(
int __init xfs_btree_init_cur_caches(void);
void xfs_btree_destroy_cur_caches(void);
+int xfs_btree_goto_left_edge(struct xfs_btree_cur *cur);
+
#endif /* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_btree_staging.c b/fs/xfs/libxfs/xfs_btree_staging.c
index dd75e208b543..e276eba87cb1 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.c
+++ b/fs/xfs/libxfs/xfs_btree_staging.c
@@ -333,20 +333,41 @@ xfs_btree_commit_ifakeroot(
/*
* Put a btree block that we're loading onto the ordered list and release it.
* The btree blocks will be written to disk when bulk loading is finished.
+ * If we reach the dirty buffer threshold, flush them to disk before
+ * continuing.
*/
-static void
+static int
xfs_btree_bload_drop_buf(
- struct list_head *buffers_list,
- struct xfs_buf **bpp)
+ struct xfs_btree_bload *bbl,
+ struct list_head *buffers_list,
+ struct xfs_buf **bpp)
{
- if (*bpp == NULL)
- return;
+ struct xfs_buf *bp = *bpp;
+ int error;
+
+ if (!bp)
+ return 0;
- if (!xfs_buf_delwri_queue(*bpp, buffers_list))
- ASSERT(0);
+ /*
+ * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent
+ * xfs_buf_read will not pointlessly reread the contents from the disk.
+ */
+ bp->b_flags |= XBF_DONE;
- xfs_buf_relse(*bpp);
+ xfs_buf_delwri_queue_here(bp, buffers_list);
+ xfs_buf_relse(bp);
*bpp = NULL;
+ bbl->nr_dirty++;
+
+ if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty)
+ return 0;
+
+ error = xfs_buf_delwri_submit(buffers_list);
+ if (error)
+ return error;
+
+ bbl->nr_dirty = 0;
+ return 0;
}
/*
@@ -384,7 +405,7 @@ xfs_btree_bload_prep_block(
ASSERT(*bpp == NULL);
/* Allocate a new incore btree root block. */
- new_size = bbl->iroot_size(cur, nr_this_block, priv);
+ new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
ifp->if_broot = kmem_zalloc(new_size, 0);
ifp->if_broot_bytes = (int)new_size;
@@ -418,7 +439,10 @@ xfs_btree_bload_prep_block(
*/
if (*blockp)
xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
- xfs_btree_bload_drop_buf(buffers_list, bpp);
+
+ ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp);
+ if (ret)
+ return ret;
/* Initialize the new btree block. */
xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
@@ -436,22 +460,19 @@ STATIC int
xfs_btree_bload_leaf(
struct xfs_btree_cur *cur,
unsigned int recs_this_block,
- xfs_btree_bload_get_record_fn get_record,
+ xfs_btree_bload_get_records_fn get_records,
struct xfs_btree_block *block,
void *priv)
{
- unsigned int j;
+ unsigned int j = 1;
int ret;
/* Fill the leaf block with records. */
- for (j = 1; j <= recs_this_block; j++) {
- union xfs_btree_rec *block_rec;
-
- ret = get_record(cur, priv);
- if (ret)
+ while (j <= recs_this_block) {
+ ret = get_records(cur, j, block, recs_this_block - j + 1, priv);
+ if (ret < 0)
return ret;
- block_rec = xfs_btree_rec_addr(cur, j, block);
- cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ j += ret;
}
return 0;
@@ -485,7 +506,12 @@ xfs_btree_bload_node(
ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));
- ret = xfs_btree_get_buf_block(cur, child_ptr, &child_block,
+ /*
+ * Read the lower-level block in case the buffer for it has
+ * been reclaimed. LRU refs will be set on the block, which is
+ * desirable if the new btree commits.
+ */
+ ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
&child_bp);
if (ret)
return ret;
@@ -570,7 +596,14 @@ xfs_btree_bload_level_geometry(
unsigned int desired_npb;
unsigned int maxnr;
- maxnr = cur->bc_ops->get_maxrecs(cur, level);
+ /*
+ * Compute the absolute maximum number of records that we can store in
+ * the ondisk block or inode root.
+ */
+ if (cur->bc_ops->get_dmaxrecs)
+ maxnr = cur->bc_ops->get_dmaxrecs(cur, level);
+ else
+ maxnr = cur->bc_ops->get_maxrecs(cur, level);
/*
* Compute the number of blocks we need to fill each block with the
@@ -764,6 +797,7 @@ xfs_btree_bload(
cur->bc_nlevels = bbl->btree_height;
xfs_btree_set_ptr_null(cur, &child_ptr);
xfs_btree_set_ptr_null(cur, &ptr);
+ bbl->nr_dirty = 0;
xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
&avg_per_block, &blocks, &blocks_with_extra);
@@ -789,7 +823,7 @@ xfs_btree_bload(
trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
nr_this_block);
- ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_record,
+ ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records,
block, priv);
if (ret)
goto out;
@@ -802,7 +836,10 @@ xfs_btree_bload(
xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
}
total_blocks += blocks;
- xfs_btree_bload_drop_buf(&buffers_list, &bp);
+
+ ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
+ if (ret)
+ goto out;
/* Populate the internal btree nodes. */
for (level = 1; level < cur->bc_nlevels; level++) {
@@ -844,7 +881,11 @@ xfs_btree_bload(
xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
}
total_blocks += blocks;
- xfs_btree_bload_drop_buf(&buffers_list, &bp);
+
+ ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
+ if (ret)
+ goto out;
+
xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
}
diff --git a/fs/xfs/libxfs/xfs_btree_staging.h b/fs/xfs/libxfs/xfs_btree_staging.h
index f0d2976050ae..055ea43b1e18 100644
--- a/fs/xfs/libxfs/xfs_btree_staging.h
+++ b/fs/xfs/libxfs/xfs_btree_staging.h
@@ -37,12 +37,6 @@ struct xbtree_ifakeroot {
/* Number of bytes available for this fork in the inode. */
unsigned int if_fork_size;
-
- /* Fork format. */
- unsigned int if_format;
-
- /* Number of records. */
- unsigned int if_extents;
};
/* Cursor interactions with fake roots for inode-rooted btrees. */
@@ -53,19 +47,24 @@ void xfs_btree_commit_ifakeroot(struct xfs_btree_cur *cur, struct xfs_trans *tp,
int whichfork, const struct xfs_btree_ops *ops);
/* Bulk loading of staged btrees. */
-typedef int (*xfs_btree_bload_get_record_fn)(struct xfs_btree_cur *cur, void *priv);
+typedef int (*xfs_btree_bload_get_records_fn)(struct xfs_btree_cur *cur,
+ unsigned int idx, struct xfs_btree_block *block,
+ unsigned int nr_wanted, void *priv);
typedef int (*xfs_btree_bload_claim_block_fn)(struct xfs_btree_cur *cur,
union xfs_btree_ptr *ptr, void *priv);
typedef size_t (*xfs_btree_bload_iroot_size_fn)(struct xfs_btree_cur *cur,
- unsigned int nr_this_level, void *priv);
+ unsigned int level, unsigned int nr_this_level, void *priv);
struct xfs_btree_bload {
/*
- * This function will be called nr_records times to load records into
- * the btree. The function does this by setting the cursor's bc_rec
- * field in in-core format. Records must be returned in sort order.
+ * This function will be called to load @nr_wanted records into the
+ * btree. The implementation does this by setting the cursor's bc_rec
+ * field in in-core format and using init_rec_from_cur to set the
+ * records in the btree block. Records must be returned in sort order.
+ * The function must return the number of records loaded or the usual
+ * negative errno.
*/
- xfs_btree_bload_get_record_fn get_record;
+ xfs_btree_bload_get_records_fn get_records;
/*
* This function will be called nr_blocks times to obtain a pointer
@@ -113,6 +112,16 @@ struct xfs_btree_bload {
* height of the new btree.
*/
unsigned int btree_height;
+
+ /*
+ * Flush the new btree block buffer list to disk after this many blocks
+ * have been formatted. Zero prohibits writing any buffers until all
+ * blocks have been formatted.
+ */
+ uint16_t max_dirty;
+
+ /* Number of dirty buffers. */
+ uint16_t nr_dirty;
};
int xfs_btree_bload_compute_geometry(struct xfs_btree_cur *cur,
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index e576560b46e9..5457188bb4de 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -421,6 +421,25 @@ xfs_da3_node_read_mapped(
return xfs_da3_node_set_type(tp, *bpp);
}
+/*
+ * Copy src directory/attr leaf/node buffer to the dst.
+ * For v5 file systems make sure the right blkno is stamped in.
+ */
+void
+xfs_da_buf_copy(
+ struct xfs_buf *dst,
+ struct xfs_buf *src,
+ size_t size)
+{
+ struct xfs_da3_blkinfo *da3 = dst->b_addr;
+
+ memcpy(dst->b_addr, src->b_addr, size);
+ dst->b_ops = src->b_ops;
+ xfs_trans_buf_copy_type(dst, src);
+ if (xfs_has_crc(dst->b_mount))
+ da3->blkno = cpu_to_be64(xfs_buf_daddr(dst));
+}
+
/*========================================================================
* Routines used for growing the Btree.
*========================================================================*/
@@ -690,12 +709,6 @@ xfs_da3_root_split(
btree = icnodehdr.btree;
size = (int)((char *)&btree[icnodehdr.count] - (char *)oldroot);
level = icnodehdr.level;
-
- /*
- * we are about to copy oldroot to bp, so set up the type
- * of bp while we know exactly what it will be.
- */
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DA_NODE_BUF);
} else {
struct xfs_dir3_icleaf_hdr leafhdr;
@@ -707,31 +720,17 @@ xfs_da3_root_split(
size = (int)((char *)&leafhdr.ents[leafhdr.count] -
(char *)leaf);
level = 0;
-
- /*
- * we are about to copy oldroot to bp, so set up the type
- * of bp while we know exactly what it will be.
- */
- xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DIR_LEAFN_BUF);
}
/*
- * we can copy most of the information in the node from one block to
- * another, but for CRC enabled headers we have to make sure that the
- * block specific identifiers are kept intact. We update the buffer
- * directly for this.
+ * Copy old root to new buffer and log it.
*/
- memcpy(node, oldroot, size);
- if (oldroot->hdr.info.magic == cpu_to_be16(XFS_DA3_NODE_MAGIC) ||
- oldroot->hdr.info.magic == cpu_to_be16(XFS_DIR3_LEAFN_MAGIC)) {
- struct xfs_da3_intnode *node3 = (struct xfs_da3_intnode *)node;
-
- node3->hdr.info.blkno = cpu_to_be64(xfs_buf_daddr(bp));
- }
+ xfs_da_buf_copy(bp, blk1->bp, size);
xfs_trans_log_buf(tp, bp, 0, size - 1);
- bp->b_ops = blk1->bp->b_ops;
- xfs_trans_buf_copy_type(bp, blk1->bp);
+ /*
+ * Update blk1 to point to new buffer.
+ */
blk1->bp = bp;
blk1->blkno = blkno;
@@ -1220,21 +1219,14 @@ xfs_da3_root_join(
xfs_da_blkinfo_onlychild_validate(bp->b_addr, oldroothdr.level);
/*
- * This could be copying a leaf back into the root block in the case of
- * there only being a single leaf block left in the tree. Hence we have
- * to update the b_ops pointer as well to match the buffer type change
- * that could occur. For dir3 blocks we also need to update the block
- * number in the buffer header.
+ * Copy child to root buffer and log it.
*/
- memcpy(root_blk->bp->b_addr, bp->b_addr, args->geo->blksize);
- root_blk->bp->b_ops = bp->b_ops;
- xfs_trans_buf_copy_type(root_blk->bp, bp);
- if (oldroothdr.magic == XFS_DA3_NODE_MAGIC) {
- struct xfs_da3_blkinfo *da3 = root_blk->bp->b_addr;
- da3->blkno = cpu_to_be64(xfs_buf_daddr(root_blk->bp));
- }
+ xfs_da_buf_copy(root_blk->bp, bp, args->geo->blksize);
xfs_trans_log_buf(args->trans, root_blk->bp, 0,
args->geo->blksize - 1);
+ /*
+ * Now we can drop the child buffer.
+ */
error = xfs_da_shrink_inode(args, child, bp);
return error;
}
@@ -2317,9 +2309,10 @@ xfs_da3_swap_lastblock(
/*
* Copy the last block into the dead buffer and log it.
*/
- memcpy(dead_buf->b_addr, last_buf->b_addr, args->geo->blksize);
+ xfs_da_buf_copy(dead_buf, last_buf, args->geo->blksize);
xfs_trans_log_buf(tp, dead_buf, 0, args->geo->blksize - 1);
dead_info = dead_buf->b_addr;
+
/*
* Get values from the moved block.
*/
diff --git a/fs/xfs/libxfs/xfs_da_btree.h b/fs/xfs/libxfs/xfs_da_btree.h
index ffa3df5b2893..706baf36e175 100644
--- a/fs/xfs/libxfs/xfs_da_btree.h
+++ b/fs/xfs/libxfs/xfs_da_btree.h
@@ -219,6 +219,8 @@ int xfs_da_reada_buf(struct xfs_inode *dp, xfs_dablk_t bno,
const struct xfs_buf_ops *ops);
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
struct xfs_buf *dead_buf);
+void xfs_da_buf_copy(struct xfs_buf *dst, struct xfs_buf *src,
+ size_t size);
uint xfs_da_hashname(const uint8_t *name_string, int name_length);
enum xfs_dacmp xfs_da_compname(struct xfs_da_args *args,
diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h
index f9015f88eca7..24f9d1461f9a 100644
--- a/fs/xfs/libxfs/xfs_da_format.h
+++ b/fs/xfs/libxfs/xfs_da_format.h
@@ -578,20 +578,25 @@ xfs_dir2_block_leaf_p(struct xfs_dir2_block_tail *btp)
#define XFS_ATTR_LEAF_MAPSIZE 3 /* how many freespace slots */
/*
- * Entries are packed toward the top as tight as possible.
- */
-struct xfs_attr_shortform {
- struct xfs_attr_sf_hdr { /* constant-structure header block */
- __be16 totsize; /* total bytes in shortform list */
- __u8 count; /* count of active entries */
- __u8 padding;
- } hdr;
- struct xfs_attr_sf_entry {
- uint8_t namelen; /* actual length of name (no NULL) */
- uint8_t valuelen; /* actual length of value (no NULL) */
- uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */
- uint8_t nameval[]; /* name & value bytes concatenated */
- } list[]; /* variable sized array */
+ * Attribute storage when stored inside the inode.
+ *
+ * Small attribute lists are packed as tightly as possible so as to fit into the
+ * literal area of the inode.
+ *
+ * These "shortform" attribute forks consist of a single xfs_attr_sf_hdr header
+ * followed by zero or more xfs_attr_sf_entry structures.
+ */
+struct xfs_attr_sf_hdr { /* constant-structure header block */
+ __be16 totsize; /* total bytes in shortform list */
+ __u8 count; /* count of active entries */
+ __u8 padding;
+};
+
+struct xfs_attr_sf_entry {
+ __u8 namelen; /* actual length of name (no NULL) */
+ __u8 valuelen; /* actual length of value (no NULL) */
+ __u8 flags; /* flags bits (XFS_ATTR_*) */
+ __u8 nameval[]; /* name & value bytes concatenated */
};
typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */
diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c
index f71679ce23b9..66a17910d021 100644
--- a/fs/xfs/libxfs/xfs_defer.c
+++ b/fs/xfs/libxfs/xfs_defer.c
@@ -26,6 +26,7 @@
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_attr.h"
+#include "xfs_trans_priv.h"
static struct kmem_cache *xfs_defer_pending_cache;
@@ -181,16 +182,89 @@ static struct kmem_cache *xfs_defer_pending_cache;
* Note that the continuation requested between t2 and t3 is likely to
* reoccur.
*/
+STATIC struct xfs_log_item *
+xfs_defer_barrier_create_intent(
+ struct xfs_trans *tp,
+ struct list_head *items,
+ unsigned int count,
+ bool sort)
+{
+ return NULL;
+}
-static const struct xfs_defer_op_type *defer_op_types[] = {
- [XFS_DEFER_OPS_TYPE_BMAP] = &xfs_bmap_update_defer_type,
- [XFS_DEFER_OPS_TYPE_REFCOUNT] = &xfs_refcount_update_defer_type,
- [XFS_DEFER_OPS_TYPE_RMAP] = &xfs_rmap_update_defer_type,
- [XFS_DEFER_OPS_TYPE_FREE] = &xfs_extent_free_defer_type,
- [XFS_DEFER_OPS_TYPE_AGFL_FREE] = &xfs_agfl_free_defer_type,
- [XFS_DEFER_OPS_TYPE_ATTR] = &xfs_attr_defer_type,
+STATIC void
+xfs_defer_barrier_abort_intent(
+ struct xfs_log_item *intent)
+{
+ /* empty */
+}
+
+STATIC struct xfs_log_item *
+xfs_defer_barrier_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ return NULL;
+}
+
+STATIC int
+xfs_defer_barrier_finish_item(
+ struct xfs_trans *tp,
+ struct xfs_log_item *done,
+ struct list_head *item,
+ struct xfs_btree_cur **state)
+{
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+STATIC void
+xfs_defer_barrier_cancel_item(
+ struct list_head *item)
+{
+ ASSERT(0);
+}
+
+static const struct xfs_defer_op_type xfs_barrier_defer_type = {
+ .max_items = 1,
+ .create_intent = xfs_defer_barrier_create_intent,
+ .abort_intent = xfs_defer_barrier_abort_intent,
+ .create_done = xfs_defer_barrier_create_done,
+ .finish_item = xfs_defer_barrier_finish_item,
+ .cancel_item = xfs_defer_barrier_cancel_item,
};
+/* Create a log intent done item for a log intent item. */
+static inline void
+xfs_defer_create_done(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ struct xfs_log_item *lip;
+
+ /* If there is no log intent item, there can be no log done item. */
+ if (!dfp->dfp_intent)
+ return;
+
+ /*
+ * Mark the transaction dirty, even on error. This ensures the
+ * transaction is aborted, which:
+ *
+ * 1.) releases the log intent item and frees the log done item
+ * 2.) shuts down the filesystem
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ lip = dfp->dfp_ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
+ if (!lip)
+ return;
+
+ tp->t_flags |= XFS_TRANS_HAS_INTENT_DONE;
+ xfs_trans_add_item(tp, lip);
+ set_bit(XFS_LI_DIRTY, &lip->li_flags);
+ dfp->dfp_done = lip;
+}
+
/*
* Ensure there's a log intent item associated with this deferred work item if
* the operation must be restarted on crash. Returns 1 if there's a log item;
@@ -202,18 +276,21 @@ xfs_defer_create_intent(
struct xfs_defer_pending *dfp,
bool sort)
{
- const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
struct xfs_log_item *lip;
if (dfp->dfp_intent)
return 1;
- lip = ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count, sort);
+ lip = dfp->dfp_ops->create_intent(tp, &dfp->dfp_work, dfp->dfp_count,
+ sort);
if (!lip)
return 0;
if (IS_ERR(lip))
return PTR_ERR(lip);
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ xfs_trans_add_item(tp, lip);
+ set_bit(XFS_LI_DIRTY, &lip->li_flags);
dfp->dfp_intent = lip;
return 1;
}
@@ -245,23 +322,50 @@ xfs_defer_create_intents(
return ret;
}
-STATIC void
+static inline void
xfs_defer_pending_abort(
struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp)
+{
+ trace_xfs_defer_pending_abort(mp, dfp);
+
+ if (dfp->dfp_intent && !dfp->dfp_done) {
+ dfp->dfp_ops->abort_intent(dfp->dfp_intent);
+ dfp->dfp_intent = NULL;
+ }
+}
+
+static inline void
+xfs_defer_pending_cancel_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp)
+{
+ struct list_head *pwi;
+ struct list_head *n;
+
+ trace_xfs_defer_cancel_list(mp, dfp);
+
+ list_del(&dfp->dfp_list);
+ list_for_each_safe(pwi, n, &dfp->dfp_work) {
+ list_del(pwi);
+ dfp->dfp_count--;
+ trace_xfs_defer_cancel_item(mp, dfp, pwi);
+ dfp->dfp_ops->cancel_item(pwi);
+ }
+ ASSERT(dfp->dfp_count == 0);
+ kmem_cache_free(xfs_defer_pending_cache, dfp);
+}
+
+STATIC void
+xfs_defer_pending_abort_list(
+ struct xfs_mount *mp,
struct list_head *dop_list)
{
struct xfs_defer_pending *dfp;
- const struct xfs_defer_op_type *ops;
/* Abort intent items that don't have a done item. */
- list_for_each_entry(dfp, dop_list, dfp_list) {
- ops = defer_op_types[dfp->dfp_type];
- trace_xfs_defer_pending_abort(mp, dfp);
- if (dfp->dfp_intent && !dfp->dfp_done) {
- ops->abort_intent(dfp->dfp_intent);
- dfp->dfp_intent = NULL;
- }
- }
+ list_for_each_entry(dfp, dop_list, dfp_list)
+ xfs_defer_pending_abort(mp, dfp);
}
/* Abort all the intents that were committed. */
@@ -271,7 +375,7 @@ xfs_defer_trans_abort(
struct list_head *dop_pending)
{
trace_xfs_defer_trans_abort(tp, _RET_IP_);
- xfs_defer_pending_abort(tp->t_mountp, dop_pending);
+ xfs_defer_pending_abort_list(tp->t_mountp, dop_pending);
}
/*
@@ -389,27 +493,31 @@ xfs_defer_cancel_list(
{
struct xfs_defer_pending *dfp;
struct xfs_defer_pending *pli;
- struct list_head *pwi;
- struct list_head *n;
- const struct xfs_defer_op_type *ops;
/*
* Free the pending items. Caller should already have arranged
* for the intent items to be released.
*/
- list_for_each_entry_safe(dfp, pli, dop_list, dfp_list) {
- ops = defer_op_types[dfp->dfp_type];
- trace_xfs_defer_cancel_list(mp, dfp);
- list_del(&dfp->dfp_list);
- list_for_each_safe(pwi, n, &dfp->dfp_work) {
- list_del(pwi);
- dfp->dfp_count--;
- trace_xfs_defer_cancel_item(mp, dfp, pwi);
- ops->cancel_item(pwi);
- }
- ASSERT(dfp->dfp_count == 0);
- kmem_cache_free(xfs_defer_pending_cache, dfp);
+ list_for_each_entry_safe(dfp, pli, dop_list, dfp_list)
+ xfs_defer_pending_cancel_work(mp, dfp);
+}
+
+static inline void
+xfs_defer_relog_intent(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ struct xfs_log_item *lip;
+
+ xfs_defer_create_done(tp, dfp);
+
+ lip = dfp->dfp_ops->relog_intent(tp, dfp->dfp_intent, dfp->dfp_done);
+ if (lip) {
+ xfs_trans_add_item(tp, lip);
+ set_bit(XFS_LI_DIRTY, &lip->li_flags);
}
+ dfp->dfp_done = NULL;
+ dfp->dfp_intent = lip;
}
/*
@@ -417,7 +525,7 @@ xfs_defer_cancel_list(
* done item to release the intent item; and then log a new intent item.
* The caller should provide a fresh transaction and roll it after we're done.
*/
-static int
+static void
xfs_defer_relog(
struct xfs_trans **tpp,
struct list_head *dfops)
@@ -456,31 +564,28 @@ xfs_defer_relog(
trace_xfs_defer_relog_intent((*tpp)->t_mountp, dfp);
XFS_STATS_INC((*tpp)->t_mountp, defer_relog);
- dfp->dfp_intent = xfs_trans_item_relog(dfp->dfp_intent, *tpp);
- }
- if ((*tpp)->t_flags & XFS_TRANS_DIRTY)
- return xfs_defer_trans_roll(tpp);
- return 0;
+ xfs_defer_relog_intent(*tpp, dfp);
+ }
}
/*
* Log an intent-done item for the first pending intent, and finish the work
* items.
*/
-static int
+int
xfs_defer_finish_one(
struct xfs_trans *tp,
struct xfs_defer_pending *dfp)
{
- const struct xfs_defer_op_type *ops = defer_op_types[dfp->dfp_type];
+ const struct xfs_defer_op_type *ops = dfp->dfp_ops;
struct xfs_btree_cur *state = NULL;
struct list_head *li, *n;
int error;
trace_xfs_defer_pending_finish(tp->t_mountp, dfp);
- dfp->dfp_done = ops->create_done(tp, dfp->dfp_intent, dfp->dfp_count);
+ xfs_defer_create_done(tp, dfp);
list_for_each_safe(li, n, &dfp->dfp_work) {
list_del(li);
dfp->dfp_count--;
@@ -517,6 +622,24 @@ out:
return error;
}
+/* Move all paused deferred work from @tp to @paused_list. */
+static void
+xfs_defer_isolate_paused(
+ struct xfs_trans *tp,
+ struct list_head *paused_list)
+{
+ struct xfs_defer_pending *dfp;
+ struct xfs_defer_pending *pli;
+
+ list_for_each_entry_safe(dfp, pli, &tp->t_dfops, dfp_list) {
+ if (!(dfp->dfp_flags & XFS_DEFER_PAUSED))
+ continue;
+
+ list_move_tail(&dfp->dfp_list, paused_list);
+ trace_xfs_defer_isolate_paused(tp->t_mountp, dfp);
+ }
+}
+
/*
* Finish all the pending work. This involves logging intent items for
* any work items that wandered in since the last transaction roll (if
@@ -532,6 +655,7 @@ xfs_defer_finish_noroll(
struct xfs_defer_pending *dfp = NULL;
int error = 0;
LIST_HEAD(dop_pending);
+ LIST_HEAD(dop_paused);
ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
@@ -550,6 +674,8 @@ xfs_defer_finish_noroll(
*/
int has_intents = xfs_defer_create_intents(*tp);
+ xfs_defer_isolate_paused(*tp, &dop_paused);
+
list_splice_init(&(*tp)->t_dfops, &dop_pending);
if (has_intents < 0) {
@@ -562,22 +688,33 @@ xfs_defer_finish_noroll(
goto out_shutdown;
/* Relog intent items to keep the log moving. */
- error = xfs_defer_relog(tp, &dop_pending);
- if (error)
- goto out_shutdown;
+ xfs_defer_relog(tp, &dop_pending);
+ xfs_defer_relog(tp, &dop_paused);
+
+ if ((*tp)->t_flags & XFS_TRANS_DIRTY) {
+ error = xfs_defer_trans_roll(tp);
+ if (error)
+ goto out_shutdown;
+ }
}
- dfp = list_first_entry(&dop_pending, struct xfs_defer_pending,
- dfp_list);
+ dfp = list_first_entry_or_null(&dop_pending,
+ struct xfs_defer_pending, dfp_list);
+ if (!dfp)
+ break;
error = xfs_defer_finish_one(*tp, dfp);
if (error && error != -EAGAIN)
goto out_shutdown;
}
+ /* Requeue the paused items in the outgoing transaction. */
+ list_splice_tail_init(&dop_paused, &(*tp)->t_dfops);
+
trace_xfs_defer_finish_done(*tp, _RET_IP_);
return 0;
out_shutdown:
+ list_splice_tail_init(&dop_paused, &dop_pending);
xfs_defer_trans_abort(*tp, &dop_pending);
xfs_force_shutdown((*tp)->t_mountp, SHUTDOWN_CORRUPT_INCORE);
trace_xfs_defer_finish_error(*tp, error);
@@ -590,6 +727,9 @@ int
xfs_defer_finish(
struct xfs_trans **tp)
{
+#ifdef DEBUG
+ struct xfs_defer_pending *dfp;
+#endif
int error;
/*
@@ -609,7 +749,10 @@ xfs_defer_finish(
}
/* Reset LOWMODE now that we've finished all the dfops. */
- ASSERT(list_empty(&(*tp)->t_dfops));
+#ifdef DEBUG
+ list_for_each_entry(dfp, &(*tp)->t_dfops, dfp_list)
+ ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED);
+#endif
(*tp)->t_flags &= ~XFS_TRANS_LOWMODE;
return 0;
}
@@ -621,48 +764,165 @@ xfs_defer_cancel(
struct xfs_mount *mp = tp->t_mountp;
trace_xfs_defer_cancel(tp, _RET_IP_);
+ xfs_defer_trans_abort(tp, &tp->t_dfops);
xfs_defer_cancel_list(mp, &tp->t_dfops);
}
+/*
+ * Return the last pending work item attached to this transaction if it matches
+ * the deferred op type.
+ */
+static inline struct xfs_defer_pending *
+xfs_defer_find_last(
+ struct xfs_trans *tp,
+ const struct xfs_defer_op_type *ops)
+{
+ struct xfs_defer_pending *dfp = NULL;
+
+ /* No dfops at all? */
+ if (list_empty(&tp->t_dfops))
+ return NULL;
+
+ dfp = list_last_entry(&tp->t_dfops, struct xfs_defer_pending,
+ dfp_list);
+
+ /* Wrong type? */
+ if (dfp->dfp_ops != ops)
+ return NULL;
+ return dfp;
+}
+
+/*
+ * Decide if we can add a deferred work item to the last dfops item attached
+ * to the transaction.
+ */
+static inline bool
+xfs_defer_can_append(
+ struct xfs_defer_pending *dfp,
+ const struct xfs_defer_op_type *ops)
+{
+ /* Already logged? */
+ if (dfp->dfp_intent)
+ return false;
+
+ /* Paused items cannot absorb more work */
+ if (dfp->dfp_flags & XFS_DEFER_PAUSED)
+ return NULL;
+
+ /* Already full? */
+ if (ops->max_items && dfp->dfp_count >= ops->max_items)
+ return false;
+
+ return true;
+}
+
+/* Create a new pending item at the end of the transaction list. */
+static inline struct xfs_defer_pending *
+xfs_defer_alloc(
+ struct xfs_trans *tp,
+ const struct xfs_defer_op_type *ops)
+{
+ struct xfs_defer_pending *dfp;
+
+ dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ dfp->dfp_ops = ops;
+ INIT_LIST_HEAD(&dfp->dfp_work);
+ list_add_tail(&dfp->dfp_list, &tp->t_dfops);
+
+ return dfp;
+}
+
/* Add an item for later deferred processing. */
-void
+struct xfs_defer_pending *
xfs_defer_add(
struct xfs_trans *tp,
- enum xfs_defer_ops_type type,
- struct list_head *li)
+ struct list_head *li,
+ const struct xfs_defer_op_type *ops)
{
struct xfs_defer_pending *dfp = NULL;
- const struct xfs_defer_op_type *ops = defer_op_types[type];
ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
- BUILD_BUG_ON(ARRAY_SIZE(defer_op_types) != XFS_DEFER_OPS_TYPE_MAX);
- /*
- * Add the item to a pending item at the end of the intake list.
- * If the last pending item has the same type, reuse it. Else,
- * create a new pending item at the end of the intake list.
- */
- if (!list_empty(&tp->t_dfops)) {
- dfp = list_last_entry(&tp->t_dfops,
- struct xfs_defer_pending, dfp_list);
- if (dfp->dfp_type != type ||
- (ops->max_items && dfp->dfp_count >= ops->max_items))
- dfp = NULL;
- }
- if (!dfp) {
- dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
- GFP_NOFS | __GFP_NOFAIL);
- dfp->dfp_type = type;
- dfp->dfp_intent = NULL;
- dfp->dfp_done = NULL;
- dfp->dfp_count = 0;
- INIT_LIST_HEAD(&dfp->dfp_work);
- list_add_tail(&dfp->dfp_list, &tp->t_dfops);
- }
+ dfp = xfs_defer_find_last(tp, ops);
+ if (!dfp || !xfs_defer_can_append(dfp, ops))
+ dfp = xfs_defer_alloc(tp, ops);
- list_add_tail(li, &dfp->dfp_work);
+ xfs_defer_add_item(dfp, li);
trace_xfs_defer_add_item(tp->t_mountp, dfp, li);
- dfp->dfp_count++;
+ return dfp;
+}
+
+/*
+ * Add a defer ops barrier to force two otherwise adjacent deferred work items
+ * to be tracked separately and have separate log items.
+ */
+void
+xfs_defer_add_barrier(
+ struct xfs_trans *tp)
+{
+ struct xfs_defer_pending *dfp;
+
+ ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
+
+ /* If the last defer op added was a barrier, we're done. */
+ dfp = xfs_defer_find_last(tp, &xfs_barrier_defer_type);
+ if (dfp)
+ return;
+
+ xfs_defer_alloc(tp, &xfs_barrier_defer_type);
+
+ trace_xfs_defer_add_item(tp->t_mountp, dfp, NULL);
+}
+
+/*
+ * Create a pending deferred work item to replay the recovered intent item
+ * and add it to the list.
+ */
+void
+xfs_defer_start_recovery(
+ struct xfs_log_item *lip,
+ struct list_head *r_dfops,
+ const struct xfs_defer_op_type *ops)
+{
+ struct xfs_defer_pending *dfp;
+
+ dfp = kmem_cache_zalloc(xfs_defer_pending_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ dfp->dfp_ops = ops;
+ dfp->dfp_intent = lip;
+ INIT_LIST_HEAD(&dfp->dfp_work);
+ list_add_tail(&dfp->dfp_list, r_dfops);
+}
+
+/*
+ * Cancel a deferred work item created to recover a log intent item. @dfp
+ * will be freed after this function returns.
+ */
+void
+xfs_defer_cancel_recovery(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp)
+{
+ xfs_defer_pending_abort(mp, dfp);
+ xfs_defer_pending_cancel_work(mp, dfp);
+}
+
+/* Replay the deferred work item created from a recovered log intent item. */
+int
+xfs_defer_finish_recovery(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct list_head *capture_list)
+{
+ const struct xfs_defer_op_type *ops = dfp->dfp_ops;
+ int error;
+
+ /* dfp is freed by recover_work and must not be accessed afterwards */
+ error = ops->recover_work(dfp, capture_list);
+ if (error)
+ trace_xlog_intent_recovery_failed(mp, ops, error);
+ return error;
}
/*
@@ -769,7 +1029,7 @@ xfs_defer_ops_capture_abort(
{
unsigned short i;
- xfs_defer_pending_abort(mp, &dfc->dfc_dfops);
+ xfs_defer_pending_abort_list(mp, &dfc->dfc_dfops);
xfs_defer_cancel_list(mp, &dfc->dfc_dfops);
for (i = 0; i < dfc->dfc_held.dr_bufs; i++)
@@ -938,3 +1198,36 @@ xfs_defer_destroy_item_caches(void)
xfs_rmap_intent_destroy_cache();
xfs_defer_destroy_cache();
}
+
+/*
+ * Mark a deferred work item so that it will be requeued indefinitely without
+ * being finished. Caller must ensure there are no data dependencies on this
+ * work item in the meantime.
+ */
+void
+xfs_defer_item_pause(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ ASSERT(!(dfp->dfp_flags & XFS_DEFER_PAUSED));
+
+ dfp->dfp_flags |= XFS_DEFER_PAUSED;
+
+ trace_xfs_defer_item_pause(tp->t_mountp, dfp);
+}
+
+/*
+ * Release a paused deferred work item so that it will be finished during the
+ * next transaction roll.
+ */
+void
+xfs_defer_item_unpause(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ ASSERT(dfp->dfp_flags & XFS_DEFER_PAUSED);
+
+ dfp->dfp_flags &= ~XFS_DEFER_PAUSED;
+
+ trace_xfs_defer_item_unpause(tp->t_mountp, dfp);
+}
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index 8788ad5f6a73..18a9fb92dde8 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -11,19 +11,6 @@ struct xfs_defer_op_type;
struct xfs_defer_capture;
/*
- * Header for deferred operation list.
- */
-enum xfs_defer_ops_type {
- XFS_DEFER_OPS_TYPE_BMAP,
- XFS_DEFER_OPS_TYPE_REFCOUNT,
- XFS_DEFER_OPS_TYPE_RMAP,
- XFS_DEFER_OPS_TYPE_FREE,
- XFS_DEFER_OPS_TYPE_AGFL_FREE,
- XFS_DEFER_OPS_TYPE_ATTR,
- XFS_DEFER_OPS_TYPE_MAX,
-};
-
-/*
* Save a log intent item and a list of extents, so that we can replay
* whatever action had to happen to the extent list and file the log done
* item.
@@ -33,19 +20,35 @@ struct xfs_defer_pending {
struct list_head dfp_work; /* work items */
struct xfs_log_item *dfp_intent; /* log intent item */
struct xfs_log_item *dfp_done; /* log done item */
+ const struct xfs_defer_op_type *dfp_ops;
unsigned int dfp_count; /* # extent items */
- enum xfs_defer_ops_type dfp_type;
+ unsigned int dfp_flags;
};
-void xfs_defer_add(struct xfs_trans *tp, enum xfs_defer_ops_type type,
- struct list_head *h);
+/*
+ * Create a log intent item for this deferred item, but don't actually finish
+ * the work. Caller must clear this before the final transaction commit.
+ */
+#define XFS_DEFER_PAUSED (1U << 0)
+
+#define XFS_DEFER_PENDING_STRINGS \
+ { XFS_DEFER_PAUSED, "paused" }
+
+void xfs_defer_item_pause(struct xfs_trans *tp, struct xfs_defer_pending *dfp);
+void xfs_defer_item_unpause(struct xfs_trans *tp, struct xfs_defer_pending *dfp);
+
+struct xfs_defer_pending *xfs_defer_add(struct xfs_trans *tp, struct list_head *h,
+ const struct xfs_defer_op_type *ops);
int xfs_defer_finish_noroll(struct xfs_trans **tp);
int xfs_defer_finish(struct xfs_trans **tp);
+int xfs_defer_finish_one(struct xfs_trans *tp, struct xfs_defer_pending *dfp);
void xfs_defer_cancel(struct xfs_trans *);
void xfs_defer_move(struct xfs_trans *dtp, struct xfs_trans *stp);
/* Description of a deferred type. */
struct xfs_defer_op_type {
+ const char *name;
+ unsigned int max_items;
struct xfs_log_item *(*create_intent)(struct xfs_trans *tp,
struct list_head *items, unsigned int count, bool sort);
void (*abort_intent)(struct xfs_log_item *intent);
@@ -56,7 +59,11 @@ struct xfs_defer_op_type {
void (*finish_cleanup)(struct xfs_trans *tp,
struct xfs_btree_cur *state, int error);
void (*cancel_item)(struct list_head *item);
- unsigned int max_items;
+ int (*recover_work)(struct xfs_defer_pending *dfp,
+ struct list_head *capture_list);
+ struct xfs_log_item *(*relog_intent)(struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ struct xfs_log_item *done_item);
};
extern const struct xfs_defer_op_type xfs_bmap_update_defer_type;
@@ -125,7 +132,25 @@ void xfs_defer_ops_capture_abort(struct xfs_mount *mp,
struct xfs_defer_capture *d);
void xfs_defer_resources_rele(struct xfs_defer_resources *dres);
+void xfs_defer_start_recovery(struct xfs_log_item *lip,
+ struct list_head *r_dfops, const struct xfs_defer_op_type *ops);
+void xfs_defer_cancel_recovery(struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp);
+int xfs_defer_finish_recovery(struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp, struct list_head *capture_list);
+
+static inline void
+xfs_defer_add_item(
+ struct xfs_defer_pending *dfp,
+ struct list_head *work)
+{
+ list_add_tail(work, &dfp->dfp_work);
+ dfp->dfp_count++;
+}
+
int __init xfs_defer_init_item_caches(void);
void xfs_defer_destroy_item_caches(void);
+void xfs_defer_add_barrier(struct xfs_trans *tp);
+
#endif /* __XFS_DEFER_H__ */
diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c
index f5462fd582d5..a76673281514 100644
--- a/fs/xfs/libxfs/xfs_dir2.c
+++ b/fs/xfs/libxfs/xfs_dir2.c
@@ -196,7 +196,7 @@ xfs_dir_isempty(
return 1;
if (dp->i_disk_size > xfs_inode_data_fork_size(dp))
return 0;
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = dp->i_df.if_data;
return !sfp->count;
}
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 00f960a703b2..3c256d4cc40b 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -1089,7 +1089,7 @@ xfs_dir2_sf_to_block(
int newoffset; /* offset from current entry */
unsigned int offset = geo->data_entry_offset;
xfs_dir2_sf_entry_t *sfep; /* sf entry pointer */
- xfs_dir2_sf_hdr_t *oldsfp; /* old shortform header */
+ struct xfs_dir2_sf_hdr *oldsfp = ifp->if_data;
xfs_dir2_sf_hdr_t *sfp; /* shortform header */
__be16 *tagp; /* end of data entry */
struct xfs_name name;
@@ -1099,10 +1099,8 @@ xfs_dir2_sf_to_block(
ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
- oldsfp = (xfs_dir2_sf_hdr_t *)ifp->if_u1.if_data;
-
ASSERT(ifp->if_bytes == dp->i_disk_size);
- ASSERT(ifp->if_u1.if_data != NULL);
+ ASSERT(oldsfp != NULL);
ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(oldsfp->i8count));
ASSERT(dp->i_df.if_nextents == 0);
diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h
index 7404a9ff1a92..1db2e60ba827 100644
--- a/fs/xfs/libxfs/xfs_dir2_priv.h
+++ b/fs/xfs/libxfs/xfs_dir2_priv.h
@@ -175,7 +175,8 @@ extern int xfs_dir2_sf_create(struct xfs_da_args *args, xfs_ino_t pino);
extern int xfs_dir2_sf_lookup(struct xfs_da_args *args);
extern int xfs_dir2_sf_removename(struct xfs_da_args *args);
extern int xfs_dir2_sf_replace(struct xfs_da_args *args);
-extern xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_inode *ip);
+xfs_failaddr_t xfs_dir2_sf_verify(struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *sfp, int64_t size);
int xfs_dir2_sf_entsize(struct xfs_mount *mp,
struct xfs_dir2_sf_hdr *hdr, int len);
void xfs_dir2_sf_put_ino(struct xfs_mount *mp, struct xfs_dir2_sf_hdr *hdr,
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 8cd37e6e9d38..e1f83fc7b6ad 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -364,25 +364,23 @@ int /* error */
xfs_dir2_sf_addname(
xfs_da_args_t *args) /* operation arguments */
{
- xfs_inode_t *dp; /* incore directory inode */
+ struct xfs_inode *dp = args->dp;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int error; /* error return value */
int incr_isize; /* total change in size */
int new_isize; /* size after adding name */
int objchange; /* changing to 8-byte inodes */
xfs_dir2_data_aoff_t offset = 0; /* offset for new entry */
int pick; /* which algorithm to use */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
xfs_dir2_sf_entry_t *sfep = NULL; /* shortform entry */
trace_xfs_dir2_sf_addname(args);
ASSERT(xfs_dir2_sf_lookup(args) == -ENOENT);
- dp = args->dp;
ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Compute entry (and change in) size.
@@ -462,20 +460,17 @@ xfs_dir2_sf_addname_easy(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
- int byteoff; /* byte offset in sf dir */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
+ int byteoff = (int)((char *)sfep - (char *)sfp);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
- byteoff = (int)((char *)sfep - (char *)sfp);
/*
* Grow the in-inode space.
*/
- xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen),
+ sfp = xfs_idata_realloc(dp, xfs_dir2_sf_entsize(mp, sfp, args->namelen),
XFS_DATA_FORK);
/*
* Need to set up again due to realloc of the inode data.
*/
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
sfep = (xfs_dir2_sf_entry_t *)((char *)sfp + byteoff);
/*
* Fill in the new entry.
@@ -528,11 +523,10 @@ xfs_dir2_sf_addname_hard(
/*
* Copy the old directory to the stack buffer.
*/
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
old_isize = (int)dp->i_disk_size;
buf = kmem_alloc(old_isize, 0);
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
- memcpy(oldsfp, sfp, old_isize);
+ memcpy(oldsfp, dp->i_df.if_data, old_isize);
/*
* Loop over the old directory finding the place we're going
* to insert the new entry.
@@ -556,11 +550,8 @@ xfs_dir2_sf_addname_hard(
* the data.
*/
xfs_idata_realloc(dp, -old_isize, XFS_DATA_FORK);
- xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
- /*
- * Reset the pointer since the buffer was reallocated.
- */
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = xfs_idata_realloc(dp, new_isize, XFS_DATA_FORK);
+
/*
* Copy the first part of the directory, including the header.
*/
@@ -610,11 +601,10 @@ xfs_dir2_sf_addname_pick(
int i; /* entry number */
xfs_dir2_data_aoff_t offset; /* data block offset */
xfs_dir2_sf_entry_t *sfep; /* shortform entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int size; /* entry's data size */
int used; /* data bytes used */
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
size = xfs_dir2_data_entsize(mp, args->namelen);
offset = args->geo->data_first_offset;
sfep = xfs_dir2_sf_firstentry(sfp);
@@ -673,14 +663,13 @@ xfs_dir2_sf_check(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int i; /* entry number */
int i8count; /* number of big inode#s */
xfs_ino_t ino; /* entry inode number */
int offset; /* data offset */
xfs_dir2_sf_entry_t *sfep; /* shortform dir entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
offset = args->geo->data_first_offset;
ino = xfs_dir2_sf_get_parent_ino(sfp);
i8count = ino > XFS_DIR2_MAX_SHORT_INUM;
@@ -707,11 +696,10 @@ xfs_dir2_sf_check(
/* Verify the consistency of an inline directory. */
xfs_failaddr_t
xfs_dir2_sf_verify(
- struct xfs_inode *ip)
+ struct xfs_mount *mp,
+ struct xfs_dir2_sf_hdr *sfp,
+ int64_t size)
{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
- struct xfs_dir2_sf_hdr *sfp;
struct xfs_dir2_sf_entry *sfep;
struct xfs_dir2_sf_entry *next_sfep;
char *endp;
@@ -719,15 +707,9 @@ xfs_dir2_sf_verify(
int i;
int i8count;
int offset;
- int64_t size;
int error;
uint8_t filetype;
- ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
-
- sfp = (struct xfs_dir2_sf_hdr *)ifp->if_u1.if_data;
- size = ifp->if_bytes;
-
/*
* Give up if the directory is way too short.
*/
@@ -834,15 +816,13 @@ xfs_dir2_sf_create(
ASSERT(dp->i_df.if_bytes == 0);
i8count = pino > XFS_DIR2_MAX_SHORT_INUM;
size = xfs_dir2_sf_hdr_size(i8count);
+
/*
- * Make a buffer for the data.
- */
- xfs_idata_realloc(dp, size, XFS_DATA_FORK);
- /*
- * Fill in the header,
+ * Make a buffer for the data and fill in the header.
*/
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = xfs_idata_realloc(dp, size, XFS_DATA_FORK);
sfp->i8count = i8count;
+
/*
* Now can put in the inode number, since i8count is set.
*/
@@ -864,9 +844,9 @@ xfs_dir2_sf_lookup(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int i; /* entry index */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
enum xfs_dacmp cmp; /* comparison result */
xfs_dir2_sf_entry_t *ci_sfep; /* case-insens. entry */
@@ -877,8 +857,7 @@ xfs_dir2_sf_lookup(
ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Special case for .
@@ -940,13 +919,13 @@ xfs_dir2_sf_removename(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int byteoff; /* offset of removed entry */
int entsize; /* this entry's size */
int i; /* shortform entry index */
int newsize; /* new inode size */
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
trace_xfs_dir2_sf_removename(args);
@@ -954,8 +933,7 @@ xfs_dir2_sf_removename(
oldsize = (int)dp->i_disk_size;
ASSERT(oldsize >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == oldsize);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
ASSERT(oldsize >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
* Loop over the old directory entries.
@@ -992,11 +970,12 @@ xfs_dir2_sf_removename(
*/
sfp->count--;
dp->i_disk_size = newsize;
+
/*
* Reallocate, making it smaller.
*/
- xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = xfs_idata_realloc(dp, newsize - oldsize, XFS_DATA_FORK);
+
/*
* Are we changing inode number size?
*/
@@ -1019,13 +998,12 @@ xfs_dir2_sf_replace_needblock(
struct xfs_inode *dp,
xfs_ino_t inum)
{
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int newsize;
- struct xfs_dir2_sf_hdr *sfp;
if (dp->i_df.if_format != XFS_DINODE_FMT_LOCAL)
return false;
- sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
newsize = dp->i_df.if_bytes + (sfp->count + 1) * XFS_INO64_DIFF;
return inum > XFS_DIR2_MAX_SHORT_INUM &&
@@ -1041,19 +1019,18 @@ xfs_dir2_sf_replace(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
int i; /* entry index */
xfs_ino_t ino=0; /* entry old inode number */
int i8elevated; /* sf_toino8 set i8count=1 */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
trace_xfs_dir2_sf_replace(args);
ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_disk_size >= offsetof(struct xfs_dir2_sf_hdr, parent));
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
ASSERT(dp->i_disk_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
/*
@@ -1076,7 +1053,7 @@ xfs_dir2_sf_replace(
*/
xfs_dir2_sf_toino8(args);
i8elevated = 1;
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = dp->i_df.if_data;
} else
i8elevated = 0;
@@ -1157,11 +1134,11 @@ xfs_dir2_sf_toino4(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data;
char *buf; /* old dir's buffer */
int i; /* entry index */
int newsize; /* new inode size */
xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
- xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* new sf entry */
xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
@@ -1175,7 +1152,6 @@ xfs_dir2_sf_toino4(
*/
oldsize = dp->i_df.if_bytes;
buf = kmem_alloc(oldsize, 0);
- oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
ASSERT(oldsfp->i8count == 1);
memcpy(buf, oldsfp, oldsize);
/*
@@ -1188,7 +1164,7 @@ xfs_dir2_sf_toino4(
* Reset our pointers, the data has moved.
*/
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = dp->i_df.if_data;
/*
* Fill in the new header.
*/
@@ -1230,11 +1206,11 @@ xfs_dir2_sf_toino8(
{
struct xfs_inode *dp = args->dp;
struct xfs_mount *mp = dp->i_mount;
+ struct xfs_dir2_sf_hdr *oldsfp = dp->i_df.if_data;
char *buf; /* old dir's buffer */
int i; /* entry index */
int newsize; /* new inode size */
xfs_dir2_sf_entry_t *oldsfep; /* old sf entry */
- xfs_dir2_sf_hdr_t *oldsfp; /* old sf directory */
int oldsize; /* old inode size */
xfs_dir2_sf_entry_t *sfep; /* new sf entry */
xfs_dir2_sf_hdr_t *sfp; /* new sf directory */
@@ -1248,7 +1224,6 @@ xfs_dir2_sf_toino8(
*/
oldsize = dp->i_df.if_bytes;
buf = kmem_alloc(oldsize, 0);
- oldsfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
ASSERT(oldsfp->i8count == 0);
memcpy(buf, oldsfp, oldsize);
/*
@@ -1261,7 +1236,7 @@ xfs_dir2_sf_toino8(
* Reset our pointers, the data has moved.
*/
oldsfp = (xfs_dir2_sf_hdr_t *)buf;
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ sfp = dp->i_df.if_data;
/*
* Fill in the new header.
*/
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 9a88aba1589f..382ab1e71c0b 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -1008,7 +1008,7 @@ enum xfs_dinode_fmt {
* Return pointers to the data or attribute forks.
*/
#define XFS_DFORK_DPTR(dip) \
- ((char *)dip + xfs_dinode_size(dip->di_version))
+ ((void *)dip + xfs_dinode_size(dip->di_version))
#define XFS_DFORK_APTR(dip) \
(XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
#define XFS_DFORK_PTR(dip,w) \
@@ -1156,20 +1156,6 @@ static inline bool xfs_dinode_has_large_extent_counts(
#define XFS_DFL_RTEXTSIZE (64 * 1024) /* 64kB */
#define XFS_MIN_RTEXTSIZE (4 * 1024) /* 4kB */
-#define XFS_BLOCKSIZE(mp) ((mp)->m_sb.sb_blocksize)
-#define XFS_BLOCKMASK(mp) ((mp)->m_blockmask)
-
-/*
- * RT bit manipulation macros.
- */
-#define XFS_RTMIN(a,b) ((a) < (b) ? (a) : (b))
-#define XFS_RTMAX(a,b) ((a) > (b) ? (a) : (b))
-
-#define XFS_RTLOBIT(w) xfs_lowbit32(w)
-#define XFS_RTHIBIT(w) xfs_highbit32(w)
-
-#define XFS_RTBLOCKLOG(b) xfs_highbit64(b)
-
/*
* Dquot and dquot block format definitions
*/
@@ -1272,6 +1258,9 @@ static inline time64_t xfs_dq_bigtime_to_unix(uint32_t ondisk_seconds)
#define XFS_DQ_GRACE_MIN ((int64_t)0)
#define XFS_DQ_GRACE_MAX ((int64_t)U32_MAX)
+/* Maximum id value for a quota record */
+#define XFS_DQ_ID_MAX (U32_MAX)
+
/*
* This is the main portion of the on-disk representation of quota information
* for a user. We pad this with some more expansion room to construct the on
diff --git a/fs/xfs/libxfs/xfs_health.h b/fs/xfs/libxfs/xfs_health.h
index 99e796256c5d..6296993ff8f3 100644
--- a/fs/xfs/libxfs/xfs_health.h
+++ b/fs/xfs/libxfs/xfs_health.h
@@ -68,6 +68,11 @@ struct xfs_fsop_geom;
#define XFS_SICK_INO_SYMLINK (1 << 6) /* symbolic link remote target */
#define XFS_SICK_INO_PARENT (1 << 7) /* parent pointers */
+#define XFS_SICK_INO_BMBTD_ZAPPED (1 << 8) /* data fork erased */
+#define XFS_SICK_INO_BMBTA_ZAPPED (1 << 9) /* attr fork erased */
+#define XFS_SICK_INO_DIR_ZAPPED (1 << 10) /* directory erased */
+#define XFS_SICK_INO_SYMLINK_ZAPPED (1 << 11) /* symlink erased */
+
/* Primary evidence of health problems in a given group. */
#define XFS_SICK_FS_PRIMARY (XFS_SICK_FS_COUNTERS | \
XFS_SICK_FS_UQUOTA | \
@@ -97,6 +102,11 @@ struct xfs_fsop_geom;
XFS_SICK_INO_SYMLINK | \
XFS_SICK_INO_PARENT)
+#define XFS_SICK_INO_ZAPPED (XFS_SICK_INO_BMBTD_ZAPPED | \
+ XFS_SICK_INO_BMBTA_ZAPPED | \
+ XFS_SICK_INO_DIR_ZAPPED | \
+ XFS_SICK_INO_SYMLINK_ZAPPED)
+
/* These functions must be provided by the xfs implementation. */
void xfs_fs_mark_sick(struct xfs_mount *mp, unsigned int mask);
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index b83e54c70906..2361a22035b0 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -95,18 +95,28 @@ xfs_inobt_btrec_to_irec(
irec->ir_free = be64_to_cpu(rec->inobt.ir_free);
}
+/* Compute the freecount of an incore inode record. */
+uint8_t
+xfs_inobt_rec_freecount(
+ const struct xfs_inobt_rec_incore *irec)
+{
+ uint64_t realfree = irec->ir_free;
+
+ if (xfs_inobt_issparse(irec->ir_holemask))
+ realfree &= xfs_inobt_irec_to_allocmask(irec);
+ return hweight64(realfree);
+}
+
/* Simple checks for inode records. */
xfs_failaddr_t
xfs_inobt_check_irec(
- struct xfs_btree_cur *cur,
+ struct xfs_perag *pag,
const struct xfs_inobt_rec_incore *irec)
{
- uint64_t realfree;
-
/* Record has to be properly aligned within the AG. */
- if (!xfs_verify_agino(cur->bc_ag.pag, irec->ir_startino))
+ if (!xfs_verify_agino(pag, irec->ir_startino))
return __this_address;
- if (!xfs_verify_agino(cur->bc_ag.pag,
+ if (!xfs_verify_agino(pag,
irec->ir_startino + XFS_INODES_PER_CHUNK - 1))
return __this_address;
if (irec->ir_count < XFS_INODES_PER_HOLEMASK_BIT ||
@@ -115,12 +125,7 @@ xfs_inobt_check_irec(
if (irec->ir_freecount > XFS_INODES_PER_CHUNK)
return __this_address;
- /* if there are no holes, return the first available offset */
- if (!xfs_inobt_issparse(irec->ir_holemask))
- realfree = irec->ir_free;
- else
- realfree = irec->ir_free & xfs_inobt_irec_to_allocmask(irec);
- if (hweight64(realfree) != irec->ir_freecount)
+ if (xfs_inobt_rec_freecount(irec) != irec->ir_freecount)
return __this_address;
return NULL;
@@ -164,7 +169,7 @@ xfs_inobt_get_rec(
return error;
xfs_inobt_btrec_to_irec(mp, rec, irec);
- fa = xfs_inobt_check_irec(cur, irec);
+ fa = xfs_inobt_check_irec(cur->bc_ag.pag, irec);
if (fa)
return xfs_inobt_complain_bad_rec(cur, fa, irec);
@@ -1854,7 +1859,7 @@ xfs_difree_inode_chunk(
return xfs_free_extent_later(tp,
XFS_AGB_TO_FSB(mp, agno, sagbno),
M_IGEO(mp)->ialloc_blks, &XFS_RMAP_OINFO_INODES,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
}
/* holemask is only 16-bits (fits in an unsigned long) */
@@ -1900,7 +1905,8 @@ xfs_difree_inode_chunk(
ASSERT(contigblk % mp->m_sb.sb_spino_align == 0);
error = xfs_free_extent_later(tp,
XFS_AGB_TO_FSB(mp, agno, agbno), contigblk,
- &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE);
+ &XFS_RMAP_OINFO_INODES, XFS_AG_RESV_NONE,
+ false);
if (error)
return error;
@@ -2739,7 +2745,7 @@ xfs_ialloc_count_inodes_rec(
xfs_failaddr_t fa;
xfs_inobt_btrec_to_irec(cur->bc_mp, rec, &irec);
- fa = xfs_inobt_check_irec(cur, &irec);
+ fa = xfs_inobt_check_irec(cur->bc_ag.pag, &irec);
if (fa)
return xfs_inobt_complain_bad_rec(cur, fa, &irec);
diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h
index fe824bb04a09..f1412183bb44 100644
--- a/fs/xfs/libxfs/xfs_ialloc.h
+++ b/fs/xfs/libxfs/xfs_ialloc.h
@@ -79,6 +79,7 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
*/
int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
xfs_inobt_rec_incore_t *rec, int *stat);
+uint8_t xfs_inobt_rec_freecount(const struct xfs_inobt_rec_incore *irec);
/*
* Inode chunk initialisation routine
@@ -93,7 +94,7 @@ union xfs_btree_rec;
void xfs_inobt_btrec_to_irec(struct xfs_mount *mp,
const union xfs_btree_rec *rec,
struct xfs_inobt_rec_incore *irec);
-xfs_failaddr_t xfs_inobt_check_irec(struct xfs_btree_cur *cur,
+xfs_failaddr_t xfs_inobt_check_irec(struct xfs_perag *pag,
const struct xfs_inobt_rec_incore *irec);
int xfs_ialloc_has_inodes_at_extent(struct xfs_btree_cur *cur,
xfs_agblock_t bno, xfs_extlen_t len,
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index 9258f01c0015..42a5e1f227a0 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -161,7 +161,7 @@ __xfs_inobt_free_block(
xfs_inobt_mod_blockcount(cur, -1);
fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
- &XFS_RMAP_OINFO_INOBT, resv);
+ &XFS_RMAP_OINFO_INOBT, resv, false);
}
STATIC int
diff --git a/fs/xfs/libxfs/xfs_iext_tree.c b/fs/xfs/libxfs/xfs_iext_tree.c
index 773cf4349428..f4e6b200cdf8 100644
--- a/fs/xfs/libxfs/xfs_iext_tree.c
+++ b/fs/xfs/libxfs/xfs_iext_tree.c
@@ -158,7 +158,7 @@ static void *
xfs_iext_find_first_leaf(
struct xfs_ifork *ifp)
{
- struct xfs_iext_node *node = ifp->if_u1.if_root;
+ struct xfs_iext_node *node = ifp->if_data;
int height;
if (!ifp->if_height)
@@ -176,7 +176,7 @@ static void *
xfs_iext_find_last_leaf(
struct xfs_ifork *ifp)
{
- struct xfs_iext_node *node = ifp->if_u1.if_root;
+ struct xfs_iext_node *node = ifp->if_data;
int height, i;
if (!ifp->if_height)
@@ -306,7 +306,7 @@ xfs_iext_find_level(
xfs_fileoff_t offset,
int level)
{
- struct xfs_iext_node *node = ifp->if_u1.if_root;
+ struct xfs_iext_node *node = ifp->if_data;
int height, i;
if (!ifp->if_height)
@@ -402,12 +402,12 @@ xfs_iext_grow(
int i;
if (ifp->if_height == 1) {
- struct xfs_iext_leaf *prev = ifp->if_u1.if_root;
+ struct xfs_iext_leaf *prev = ifp->if_data;
node->keys[0] = xfs_iext_leaf_key(prev, 0);
node->ptrs[0] = prev;
} else {
- struct xfs_iext_node *prev = ifp->if_u1.if_root;
+ struct xfs_iext_node *prev = ifp->if_data;
ASSERT(ifp->if_height > 1);
@@ -418,7 +418,7 @@ xfs_iext_grow(
for (i = 1; i < KEYS_PER_NODE; i++)
node->keys[i] = XFS_IEXT_KEY_INVALID;
- ifp->if_u1.if_root = node;
+ ifp->if_data = node;
ifp->if_height++;
}
@@ -430,7 +430,7 @@ xfs_iext_update_node(
int level,
void *ptr)
{
- struct xfs_iext_node *node = ifp->if_u1.if_root;
+ struct xfs_iext_node *node = ifp->if_data;
int height, i;
for (height = ifp->if_height; height > level; height--) {
@@ -583,11 +583,11 @@ xfs_iext_alloc_root(
{
ASSERT(ifp->if_bytes == 0);
- ifp->if_u1.if_root = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
+ ifp->if_data = kmem_zalloc(sizeof(struct xfs_iext_rec), KM_NOFS);
ifp->if_height = 1;
/* now that we have a node step into it */
- cur->leaf = ifp->if_u1.if_root;
+ cur->leaf = ifp->if_data;
cur->pos = 0;
}
@@ -603,9 +603,9 @@ xfs_iext_realloc_root(
if (new_size / sizeof(struct xfs_iext_rec) == RECS_PER_LEAF)
new_size = NODE_SIZE;
- new = krealloc(ifp->if_u1.if_root, new_size, GFP_NOFS | __GFP_NOFAIL);
+ new = krealloc(ifp->if_data, new_size, GFP_NOFS | __GFP_NOFAIL);
memset(new + ifp->if_bytes, 0, new_size - ifp->if_bytes);
- ifp->if_u1.if_root = new;
+ ifp->if_data = new;
cur->leaf = new;
}
@@ -622,13 +622,11 @@ static inline void xfs_iext_inc_seq(struct xfs_ifork *ifp)
}
void
-xfs_iext_insert(
- struct xfs_inode *ip,
+xfs_iext_insert_raw(
+ struct xfs_ifork *ifp,
struct xfs_iext_cursor *cur,
- struct xfs_bmbt_irec *irec,
- int state)
+ struct xfs_bmbt_irec *irec)
{
- struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
xfs_fileoff_t offset = irec->br_startoff;
struct xfs_iext_leaf *new = NULL;
int nr_entries, i;
@@ -662,12 +660,23 @@ xfs_iext_insert(
xfs_iext_set(cur_rec(cur), irec);
ifp->if_bytes += sizeof(struct xfs_iext_rec);
- trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
-
if (new)
xfs_iext_insert_node(ifp, xfs_iext_leaf_key(new, 0), new, 2);
}
+void
+xfs_iext_insert(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *irec,
+ int state)
+{
+ struct xfs_ifork *ifp = xfs_iext_state_to_fork(ip, state);
+
+ xfs_iext_insert_raw(ifp, cur, irec);
+ trace_xfs_iext_insert(ip, cur, state, _RET_IP_);
+}
+
static struct xfs_iext_node *
xfs_iext_rebalance_node(
struct xfs_iext_node *parent,
@@ -777,8 +786,8 @@ again:
* If we are at the root and only one entry is left we can just
* free this node and update the root pointer.
*/
- ASSERT(node == ifp->if_u1.if_root);
- ifp->if_u1.if_root = node->ptrs[0];
+ ASSERT(node == ifp->if_data);
+ ifp->if_data = node->ptrs[0];
ifp->if_height--;
kmem_free(node);
}
@@ -854,8 +863,8 @@ xfs_iext_free_last_leaf(
struct xfs_ifork *ifp)
{
ifp->if_height--;
- kmem_free(ifp->if_u1.if_root);
- ifp->if_u1.if_root = NULL;
+ kmem_free(ifp->if_data);
+ ifp->if_data = NULL;
}
void
@@ -872,7 +881,7 @@ xfs_iext_remove(
trace_xfs_iext_remove(ip, cur, state, _RET_IP_);
ASSERT(ifp->if_height > 0);
- ASSERT(ifp->if_u1.if_root != NULL);
+ ASSERT(ifp->if_data != NULL);
ASSERT(xfs_iext_valid(ifp, cur));
xfs_iext_inc_seq(ifp);
@@ -1042,9 +1051,9 @@ void
xfs_iext_destroy(
struct xfs_ifork *ifp)
{
- xfs_iext_destroy_node(ifp->if_u1.if_root, ifp->if_height);
+ xfs_iext_destroy_node(ifp->if_data, ifp->if_height);
ifp->if_bytes = 0;
ifp->if_height = 0;
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index 5a2e7ddfa76d..f4569e18a8d0 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -50,12 +50,15 @@ xfs_init_local_fork(
mem_size++;
if (size) {
- ifp->if_u1.if_data = kmem_alloc(mem_size, KM_NOFS);
- memcpy(ifp->if_u1.if_data, data, size);
+ char *new_data = kmem_alloc(mem_size, KM_NOFS);
+
+ memcpy(new_data, data, size);
if (zero_terminate)
- ifp->if_u1.if_data[size] = '\0';
+ new_data[size] = '\0';
+
+ ifp->if_data = new_data;
} else {
- ifp->if_u1.if_data = NULL;
+ ifp->if_data = NULL;
}
ifp->if_bytes = size;
@@ -125,7 +128,7 @@ xfs_iformat_extents(
}
ifp->if_bytes = 0;
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
ifp->if_height = 0;
if (size) {
dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
@@ -212,7 +215,7 @@ xfs_iformat_btree(
ifp->if_broot, size);
ifp->if_bytes = 0;
- ifp->if_u1.if_root = NULL;
+ ifp->if_data = NULL;
ifp->if_height = 0;
return 0;
}
@@ -276,10 +279,9 @@ static uint16_t
xfs_dfork_attr_shortform_size(
struct xfs_dinode *dip)
{
- struct xfs_attr_shortform *atp =
- (struct xfs_attr_shortform *)XFS_DFORK_APTR(dip);
+ struct xfs_attr_sf_hdr *sf = XFS_DFORK_APTR(dip);
- return be16_to_cpu(atp->hdr.totsize);
+ return be16_to_cpu(sf->totsize);
}
void
@@ -493,7 +495,7 @@ xfs_iroot_realloc(
* byte_diff -- the change in the number of bytes, positive or negative,
* requested for the if_data array.
*/
-void
+void *
xfs_idata_realloc(
struct xfs_inode *ip,
int64_t byte_diff,
@@ -505,21 +507,18 @@ xfs_idata_realloc(
ASSERT(new_size >= 0);
ASSERT(new_size <= xfs_inode_fork_size(ip, whichfork));
- if (byte_diff == 0)
- return;
-
- if (new_size == 0) {
- kmem_free(ifp->if_u1.if_data);
- ifp->if_u1.if_data = NULL;
- ifp->if_bytes = 0;
- return;
+ if (byte_diff) {
+ ifp->if_data = krealloc(ifp->if_data, new_size,
+ GFP_NOFS | __GFP_NOFAIL);
+ if (new_size == 0)
+ ifp->if_data = NULL;
+ ifp->if_bytes = new_size;
}
- ifp->if_u1.if_data = krealloc(ifp->if_u1.if_data, new_size,
- GFP_NOFS | __GFP_NOFAIL);
- ifp->if_bytes = new_size;
+ return ifp->if_data;
}
+/* Free all memory and reset a fork back to its initial state. */
void
xfs_idestroy_fork(
struct xfs_ifork *ifp)
@@ -531,8 +530,8 @@ xfs_idestroy_fork(
switch (ifp->if_format) {
case XFS_DINODE_FMT_LOCAL:
- kmem_free(ifp->if_u1.if_data);
- ifp->if_u1.if_data = NULL;
+ kmem_free(ifp->if_data);
+ ifp->if_data = NULL;
break;
case XFS_DINODE_FMT_EXTENTS:
case XFS_DINODE_FMT_BTREE:
@@ -625,9 +624,9 @@ xfs_iflush_fork(
case XFS_DINODE_FMT_LOCAL:
if ((iip->ili_fields & dataflag[whichfork]) &&
(ifp->if_bytes > 0)) {
- ASSERT(ifp->if_u1.if_data != NULL);
+ ASSERT(ifp->if_data != NULL);
ASSERT(ifp->if_bytes <= xfs_inode_fork_size(ip, whichfork));
- memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
+ memcpy(cp, ifp->if_data, ifp->if_bytes);
}
break;
@@ -702,19 +701,27 @@ xfs_ifork_verify_local_data(
xfs_failaddr_t fa = NULL;
switch (VFS_I(ip)->i_mode & S_IFMT) {
- case S_IFDIR:
- fa = xfs_dir2_sf_verify(ip);
+ case S_IFDIR: {
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+ struct xfs_dir2_sf_hdr *sfp = ifp->if_data;
+
+ fa = xfs_dir2_sf_verify(mp, sfp, ifp->if_bytes);
break;
- case S_IFLNK:
- fa = xfs_symlink_shortform_verify(ip);
+ }
+ case S_IFLNK: {
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
+
+ fa = xfs_symlink_shortform_verify(ifp->if_data, ifp->if_bytes);
break;
+ }
default:
break;
}
if (fa) {
xfs_inode_verifier_error(ip, -EFSCORRUPTED, "data fork",
- ip->i_df.if_u1.if_data, ip->i_df.if_bytes, fa);
+ ip->i_df.if_data, ip->i_df.if_bytes, fa);
return -EFSCORRUPTED;
}
@@ -729,14 +736,17 @@ xfs_ifork_verify_local_attr(
struct xfs_ifork *ifp = &ip->i_af;
xfs_failaddr_t fa;
- if (!xfs_inode_has_attr_fork(ip))
+ if (!xfs_inode_has_attr_fork(ip)) {
fa = __this_address;
- else
- fa = xfs_attr_shortform_verify(ip);
+ } else {
+ struct xfs_ifork *ifp = &ip->i_af;
+ ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
+ fa = xfs_attr_shortform_verify(ifp->if_data, ifp->if_bytes);
+ }
if (fa) {
xfs_inode_verifier_error(ip, -EFSCORRUPTED, "attr fork",
- ifp->if_u1.if_data, ifp->if_bytes, fa);
+ ifp->if_data, ifp->if_bytes, fa);
return -EFSCORRUPTED;
}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index 96d307784c85..96303249d28a 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -13,14 +13,12 @@ struct xfs_dinode;
* File incore extent information, present for each of data & attr forks.
*/
struct xfs_ifork {
- int64_t if_bytes; /* bytes in if_u1 */
+ int64_t if_bytes; /* bytes in if_data */
struct xfs_btree_block *if_broot; /* file's incore btree root */
unsigned int if_seq; /* fork mod counter */
int if_height; /* height of the extent tree */
- union {
- void *if_root; /* extent tree root */
- char *if_data; /* inline file data */
- } if_u1;
+ void *if_data; /* extent tree root or
+ inline data */
xfs_extnum_t if_nextents; /* # of extents in this fork */
short if_broot_bytes; /* bytes allocated for root */
int8_t if_format; /* format of this fork */
@@ -170,7 +168,7 @@ int xfs_iformat_attr_fork(struct xfs_inode *, struct xfs_dinode *);
void xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
struct xfs_inode_log_item *, int);
void xfs_idestroy_fork(struct xfs_ifork *ifp);
-void xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
+void * xfs_idata_realloc(struct xfs_inode *ip, int64_t byte_diff,
int whichfork);
void xfs_iroot_realloc(struct xfs_inode *, int, int);
int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
@@ -180,6 +178,9 @@ void xfs_init_local_fork(struct xfs_inode *ip, int whichfork,
const void *data, int64_t size);
xfs_extnum_t xfs_iext_count(struct xfs_ifork *ifp);
+void xfs_iext_insert_raw(struct xfs_ifork *ifp,
+ struct xfs_iext_cursor *cur,
+ struct xfs_bmbt_irec *irec);
void xfs_iext_insert(struct xfs_inode *, struct xfs_iext_cursor *cur,
struct xfs_bmbt_irec *, int);
void xfs_iext_remove(struct xfs_inode *, struct xfs_iext_cursor *,
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index a5100a11faf9..9fe7a9564bca 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -11,6 +11,7 @@
* define how recovery should work for that type of log item.
*/
struct xlog_recover_item;
+struct xfs_defer_op_type;
/* Sorting hat for log items as they're read in. */
enum xlog_recover_reorder {
@@ -153,4 +154,11 @@ xlog_recover_resv(const struct xfs_trans_res *r)
return ret;
}
+struct xfs_defer_pending;
+
+void xlog_recover_intent_item(struct xlog *log, struct xfs_log_item *lip,
+ xfs_lsn_t lsn, const struct xfs_defer_op_type *ops);
+int xlog_recover_finish_intent(struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp);
+
#endif /* __XFS_LOG_RECOVER_H__ */
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h
index 21a7e350b4c5..81885a6a028e 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/libxfs/xfs_ondisk.h
@@ -7,16 +7,16 @@
#define __XFS_ONDISK_H
#define XFS_CHECK_STRUCT_SIZE(structname, size) \
- BUILD_BUG_ON_MSG(sizeof(structname) != (size), "XFS: sizeof(" \
- #structname ") is wrong, expected " #size)
+ static_assert(sizeof(structname) == (size), \
+ "XFS: sizeof(" #structname ") is wrong, expected " #size)
#define XFS_CHECK_OFFSET(structname, member, off) \
- BUILD_BUG_ON_MSG(offsetof(structname, member) != (off), \
+ static_assert(offsetof(structname, member) == (off), \
"XFS: offsetof(" #structname ", " #member ") is wrong, " \
"expected " #off)
#define XFS_CHECK_VALUE(value, expected) \
- BUILD_BUG_ON_MSG((value) != (expected), \
+ static_assert((value) == (expected), \
"XFS: value of " #value " is wrong, expected " #expected)
static inline void __init
@@ -93,13 +93,13 @@ xfs_check_ondisk_structs(void)
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8);
XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9);
XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32);
- XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].valuelen, 5);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].flags, 6);
- XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].nameval, 7);
+ XFS_CHECK_STRUCT_SIZE(struct xfs_attr_sf_hdr, 4);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, totsize, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_hdr, count, 2);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, namelen, 0);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, valuelen, 1);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, flags, 2);
+ XFS_CHECK_OFFSET(struct xfs_attr_sf_entry, nameval, 3);
XFS_CHECK_STRUCT_SIZE(xfs_da_blkinfo_t, 12);
XFS_CHECK_STRUCT_SIZE(xfs_da_intnode_t, 16);
XFS_CHECK_STRUCT_SIZE(xfs_da_node_entry_t, 8);
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
index 646b3fa362ad..6709a7f8bad5 100644
--- a/fs/xfs/libxfs/xfs_refcount.c
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -123,11 +123,9 @@ xfs_refcount_btrec_to_irec(
/* Simple checks for refcount records. */
xfs_failaddr_t
xfs_refcount_check_irec(
- struct xfs_btree_cur *cur,
+ struct xfs_perag *pag,
const struct xfs_refcount_irec *irec)
{
- struct xfs_perag *pag = cur->bc_ag.pag;
-
if (irec->rc_blockcount == 0 || irec->rc_blockcount > MAXREFCEXTLEN)
return __this_address;
@@ -179,7 +177,7 @@ xfs_refcount_get_rec(
return error;
xfs_refcount_btrec_to_irec(rec, irec);
- fa = xfs_refcount_check_irec(cur, irec);
+ fa = xfs_refcount_check_irec(cur->bc_ag.pag, irec);
if (fa)
return xfs_refcount_complain_bad_rec(cur, fa, irec);
@@ -1153,7 +1151,7 @@ xfs_refcount_adjust_extents(
tmp.rc_startblock);
error = xfs_free_extent_later(cur->bc_tp, fsbno,
tmp.rc_blockcount, NULL,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
goto out_error;
}
@@ -1215,7 +1213,7 @@ xfs_refcount_adjust_extents(
ext.rc_startblock);
error = xfs_free_extent_later(cur->bc_tp, fsbno,
ext.rc_blockcount, NULL,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
goto out_error;
}
@@ -1458,7 +1456,7 @@ __xfs_refcount_add(
ri->ri_blockcount = blockcount;
xfs_refcount_update_get_group(tp->t_mountp, ri);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
+ xfs_defer_add(tp, &ri->ri_list, &xfs_refcount_update_defer_type);
}
/*
@@ -1899,7 +1897,7 @@ xfs_refcount_recover_extent(
INIT_LIST_HEAD(&rr->rr_list);
xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
- if (xfs_refcount_check_irec(cur, &rr->rr_rrec) != NULL ||
+ if (xfs_refcount_check_irec(cur->bc_ag.pag, &rr->rr_rrec) != NULL ||
XFS_IS_CORRUPT(cur->bc_mp,
rr->rr_rrec.rc_domain != XFS_REFC_DOMAIN_COW)) {
kfree(rr);
@@ -1985,7 +1983,7 @@ xfs_refcount_recover_cow_leftovers(
/* Free the block. */
error = xfs_free_extent_later(tp, fsb,
rr->rr_rrec.rc_blockcount, NULL,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
goto out_trans;
@@ -2033,6 +2031,47 @@ xfs_refcount_has_records(
return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
}
+struct xfs_refcount_query_range_info {
+ xfs_refcount_query_range_fn fn;
+ void *priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_refcount_query_range_helper(
+ struct xfs_btree_cur *cur,
+ const union xfs_btree_rec *rec,
+ void *priv)
+{
+ struct xfs_refcount_query_range_info *query = priv;
+ struct xfs_refcount_irec irec;
+ xfs_failaddr_t fa;
+
+ xfs_refcount_btrec_to_irec(rec, &irec);
+ fa = xfs_refcount_check_irec(cur->bc_ag.pag, &irec);
+ if (fa)
+ return xfs_refcount_complain_bad_rec(cur, fa, &irec);
+
+ return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all refcount records between two keys. */
+int
+xfs_refcount_query_range(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *low_rec,
+ const struct xfs_refcount_irec *high_rec,
+ xfs_refcount_query_range_fn fn,
+ void *priv)
+{
+ union xfs_btree_irec low_brec = { .rc = *low_rec };
+ union xfs_btree_irec high_brec = { .rc = *high_rec };
+ struct xfs_refcount_query_range_info query = { .priv = priv, .fn = fn };
+
+ return xfs_btree_query_range(cur, &low_brec, &high_brec,
+ xfs_refcount_query_range_helper, &query);
+}
+
int __init
xfs_refcount_intent_init_cache(void)
{
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
index 783cd89ca195..9b56768a590c 100644
--- a/fs/xfs/libxfs/xfs_refcount.h
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -117,7 +117,7 @@ extern int xfs_refcount_has_records(struct xfs_btree_cur *cur,
union xfs_btree_rec;
extern void xfs_refcount_btrec_to_irec(const union xfs_btree_rec *rec,
struct xfs_refcount_irec *irec);
-xfs_failaddr_t xfs_refcount_check_irec(struct xfs_btree_cur *cur,
+xfs_failaddr_t xfs_refcount_check_irec(struct xfs_perag *pag,
const struct xfs_refcount_irec *irec);
extern int xfs_refcount_insert(struct xfs_btree_cur *cur,
struct xfs_refcount_irec *irec, int *stat);
@@ -127,4 +127,14 @@ extern struct kmem_cache *xfs_refcount_intent_cache;
int __init xfs_refcount_intent_init_cache(void);
void xfs_refcount_intent_destroy_cache(void);
+typedef int (*xfs_refcount_query_range_fn)(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *rec,
+ void *priv);
+
+int xfs_refcount_query_range(struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *low_rec,
+ const struct xfs_refcount_irec *high_rec,
+ xfs_refcount_query_range_fn fn, void *priv);
+
#endif /* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
index 5c3987d8dc24..0d80bd99147c 100644
--- a/fs/xfs/libxfs/xfs_refcount_btree.c
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -112,7 +112,7 @@ xfs_refcountbt_free_block(
be32_add_cpu(&agf->agf_refcount_blocks, -1);
xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
return xfs_free_extent_later(cur->bc_tp, fsbno, 1,
- &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
+ &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA, false);
}
STATIC int
@@ -226,7 +226,18 @@ xfs_refcountbt_verify(
level = be16_to_cpu(block->bb_level);
if (pag && xfs_perag_initialised_agf(pag)) {
- if (level >= pag->pagf_refcount_level)
+ unsigned int maxlevel = pag->pagf_refcount_level;
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+ /*
+ * Online repair could be rewriting the refcount btree, so
+ * we'll validate against the larger of either tree while this
+ * is going on.
+ */
+ maxlevel = max_t(unsigned int, maxlevel,
+ pag->pagf_repair_refcount_level);
+#endif
+ if (level >= maxlevel)
return __this_address;
} else if (level >= mp->m_refc_maxlevels)
return __this_address;
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index fbb0b2637463..76bf7f48cb5a 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -2567,7 +2567,7 @@ __xfs_rmap_add(
ri->ri_bmap = *bmap;
xfs_rmap_update_get_group(tp->t_mountp, ri);
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_RMAP, &ri->ri_list);
+ xfs_defer_add(tp, &ri->ri_list, &xfs_rmap_update_defer_type);
}
/* Map an extent into a file. */
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c
index c269d704314d..e31663cb7b43 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.c
+++ b/fs/xfs/libxfs/xfs_rtbitmap.c
@@ -184,7 +184,7 @@ xfs_rtfind_back(
* Calculate first (leftmost) bit number to look at,
* and mask for all the relevant bits in this word.
*/
- firstbit = XFS_RTMAX((xfs_srtblock_t)(bit - len + 1), 0);
+ firstbit = max_t(xfs_srtblock_t, bit - len + 1, 0);
mask = (((xfs_rtword_t)1 << (bit - firstbit + 1)) - 1) <<
firstbit;
/*
@@ -195,7 +195,7 @@ xfs_rtfind_back(
/*
* Different. Mark where we are and return.
*/
- i = bit - XFS_RTHIBIT(wdiff);
+ i = bit - xfs_highbit32(wdiff);
*rtx = start - i + 1;
return 0;
}
@@ -233,7 +233,7 @@ xfs_rtfind_back(
/*
* Different, mark where we are and return.
*/
- i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ i += XFS_NBWORD - 1 - xfs_highbit32(wdiff);
*rtx = start - i + 1;
return 0;
}
@@ -272,7 +272,7 @@ xfs_rtfind_back(
/*
* Different, mark where we are and return.
*/
- i += XFS_NBWORD - 1 - XFS_RTHIBIT(wdiff);
+ i += XFS_NBWORD - 1 - xfs_highbit32(wdiff);
*rtx = start - i + 1;
return 0;
} else
@@ -338,7 +338,7 @@ xfs_rtfind_forw(
* Calculate last (rightmost) bit number to look at,
* and mask for all the relevant bits in this word.
*/
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ lastbit = min(bit + len, XFS_NBWORD);
mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
/*
* Calculate the difference between the value there
@@ -348,7 +348,7 @@ xfs_rtfind_forw(
/*
* Different. Mark where we are and return.
*/
- i = XFS_RTLOBIT(wdiff) - bit;
+ i = xfs_lowbit32(wdiff) - bit;
*rtx = start + i - 1;
return 0;
}
@@ -386,7 +386,7 @@ xfs_rtfind_forw(
/*
* Different, mark where we are and return.
*/
- i += XFS_RTLOBIT(wdiff);
+ i += xfs_lowbit32(wdiff);
*rtx = start + i - 1;
return 0;
}
@@ -423,7 +423,7 @@ xfs_rtfind_forw(
/*
* Different, mark where we are and return.
*/
- i += XFS_RTLOBIT(wdiff);
+ i += xfs_lowbit32(wdiff);
*rtx = start + i - 1;
return 0;
} else
@@ -452,71 +452,59 @@ xfs_trans_log_rtsummary(
}
/*
- * Read and/or modify the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
- *
- * Summary information is returned in *sum if specified.
- * If no delta is specified, returns summary only.
+ * Modify the summary information for a given extent size, bitmap block
+ * combination.
*/
int
-xfs_rtmodify_summary_int(
+xfs_rtmodify_summary(
struct xfs_rtalloc_args *args,
int log, /* log2 of extent size */
xfs_fileoff_t bbno, /* bitmap block number */
- int delta, /* change to make to summary info */
- xfs_suminfo_t *sum) /* out: summary info for this block */
+ int delta) /* in/out: summary block number */
{
struct xfs_mount *mp = args->mp;
- int error;
- xfs_fileoff_t sb; /* summary fsblock */
- xfs_rtsumoff_t so; /* index into the summary file */
+ xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno);
unsigned int infoword;
+ xfs_suminfo_t val;
+ int error;
- /*
- * Compute entry number in the summary file.
- */
- so = xfs_rtsumoffs(mp, log, bbno);
- /*
- * Compute the block number in the summary file.
- */
- sb = xfs_rtsumoffs_to_block(mp, so);
-
- error = xfs_rtsummary_read_buf(args, sb);
+ error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so));
if (error)
return error;
- /*
- * Point to the summary information, modify/log it, and/or copy it out.
- */
infoword = xfs_rtsumoffs_to_infoword(mp, so);
- if (delta) {
- xfs_suminfo_t val = xfs_suminfo_add(args, infoword, delta);
-
- if (mp->m_rsum_cache) {
- if (val == 0 && log + 1 == mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log;
- if (val != 0 && log >= mp->m_rsum_cache[bbno])
- mp->m_rsum_cache[bbno] = log + 1;
- }
- xfs_trans_log_rtsummary(args, infoword);
- if (sum)
- *sum = val;
- } else if (sum) {
- *sum = xfs_suminfo_get(args, infoword);
+ val = xfs_suminfo_add(args, infoword, delta);
+
+ if (mp->m_rsum_cache) {
+ if (val == 0 && log + 1 == mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log;
+ if (val != 0 && log >= mp->m_rsum_cache[bbno])
+ mp->m_rsum_cache[bbno] = log + 1;
}
+
+ xfs_trans_log_rtsummary(args, infoword);
return 0;
}
+/*
+ * Read and return the summary information for a given extent size, bitmap block
+ * combination.
+ */
int
-xfs_rtmodify_summary(
+xfs_rtget_summary(
struct xfs_rtalloc_args *args,
int log, /* log2 of extent size */
xfs_fileoff_t bbno, /* bitmap block number */
- int delta) /* in/out: summary block number */
+ xfs_suminfo_t *sum) /* out: summary info for this block */
{
- return xfs_rtmodify_summary_int(args, log, bbno, delta, NULL);
+ struct xfs_mount *mp = args->mp;
+ xfs_rtsumoff_t so = xfs_rtsumoffs(mp, log, bbno);
+ int error;
+
+ error = xfs_rtsummary_read_buf(args, xfs_rtsumoffs_to_block(mp, so));
+ if (!error)
+ *sum = xfs_suminfo_get(args, xfs_rtsumoffs_to_infoword(mp, so));
+ return error;
}
/* Log rtbitmap block from the word @from to the byte before @next. */
@@ -585,7 +573,7 @@ xfs_rtmodify_range(
/*
* Compute first bit not changed and mask of relevant bits.
*/
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ lastbit = min(bit + len, XFS_NBWORD);
mask = (((xfs_rtword_t)1 << (lastbit - bit)) - 1) << bit;
/*
* Set/clear the active bits.
@@ -720,7 +708,7 @@ xfs_rtfree_range(
*/
if (preblock < start) {
error = xfs_rtmodify_summary(args,
- XFS_RTBLOCKLOG(start - preblock),
+ xfs_highbit64(start - preblock),
xfs_rtx_to_rbmblock(mp, preblock), -1);
if (error) {
return error;
@@ -732,7 +720,7 @@ xfs_rtfree_range(
*/
if (postblock > end) {
error = xfs_rtmodify_summary(args,
- XFS_RTBLOCKLOG(postblock - end),
+ xfs_highbit64(postblock - end),
xfs_rtx_to_rbmblock(mp, end + 1), -1);
if (error) {
return error;
@@ -743,7 +731,7 @@ xfs_rtfree_range(
* (new) free extent.
*/
return xfs_rtmodify_summary(args,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ xfs_highbit64(postblock + 1 - preblock),
xfs_rtx_to_rbmblock(mp, preblock), 1);
}
@@ -799,7 +787,7 @@ xfs_rtcheck_range(
/*
* Compute first bit not examined.
*/
- lastbit = XFS_RTMIN(bit + len, XFS_NBWORD);
+ lastbit = min(bit + len, XFS_NBWORD);
/*
* Mask of relevant bits.
*/
@@ -812,7 +800,7 @@ xfs_rtcheck_range(
/*
* Different, compute first wrong bit and return.
*/
- i = XFS_RTLOBIT(wdiff) - bit;
+ i = xfs_lowbit32(wdiff) - bit;
*new = start + i;
*stat = 0;
return 0;
@@ -851,7 +839,7 @@ xfs_rtcheck_range(
/*
* Different, compute first wrong bit and return.
*/
- i += XFS_RTLOBIT(wdiff);
+ i += xfs_lowbit32(wdiff);
*new = start + i;
*stat = 0;
return 0;
@@ -889,7 +877,7 @@ xfs_rtcheck_range(
/*
* Different, compute first wrong bit and return.
*/
- i += XFS_RTLOBIT(wdiff);
+ i += xfs_lowbit32(wdiff);
*new = start + i;
*stat = 0;
return 0;
diff --git a/fs/xfs/libxfs/xfs_rtbitmap.h b/fs/xfs/libxfs/xfs_rtbitmap.h
index c0637057d69c..152a66750af5 100644
--- a/fs/xfs/libxfs/xfs_rtbitmap.h
+++ b/fs/xfs/libxfs/xfs_rtbitmap.h
@@ -321,8 +321,8 @@ int xfs_rtfind_forw(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
xfs_rtxnum_t limit, xfs_rtxnum_t *rtblock);
int xfs_rtmodify_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
xfs_rtxlen_t len, int val);
-int xfs_rtmodify_summary_int(struct xfs_rtalloc_args *args, int log,
- xfs_fileoff_t bbno, int delta, xfs_suminfo_t *sum);
+int xfs_rtget_summary(struct xfs_rtalloc_args *args, int log,
+ xfs_fileoff_t bbno, xfs_suminfo_t *sum);
int xfs_rtmodify_summary(struct xfs_rtalloc_args *args, int log,
xfs_fileoff_t bbno, int delta);
int xfs_rtfree_range(struct xfs_rtalloc_args *args, xfs_rtxnum_t start,
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 1f74d0cd1618..5bb6e2bd6dee 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -25,6 +25,7 @@
#include "xfs_da_format.h"
#include "xfs_health.h"
#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
/*
* Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -508,8 +509,9 @@ xfs_validate_sb_common(
rbmblocks = howmany_64(sbp->sb_rextents,
NBBY * sbp->sb_blocksize);
- if (sbp->sb_rextents != rexts ||
- sbp->sb_rextslog != xfs_highbit32(sbp->sb_rextents) ||
+ if (!xfs_validate_rtextents(rexts) ||
+ sbp->sb_rextents != rexts ||
+ sbp->sb_rextslog != xfs_compute_rextslog(rexts) ||
sbp->sb_rbmblocks != rbmblocks) {
xfs_notice(mp,
"realtime geometry sanity check failed");
@@ -1375,3 +1377,17 @@ xfs_validate_stripe_geometry(
}
return true;
}
+
+/*
+ * Compute the maximum level number of the realtime summary file, as defined by
+ * mkfs. The historic use of highbit32 on a 64-bit quantity prohibited correct
+ * use of rt volumes with more than 2^32 extents.
+ */
+uint8_t
+xfs_compute_rextslog(
+ xfs_rtbxlen_t rtextents)
+{
+ if (!rtextents)
+ return 0;
+ return xfs_highbit64(rtextents);
+}
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 19134b23c10b..2e8e8d63d4eb 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -38,4 +38,6 @@ extern int xfs_sb_get_secondary(struct xfs_mount *mp,
extern bool xfs_validate_stripe_geometry(struct xfs_mount *mp,
__s64 sunit, __s64 swidth, int sectorsize, bool silent);
+uint8_t xfs_compute_rextslog(xfs_rtbxlen_t rtextents);
+
#endif /* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index c4381388c0c1..4220d3584c1b 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -139,7 +139,7 @@ bool xfs_symlink_hdr_ok(xfs_ino_t ino, uint32_t offset,
uint32_t size, struct xfs_buf *bp);
void xfs_symlink_local_to_remote(struct xfs_trans *tp, struct xfs_buf *bp,
struct xfs_inode *ip, struct xfs_ifork *ifp);
-xfs_failaddr_t xfs_symlink_shortform_verify(struct xfs_inode *ip);
+xfs_failaddr_t xfs_symlink_shortform_verify(void *sfp, int64_t size);
/* Computed inode geometry for the filesystem. */
struct xfs_ino_geometry {
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index bdc777b9ec4a..160aa20aa441 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -175,7 +175,7 @@ xfs_symlink_local_to_remote(
if (!xfs_has_crc(mp)) {
bp->b_ops = NULL;
- memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
+ memcpy(bp->b_addr, ifp->if_data, ifp->if_bytes);
xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
return;
}
@@ -191,7 +191,7 @@ xfs_symlink_local_to_remote(
buf = bp->b_addr;
buf += xfs_symlink_hdr_set(mp, ip->i_ino, 0, ifp->if_bytes, bp);
- memcpy(buf, ifp->if_u1.if_data, ifp->if_bytes);
+ memcpy(buf, ifp->if_data, ifp->if_bytes);
xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsymlink_hdr) +
ifp->if_bytes - 1);
}
@@ -202,15 +202,11 @@ xfs_symlink_local_to_remote(
*/
xfs_failaddr_t
xfs_symlink_shortform_verify(
- struct xfs_inode *ip)
+ void *sfp,
+ int64_t size)
{
- struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
- char *sfp = (char *)ifp->if_u1.if_data;
- int size = ifp->if_bytes;
char *endp = sfp + size;
- ASSERT(ifp->if_format == XFS_DINODE_FMT_LOCAL);
-
/*
* Zero length symlinks should never occur in memory as they are
* never allowed to exist on disk.
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 533200c4ccc2..62e02d5380ad 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -51,7 +51,6 @@ typedef void * xfs_failaddr_t;
#define NULLRFSBLOCK ((xfs_rfsblock_t)-1)
#define NULLRTBLOCK ((xfs_rtblock_t)-1)
#define NULLFILEOFF ((xfs_fileoff_t)-1)
-#define NULLRTEXTNO ((xfs_rtxnum_t)-1)
#define NULLAGBLOCK ((xfs_agblock_t)-1)
#define NULLAGNUMBER ((xfs_agnumber_t)-1)
@@ -208,6 +207,13 @@ enum xfs_ag_resv_type {
XFS_AG_RESV_AGFL,
XFS_AG_RESV_METADATA,
XFS_AG_RESV_RMAPBT,
+
+ /*
+ * Don't increase fdblocks when freeing extent. This is a pony for
+ * the bnobt repair functions to re-free the free space without
+ * altering fdblocks. If you think you need this you're wrong.
+ */
+ XFS_AG_RESV_IGNORE,
};
/* Results of scanning a btree keyspace to check occupancy. */
@@ -245,4 +251,16 @@ bool xfs_verify_fileoff(struct xfs_mount *mp, xfs_fileoff_t off);
bool xfs_verify_fileext(struct xfs_mount *mp, xfs_fileoff_t off,
xfs_fileoff_t len);
+/* Do we support an rt volume having this number of rtextents? */
+static inline bool
+xfs_validate_rtextents(
+ xfs_rtbxlen_t rtextents)
+{
+ /* No runt rt volumes */
+ if (rtextents == 0)
+ return false;
+
+ return true;
+}
+
#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/scrub/agb_bitmap.c b/fs/xfs/scrub/agb_bitmap.c
new file mode 100644
index 000000000000..573e4e062754
--- /dev/null
+++ b/fs/xfs/scrub/agb_bitmap.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_shared.h"
+#include "xfs_bit.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "bitmap.h"
+#include "scrub/agb_bitmap.h"
+
+/*
+ * Record all btree blocks seen while iterating all records of a btree.
+ *
+ * We know that the btree query_all function starts at the left edge and walks
+ * towards the right edge of the tree. Therefore, we know that we can walk up
+ * the btree cursor towards the root; if the pointer for a given level points
+ * to the first record/key in that block, we haven't seen this block before;
+ * and therefore we need to remember that we saw this block in the btree.
+ *
+ * So if our btree is:
+ *
+ * 4
+ * / | \
+ * 1 2 3
+ *
+ * Pretend for this example that each leaf block has 100 btree records. For
+ * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we
+ * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so
+ * we record block 4. The list is [1, 4].
+ *
+ * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit
+ * the loop. The list remains [1, 4].
+ *
+ * For the 101st btree record, we've moved onto leaf block 2. Now
+ * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that
+ * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2].
+ *
+ * For the 102nd record, bc_levels[0].ptr == 2, so we continue.
+ *
+ * For the 201st record, we've moved on to leaf block 3.
+ * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3].
+ *
+ * For the 300th record we just exit, with the list being [1, 4, 2, 3].
+ */
+
+/* Mark a btree block to the agblock bitmap. */
+STATIC int
+xagb_bitmap_visit_btblock(
+ struct xfs_btree_cur *cur,
+ int level,
+ void *priv)
+{
+ struct xagb_bitmap *bitmap = priv;
+ struct xfs_buf *bp;
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t agbno;
+
+ xfs_btree_get_block(cur, level, &bp);
+ if (!bp)
+ return 0;
+
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
+ agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+
+ return xagb_bitmap_set(bitmap, agbno, 1);
+}
+
+/* Mark all (per-AG) btree blocks in the agblock bitmap. */
+int
+xagb_bitmap_set_btblocks(
+ struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur)
+{
+ return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock,
+ XFS_BTREE_VISIT_ALL, bitmap);
+}
+
+/*
+ * Record all the buffers pointed to by the btree cursor. Callers already
+ * engaged in a btree walk should call this function to capture the list of
+ * blocks going from the leaf towards the root.
+ */
+int
+xagb_bitmap_set_btcur_path(
+ struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur)
+{
+ int i;
+ int error;
+
+ for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
+ error = xagb_bitmap_visit_btblock(cur, i, bitmap);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/agb_bitmap.h b/fs/xfs/scrub/agb_bitmap.h
new file mode 100644
index 000000000000..ed08f76ff4f3
--- /dev/null
+++ b/fs/xfs/scrub/agb_bitmap.h
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_AGB_BITMAP_H__
+#define __XFS_SCRUB_AGB_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_agblock_t */
+
+struct xagb_bitmap {
+ struct xbitmap32 agbitmap;
+};
+
+static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap)
+{
+ xbitmap32_init(&bitmap->agbitmap);
+}
+
+static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap)
+{
+ xbitmap32_destroy(&bitmap->agbitmap);
+}
+
+static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap,
+ xfs_agblock_t start, xfs_extlen_t len)
+{
+ return xbitmap32_clear(&bitmap->agbitmap, start, len);
+}
+static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap,
+ xfs_agblock_t start, xfs_extlen_t len)
+{
+ return xbitmap32_set(&bitmap->agbitmap, start, len);
+}
+
+static inline bool xagb_bitmap_test(struct xagb_bitmap *bitmap,
+ xfs_agblock_t start, xfs_extlen_t *len)
+{
+ return xbitmap32_test(&bitmap->agbitmap, start, len);
+}
+
+static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap,
+ struct xagb_bitmap *sub)
+{
+ return xbitmap32_disunion(&bitmap->agbitmap, &sub->agbitmap);
+}
+
+static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap)
+{
+ return xbitmap32_hweight(&bitmap->agbitmap);
+}
+static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap)
+{
+ return xbitmap32_empty(&bitmap->agbitmap);
+}
+
+static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
+ xbitmap32_walk_fn fn, void *priv)
+{
+ return xbitmap32_walk(&bitmap->agbitmap, fn, priv);
+}
+
+int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur);
+int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
+ struct xfs_btree_cur *cur);
+
+#endif /* __XFS_SCRUB_AGB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/agheader_repair.c b/fs/xfs/scrub/agheader_repair.c
index 876a2f41b063..26bd1ff68f1b 100644
--- a/fs/xfs/scrub/agheader_repair.c
+++ b/fs/xfs/scrub/agheader_repair.c
@@ -26,6 +26,7 @@
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
#include "scrub/reap.h"
/* Superblock */
@@ -72,7 +73,7 @@ xrep_superblock(
/* Write this to disk. */
xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_SB_BUF);
xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
- return error;
+ return 0;
}
/* AGF */
@@ -341,7 +342,7 @@ xrep_agf_commit_new(
pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
- return 0;
+ return xrep_roll_ag_trans(sc);
}
/* Repair the AGF. v5 filesystems only. */
@@ -494,12 +495,11 @@ xrep_agfl_walk_rmap(
/* Strike out the blocks that are cross-linked according to the rmapbt. */
STATIC int
xrep_agfl_check_extent(
- uint64_t start,
- uint64_t len,
+ uint32_t agbno,
+ uint32_t len,
void *priv)
{
struct xrep_agfl *ra = priv;
- xfs_agblock_t agbno = start;
xfs_agblock_t last_agbno = agbno + len - 1;
int error;
@@ -647,8 +647,8 @@ struct xrep_agfl_fill {
/* Fill the AGFL with whatever blocks are in this extent. */
static int
xrep_agfl_fill(
- uint64_t start,
- uint64_t len,
+ uint32_t start,
+ uint32_t len,
void *priv)
{
struct xrep_agfl_fill *af = priv;
@@ -789,6 +789,9 @@ xrep_agfl(
/* Dump any AGFL overflow. */
error = xrep_reap_agblocks(sc, &agfl_extents, &XFS_RMAP_OINFO_AG,
XFS_AG_RESV_AGFL);
+ if (error)
+ goto err;
+
err:
xagb_bitmap_destroy(&agfl_extents);
return error;
@@ -962,7 +965,7 @@ xrep_agi_commit_new(
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
set_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
- return 0;
+ return xrep_roll_ag_trans(sc);
}
/* Repair the AGI. */
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 279af72b1671..d1b8a4997dd2 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -9,13 +9,16 @@
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
+#include "xfs_ag.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
-#include "xfs_ag.h"
+#include "scrub/repair.h"
/*
* Set us up to scrub free space btrees.
@@ -24,10 +27,19 @@ int
xchk_setup_ag_allocbt(
struct xfs_scrub *sc)
{
+ int error;
+
if (xchk_need_intent_drain(sc))
xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN);
- return xchk_setup_ag_btree(sc, false);
+ error = xchk_setup_ag_btree(sc, false);
+ if (error)
+ return error;
+
+ if (xchk_could_repair(sc))
+ return xrep_setup_ag_allocbt(sc);
+
+ return 0;
}
/* Free space btree scrubber. */
@@ -127,7 +139,7 @@ xchk_allocbt_rec(
struct xchk_alloc *ca = bs->private;
xfs_alloc_btrec_to_irec(rec, &irec);
- if (xfs_alloc_check_irec(bs->cur, &irec) != NULL) {
+ if (xfs_alloc_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
@@ -138,31 +150,27 @@ xchk_allocbt_rec(
return 0;
}
-/* Scrub the freespace btrees for some AG. */
-STATIC int
+/* Scrub one of the freespace btrees for some AG. */
+int
xchk_allocbt(
- struct xfs_scrub *sc,
- xfs_btnum_t which)
+ struct xfs_scrub *sc)
{
struct xchk_alloc ca = { };
struct xfs_btree_cur *cur;
- cur = which == XFS_BTNUM_BNO ? sc->sa.bno_cur : sc->sa.cnt_cur;
- return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca);
-}
-
-int
-xchk_bnobt(
- struct xfs_scrub *sc)
-{
- return xchk_allocbt(sc, XFS_BTNUM_BNO);
-}
+ switch (sc->sm->sm_type) {
+ case XFS_SCRUB_TYPE_BNOBT:
+ cur = sc->sa.bno_cur;
+ break;
+ case XFS_SCRUB_TYPE_CNTBT:
+ cur = sc->sa.cnt_cur;
+ break;
+ default:
+ ASSERT(0);
+ return -EIO;
+ }
-int
-xchk_cntbt(
- struct xfs_scrub *sc)
-{
- return xchk_allocbt(sc, XFS_BTNUM_CNT);
+ return xchk_btree(sc, cur, xchk_allocbt_rec, &XFS_RMAP_OINFO_AG, &ca);
}
/* xref check that the extent is not free */
diff --git a/fs/xfs/scrub/alloc_repair.c b/fs/xfs/scrub/alloc_repair.c
new file mode 100644
index 000000000000..45edda096869
--- /dev/null
+++ b/fs/xfs/scrub/alloc_repair.c
@@ -0,0 +1,934 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_inode.h"
+#include "xfs_refcount.h"
+#include "xfs_extent_busy.h"
+#include "xfs_health.h"
+#include "xfs_bmap.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Free Space Btree Repair
+ * =======================
+ *
+ * The reverse mappings are supposed to record all space usage for the entire
+ * AG. Therefore, we can recreate the free extent records in an AG by looking
+ * for gaps in the physical extents recorded in the rmapbt. These records are
+ * staged in @free_records. Identifying the gaps is more difficult on a
+ * reflink filesystem because rmap records are allowed to overlap.
+ *
+ * Because the final step of building a new index is to free the space used by
+ * the old index, repair needs to find that space. Unfortunately, all
+ * structures that live in the free space (bnobt, cntbt, rmapbt, agfl) share
+ * the same rmapbt owner code (OWN_AG), so this is not straightforward.
+ *
+ * The scan of the reverse mapping information records the space used by OWN_AG
+ * in @old_allocbt_blocks, which (at this stage) is somewhat misnamed. While
+ * walking the rmapbt records, we create a second bitmap @not_allocbt_blocks to
+ * record all visited rmap btree blocks and all blocks owned by the AGFL.
+ *
+ * After that is where the definitions of old_allocbt_blocks shifts. This
+ * expression identifies possible former bnobt/cntbt blocks:
+ *
+ * (OWN_AG blocks) & ~(rmapbt blocks | agfl blocks);
+ *
+ * Substituting from above definitions, that becomes:
+ *
+ * old_allocbt_blocks & ~not_allocbt_blocks
+ *
+ * The OWN_AG bitmap itself isn't needed after this point, so what we really do
+ * instead is:
+ *
+ * old_allocbt_blocks &= ~not_allocbt_blocks;
+ *
+ * After this point, @old_allocbt_blocks is a bitmap of alleged former
+ * bnobt/cntbt blocks. The xagb_bitmap_disunion operation modifies its first
+ * parameter in place to avoid copying records around.
+ *
+ * Next, some of the space described by @free_records are diverted to the newbt
+ * reservation and used to format new btree blocks. The remaining records are
+ * written to the new btree indices. We reconstruct both bnobt and cntbt at
+ * the same time since we've already done all the work.
+ *
+ * We use the prefix 'xrep_abt' here because we regenerate both free space
+ * allocation btrees at the same time.
+ */
+
+struct xrep_abt {
+ /* Blocks owned by the rmapbt or the agfl. */
+ struct xagb_bitmap not_allocbt_blocks;
+
+ /* All OWN_AG blocks. */
+ struct xagb_bitmap old_allocbt_blocks;
+
+ /*
+ * New bnobt information. All btree block reservations are added to
+ * the reservation list in new_bnobt.
+ */
+ struct xrep_newbt new_bnobt;
+
+ /* new cntbt information */
+ struct xrep_newbt new_cntbt;
+
+ /* Free space extents. */
+ struct xfarray *free_records;
+
+ struct xfs_scrub *sc;
+
+ /* Number of non-null records in @free_records. */
+ uint64_t nr_real_records;
+
+ /* get_records()'s position in the free space record array. */
+ xfarray_idx_t array_cur;
+
+ /*
+ * Next block we anticipate seeing in the rmap records. If the next
+ * rmap record is greater than next_agbno, we have found unused space.
+ */
+ xfs_agblock_t next_agbno;
+
+ /* Number of free blocks in this AG. */
+ xfs_agblock_t nr_blocks;
+
+ /* Longest free extent we found in the AG. */
+ xfs_agblock_t longest;
+};
+
+/* Set up to repair AG free space btrees. */
+int
+xrep_setup_ag_allocbt(
+ struct xfs_scrub *sc)
+{
+ unsigned int busy_gen;
+
+ /*
+ * Make sure the busy extent list is clear because we can't put extents
+ * on there twice.
+ */
+ busy_gen = READ_ONCE(sc->sa.pag->pagb_gen);
+ if (xfs_extent_busy_list_empty(sc->sa.pag))
+ return 0;
+
+ return xfs_extent_busy_flush(sc->tp, sc->sa.pag, busy_gen, 0);
+}
+
+/* Check for any obvious conflicts in the free extent. */
+STATIC int
+xrep_abt_check_free_ext(
+ struct xfs_scrub *sc,
+ const struct xfs_alloc_rec_incore *rec)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (xfs_alloc_check_irec(sc->sa.pag, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ /* Must not be an inode chunk. */
+ error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
+ rec->ar_startblock, rec->ar_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ /* Must not be shared or CoW staging. */
+ if (sc->sa.refc_cur) {
+ error = xfs_refcount_has_records(sc->sa.refc_cur,
+ XFS_REFC_DOMAIN_SHARED, rec->ar_startblock,
+ rec->ar_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ error = xfs_refcount_has_records(sc->sa.refc_cur,
+ XFS_REFC_DOMAIN_COW, rec->ar_startblock,
+ rec->ar_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/*
+ * Stash a free space record for all the space since the last bno we found
+ * all the way up to @end.
+ */
+static int
+xrep_abt_stash(
+ struct xrep_abt *ra,
+ xfs_agblock_t end)
+{
+ struct xfs_alloc_rec_incore arec = {
+ .ar_startblock = ra->next_agbno,
+ .ar_blockcount = end - ra->next_agbno,
+ };
+ struct xfs_scrub *sc = ra->sc;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ error = xrep_abt_check_free_ext(ra->sc, &arec);
+ if (error)
+ return error;
+
+ trace_xrep_abt_found(sc->mp, sc->sa.pag->pag_agno, &arec);
+
+ error = xfarray_append(ra->free_records, &arec);
+ if (error)
+ return error;
+
+ ra->nr_blocks += arec.ar_blockcount;
+ return 0;
+}
+
+/* Record extents that aren't in use from gaps in the rmap records. */
+STATIC int
+xrep_abt_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_abt *ra = priv;
+ int error;
+
+ /* Record all the OWN_AG blocks... */
+ if (rec->rm_owner == XFS_RMAP_OWN_AG) {
+ error = xagb_bitmap_set(&ra->old_allocbt_blocks,
+ rec->rm_startblock, rec->rm_blockcount);
+ if (error)
+ return error;
+ }
+
+ /* ...and all the rmapbt blocks... */
+ error = xagb_bitmap_set_btcur_path(&ra->not_allocbt_blocks, cur);
+ if (error)
+ return error;
+
+ /* ...and all the free space. */
+ if (rec->rm_startblock > ra->next_agbno) {
+ error = xrep_abt_stash(ra, rec->rm_startblock);
+ if (error)
+ return error;
+ }
+
+ /*
+ * rmap records can overlap on reflink filesystems, so project
+ * next_agbno as far out into the AG space as we currently know about.
+ */
+ ra->next_agbno = max_t(xfs_agblock_t, ra->next_agbno,
+ rec->rm_startblock + rec->rm_blockcount);
+ return 0;
+}
+
+/* Collect an AGFL block for the not-to-release list. */
+static int
+xrep_abt_walk_agfl(
+ struct xfs_mount *mp,
+ xfs_agblock_t agbno,
+ void *priv)
+{
+ struct xrep_abt *ra = priv;
+
+ return xagb_bitmap_set(&ra->not_allocbt_blocks, agbno, 1);
+}
+
+/*
+ * Compare two free space extents by block number. We want to sort in order of
+ * increasing block number.
+ */
+static int
+xrep_bnobt_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_alloc_rec_incore *ap = a;
+ const struct xfs_alloc_rec_incore *bp = b;
+
+ if (ap->ar_startblock > bp->ar_startblock)
+ return 1;
+ else if (ap->ar_startblock < bp->ar_startblock)
+ return -1;
+ return 0;
+}
+
+/*
+ * Re-sort the free extents by block number so that we can put the records into
+ * the bnobt in the correct order. Make sure the records do not overlap in
+ * physical space.
+ */
+STATIC int
+xrep_bnobt_sort_records(
+ struct xrep_abt *ra)
+{
+ struct xfs_alloc_rec_incore arec;
+ xfarray_idx_t cur = XFARRAY_CURSOR_INIT;
+ xfs_agblock_t next_agbno = 0;
+ int error;
+
+ error = xfarray_sort(ra->free_records, xrep_bnobt_extent_cmp, 0);
+ if (error)
+ return error;
+
+ while ((error = xfarray_iter(ra->free_records, &cur, &arec)) == 1) {
+ if (arec.ar_startblock < next_agbno)
+ return -EFSCORRUPTED;
+
+ next_agbno = arec.ar_startblock + arec.ar_blockcount;
+ }
+
+ return error;
+}
+
+/*
+ * Compare two free space extents by length and then block number. We want
+ * to sort first in order of increasing length and then in order of increasing
+ * block number.
+ */
+static int
+xrep_cntbt_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_alloc_rec_incore *ap = a;
+ const struct xfs_alloc_rec_incore *bp = b;
+
+ if (ap->ar_blockcount > bp->ar_blockcount)
+ return 1;
+ else if (ap->ar_blockcount < bp->ar_blockcount)
+ return -1;
+ return xrep_bnobt_extent_cmp(a, b);
+}
+
+/*
+ * Sort the free extents by length so so that we can put the records into the
+ * cntbt in the correct order. Don't let userspace kill us if we're resorting
+ * after allocating btree blocks.
+ */
+STATIC int
+xrep_cntbt_sort_records(
+ struct xrep_abt *ra,
+ bool is_resort)
+{
+ return xfarray_sort(ra->free_records, xrep_cntbt_extent_cmp,
+ is_resort ? 0 : XFARRAY_SORT_KILLABLE);
+}
+
+/*
+ * Iterate all reverse mappings to find (1) the gaps between rmap records (all
+ * unowned space), (2) the OWN_AG extents (which encompass the free space
+ * btrees, the rmapbt, and the agfl), (3) the rmapbt blocks, and (4) the AGFL
+ * blocks. The free space is (1) + (2) - (3) - (4).
+ */
+STATIC int
+xrep_abt_find_freespace(
+ struct xrep_abt *ra)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ struct xfs_buf *agfl_bp;
+ xfs_agblock_t agend;
+ int error;
+
+ xagb_bitmap_init(&ra->not_allocbt_blocks);
+
+ xrep_ag_btcur_init(sc, &sc->sa);
+
+ /*
+ * Iterate all the reverse mappings to find gaps in the physical
+ * mappings, all the OWN_AG blocks, and all the rmapbt extents.
+ */
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_abt_walk_rmap, ra);
+ if (error)
+ goto err;
+
+ /* Insert a record for space between the last rmap and EOAG. */
+ agend = be32_to_cpu(agf->agf_length);
+ if (ra->next_agbno < agend) {
+ error = xrep_abt_stash(ra, agend);
+ if (error)
+ goto err;
+ }
+
+ /* Collect all the AGFL blocks. */
+ error = xfs_alloc_read_agfl(sc->sa.pag, sc->tp, &agfl_bp);
+ if (error)
+ goto err;
+
+ error = xfs_agfl_walk(mp, agf, agfl_bp, xrep_abt_walk_agfl, ra);
+ if (error)
+ goto err_agfl;
+
+ /* Compute the old bnobt/cntbt blocks. */
+ error = xagb_bitmap_disunion(&ra->old_allocbt_blocks,
+ &ra->not_allocbt_blocks);
+ if (error)
+ goto err_agfl;
+
+ ra->nr_real_records = xfarray_length(ra->free_records);
+err_agfl:
+ xfs_trans_brelse(sc->tp, agfl_bp);
+err:
+ xchk_ag_btcur_free(&sc->sa);
+ xagb_bitmap_destroy(&ra->not_allocbt_blocks);
+ return error;
+}
+
+/*
+ * We're going to use the observed free space records to reserve blocks for the
+ * new free space btrees, so we play an iterative game where we try to converge
+ * on the number of blocks we need:
+ *
+ * 1. Estimate how many blocks we'll need to store the records.
+ * 2. If the first free record has more blocks than we need, we're done.
+ * We will have to re-sort the records prior to building the cntbt.
+ * 3. If that record has exactly the number of blocks we need, null out the
+ * record. We're done.
+ * 4. Otherwise, we still need more blocks. Null out the record, subtract its
+ * length from the number of blocks we need, and go back to step 1.
+ *
+ * Fortunately, we don't have to do any transaction work to play this game, so
+ * we don't have to tear down the staging cursors.
+ */
+STATIC int
+xrep_abt_reserve_space(
+ struct xrep_abt *ra,
+ struct xfs_btree_cur *bno_cur,
+ struct xfs_btree_cur *cnt_cur,
+ bool *needs_resort)
+{
+ struct xfs_scrub *sc = ra->sc;
+ xfarray_idx_t record_nr;
+ unsigned int allocated = 0;
+ int error = 0;
+
+ record_nr = xfarray_length(ra->free_records) - 1;
+ do {
+ struct xfs_alloc_rec_incore arec;
+ uint64_t required;
+ unsigned int desired;
+ unsigned int len;
+
+ /* Compute how many blocks we'll need. */
+ error = xfs_btree_bload_compute_geometry(cnt_cur,
+ &ra->new_cntbt.bload, ra->nr_real_records);
+ if (error)
+ break;
+
+ error = xfs_btree_bload_compute_geometry(bno_cur,
+ &ra->new_bnobt.bload, ra->nr_real_records);
+ if (error)
+ break;
+
+ /* How many btree blocks do we need to store all records? */
+ required = ra->new_bnobt.bload.nr_blocks +
+ ra->new_cntbt.bload.nr_blocks;
+ ASSERT(required < INT_MAX);
+
+ /* If we've reserved enough blocks, we're done. */
+ if (allocated >= required)
+ break;
+
+ desired = required - allocated;
+
+ /* We need space but there's none left; bye! */
+ if (ra->nr_real_records == 0) {
+ error = -ENOSPC;
+ break;
+ }
+
+ /* Grab the first record from the list. */
+ error = xfarray_load(ra->free_records, record_nr, &arec);
+ if (error)
+ break;
+
+ ASSERT(arec.ar_blockcount <= UINT_MAX);
+ len = min_t(unsigned int, arec.ar_blockcount, desired);
+
+ trace_xrep_newbt_alloc_ag_blocks(sc->mp, sc->sa.pag->pag_agno,
+ arec.ar_startblock, len, XFS_RMAP_OWN_AG);
+
+ error = xrep_newbt_add_extent(&ra->new_bnobt, sc->sa.pag,
+ arec.ar_startblock, len);
+ if (error)
+ break;
+ allocated += len;
+ ra->nr_blocks -= len;
+
+ if (arec.ar_blockcount > desired) {
+ /*
+ * Record has more space than we need. The number of
+ * free records doesn't change, so shrink the free
+ * record, inform the caller that the records are no
+ * longer sorted by length, and exit.
+ */
+ arec.ar_startblock += desired;
+ arec.ar_blockcount -= desired;
+ error = xfarray_store(ra->free_records, record_nr,
+ &arec);
+ if (error)
+ break;
+
+ *needs_resort = true;
+ return 0;
+ }
+
+ /*
+ * We're going to use up the entire record, so unset it and
+ * move on to the next one. This changes the number of free
+ * records (but doesn't break the sorting order), so we must
+ * go around the loop once more to re-run _bload_init.
+ */
+ error = xfarray_unset(ra->free_records, record_nr);
+ if (error)
+ break;
+ ra->nr_real_records--;
+ record_nr--;
+ } while (1);
+
+ return error;
+}
+
+STATIC int
+xrep_abt_dispose_one(
+ struct xrep_abt *ra,
+ struct xrep_newbt_resv *resv)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ xfs_agblock_t free_agbno = resv->agbno + resv->used;
+ xfs_extlen_t free_aglen = resv->len - resv->used;
+ int error;
+
+ ASSERT(pag == resv->pag);
+
+ /* Add a deferred rmap for each extent we used. */
+ if (resv->used > 0)
+ xfs_rmap_alloc_extent(sc->tp, pag->pag_agno, resv->agbno,
+ resv->used, XFS_RMAP_OWN_AG);
+
+ /*
+ * For each reserved btree block we didn't use, add it to the free
+ * space btree. We didn't touch fdblocks when we reserved them, so
+ * we don't touch it now.
+ */
+ if (free_aglen == 0)
+ return 0;
+
+ trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
+ free_aglen, ra->new_bnobt.oinfo.oi_owner);
+
+ error = __xfs_free_extent(sc->tp, resv->pag, free_agbno, free_aglen,
+ &ra->new_bnobt.oinfo, XFS_AG_RESV_IGNORE, true);
+ if (error)
+ return error;
+
+ return xrep_defer_finish(sc);
+}
+
+/*
+ * Deal with all the space we reserved. Blocks that were allocated for the
+ * free space btrees need to have a (deferred) rmap added for the OWN_AG
+ * allocation, and blocks that didn't get used can be freed via the usual
+ * (deferred) means.
+ */
+STATIC void
+xrep_abt_dispose_reservations(
+ struct xrep_abt *ra,
+ int error)
+{
+ struct xrep_newbt_resv *resv, *n;
+
+ if (error)
+ goto junkit;
+
+ list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
+ error = xrep_abt_dispose_one(ra, resv);
+ if (error)
+ goto junkit;
+ }
+
+junkit:
+ list_for_each_entry_safe(resv, n, &ra->new_bnobt.resv_list, list) {
+ xfs_perag_put(resv->pag);
+ list_del(&resv->list);
+ kfree(resv);
+ }
+
+ xrep_newbt_cancel(&ra->new_bnobt);
+ xrep_newbt_cancel(&ra->new_cntbt);
+}
+
+/* Retrieve free space data for bulk load. */
+STATIC int
+xrep_abt_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_alloc_rec_incore *arec = &cur->bc_rec.a;
+ struct xrep_abt *ra = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ error = xfarray_load_next(ra->free_records, &ra->array_cur,
+ arec);
+ if (error)
+ return error;
+
+ ra->longest = max(ra->longest, arec->ar_blockcount);
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_abt_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_abt *ra = priv;
+
+ return xrep_newbt_claim_block(cur, &ra->new_bnobt, ptr);
+}
+
+/*
+ * Reset the AGF counters to reflect the free space btrees that we just
+ * rebuilt, then reinitialize the per-AG data.
+ */
+STATIC int
+xrep_abt_reset_counters(
+ struct xrep_abt *ra)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_agf *agf = sc->sa.agf_bp->b_addr;
+ unsigned int freesp_btreeblks = 0;
+
+ /*
+ * Compute the contribution to agf_btreeblks for the new free space
+ * btrees. This is the computed btree size minus anything we didn't
+ * use.
+ */
+ freesp_btreeblks += ra->new_bnobt.bload.nr_blocks - 1;
+ freesp_btreeblks += ra->new_cntbt.bload.nr_blocks - 1;
+
+ freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_bnobt);
+ freesp_btreeblks -= xrep_newbt_unused_blocks(&ra->new_cntbt);
+
+ /*
+ * The AGF header contains extra information related to the free space
+ * btrees, so we must update those fields here.
+ */
+ agf->agf_btreeblks = cpu_to_be32(freesp_btreeblks +
+ (be32_to_cpu(agf->agf_rmap_blocks) - 1));
+ agf->agf_freeblks = cpu_to_be32(ra->nr_blocks);
+ agf->agf_longest = cpu_to_be32(ra->longest);
+ xfs_alloc_log_agf(sc->tp, sc->sa.agf_bp, XFS_AGF_BTREEBLKS |
+ XFS_AGF_LONGEST |
+ XFS_AGF_FREEBLKS);
+
+ /*
+ * After we commit the new btree to disk, it is possible that the
+ * process to reap the old btree blocks will race with the AIL trying
+ * to checkpoint the old btree blocks into the filesystem. If the new
+ * tree is shorter than the old one, the allocbt write verifier will
+ * fail and the AIL will shut down the filesystem.
+ *
+ * To avoid this, save the old incore btree height values as the alt
+ * height values before re-initializing the perag info from the updated
+ * AGF to capture all the new values.
+ */
+ pag->pagf_repair_levels[XFS_BTNUM_BNOi] = pag->pagf_levels[XFS_BTNUM_BNOi];
+ pag->pagf_repair_levels[XFS_BTNUM_CNTi] = pag->pagf_levels[XFS_BTNUM_CNTi];
+
+ /* Reinitialize with the values we just logged. */
+ return xrep_reinit_pagf(sc);
+}
+
+/*
+ * Use the collected free space information to stage new free space btrees.
+ * If this is successful we'll return with the new btree root
+ * information logged to the repair transaction but not yet committed.
+ */
+STATIC int
+xrep_abt_build_new_trees(
+ struct xrep_abt *ra)
+{
+ struct xfs_scrub *sc = ra->sc;
+ struct xfs_btree_cur *bno_cur;
+ struct xfs_btree_cur *cnt_cur;
+ struct xfs_perag *pag = sc->sa.pag;
+ bool needs_resort = false;
+ int error;
+
+ /*
+ * Sort the free extents by length so that we can set up the free space
+ * btrees in as few extents as possible. This reduces the amount of
+ * deferred rmap / free work we have to do at the end.
+ */
+ error = xrep_cntbt_sort_records(ra, false);
+ if (error)
+ return error;
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the AG header.
+ */
+ xrep_newbt_init_bare(&ra->new_bnobt, sc);
+ xrep_newbt_init_bare(&ra->new_cntbt, sc);
+
+ ra->new_bnobt.bload.get_records = xrep_abt_get_records;
+ ra->new_cntbt.bload.get_records = xrep_abt_get_records;
+
+ ra->new_bnobt.bload.claim_block = xrep_abt_claim_block;
+ ra->new_cntbt.bload.claim_block = xrep_abt_claim_block;
+
+ /* Allocate cursors for the staged btrees. */
+ bno_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_bnobt.afake,
+ pag, XFS_BTNUM_BNO);
+ cnt_cur = xfs_allocbt_stage_cursor(sc->mp, &ra->new_cntbt.afake,
+ pag, XFS_BTNUM_CNT);
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_cur;
+
+ /* Reserve the space we'll need for the new btrees. */
+ error = xrep_abt_reserve_space(ra, bno_cur, cnt_cur, &needs_resort);
+ if (error)
+ goto err_cur;
+
+ /*
+ * If we need to re-sort the free extents by length, do so so that we
+ * can put the records into the cntbt in the correct order.
+ */
+ if (needs_resort) {
+ error = xrep_cntbt_sort_records(ra, needs_resort);
+ if (error)
+ goto err_cur;
+ }
+
+ /*
+ * Due to btree slack factors, it's possible for a new btree to be one
+ * level taller than the old btree. Update the alternate incore btree
+ * height so that we don't trip the verifiers when writing the new
+ * btree blocks to disk.
+ */
+ pag->pagf_repair_levels[XFS_BTNUM_BNOi] =
+ ra->new_bnobt.bload.btree_height;
+ pag->pagf_repair_levels[XFS_BTNUM_CNTi] =
+ ra->new_cntbt.bload.btree_height;
+
+ /* Load the free space by length tree. */
+ ra->array_cur = XFARRAY_CURSOR_INIT;
+ ra->longest = 0;
+ error = xfs_btree_bload(cnt_cur, &ra->new_cntbt.bload, ra);
+ if (error)
+ goto err_levels;
+
+ error = xrep_bnobt_sort_records(ra);
+ if (error)
+ return error;
+
+ /* Load the free space by block number tree. */
+ ra->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(bno_cur, &ra->new_bnobt.bload, ra);
+ if (error)
+ goto err_levels;
+
+ /*
+ * Install the new btrees in the AG header. After this point the old
+ * btrees are no longer accessible and the new trees are live.
+ */
+ xfs_allocbt_commit_staged_btree(bno_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(bno_cur, 0);
+ xfs_allocbt_commit_staged_btree(cnt_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(cnt_cur, 0);
+
+ /* Reset the AGF counters now that we've changed the btree shape. */
+ error = xrep_abt_reset_counters(ra);
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ xrep_abt_dispose_reservations(ra, error);
+
+ return xrep_roll_ag_trans(sc);
+
+err_levels:
+ pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
+ pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
+err_cur:
+ xfs_btree_del_cursor(cnt_cur, error);
+ xfs_btree_del_cursor(bno_cur, error);
+err_newbt:
+ xrep_abt_dispose_reservations(ra, error);
+ return error;
+}
+
+/*
+ * Now that we've logged the roots of the new btrees, invalidate all of the
+ * old blocks and free them.
+ */
+STATIC int
+xrep_abt_remove_old_trees(
+ struct xrep_abt *ra)
+{
+ struct xfs_perag *pag = ra->sc->sa.pag;
+ int error;
+
+ /* Free the old btree blocks if they're not in use. */
+ error = xrep_reap_agblocks(ra->sc, &ra->old_allocbt_blocks,
+ &XFS_RMAP_OINFO_AG, XFS_AG_RESV_IGNORE);
+ if (error)
+ return error;
+
+ /*
+ * Now that we've zapped all the old allocbt blocks we can turn off
+ * the alternate height mechanism.
+ */
+ pag->pagf_repair_levels[XFS_BTNUM_BNOi] = 0;
+ pag->pagf_repair_levels[XFS_BTNUM_CNTi] = 0;
+ return 0;
+}
+
+/* Repair the freespace btrees for some AG. */
+int
+xrep_allocbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_abt *ra;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+
+ ra = kzalloc(sizeof(struct xrep_abt), XCHK_GFP_FLAGS);
+ if (!ra)
+ return -ENOMEM;
+ ra->sc = sc;
+
+ /* We rebuild both data structures. */
+ sc->sick_mask = XFS_SICK_AG_BNOBT | XFS_SICK_AG_CNTBT;
+
+ /*
+ * Make sure the busy extent list is clear because we can't put extents
+ * on there twice. In theory we cleared this before we started, but
+ * let's not risk the filesystem.
+ */
+ if (!xfs_extent_busy_list_empty(sc->sa.pag)) {
+ error = -EDEADLOCK;
+ goto out_ra;
+ }
+
+ /* Set up enough storage to handle maximally fragmented free space. */
+ descr = xchk_xfile_ag_descr(sc, "free space records");
+ error = xfarray_create(descr, mp->m_sb.sb_agblocks / 2,
+ sizeof(struct xfs_alloc_rec_incore),
+ &ra->free_records);
+ kfree(descr);
+ if (error)
+ goto out_ra;
+
+ /* Collect the free space data and find the old btree blocks. */
+ xagb_bitmap_init(&ra->old_allocbt_blocks);
+ error = xrep_abt_find_freespace(ra);
+ if (error)
+ goto out_bitmap;
+
+ /* Rebuild the free space information. */
+ error = xrep_abt_build_new_trees(ra);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old trees. */
+ error = xrep_abt_remove_old_trees(ra);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xagb_bitmap_destroy(&ra->old_allocbt_blocks);
+ xfarray_destroy(ra->free_records);
+out_ra:
+ kfree(ra);
+ return error;
+}
+
+/* Make sure both btrees are ok after we've rebuilt them. */
+int
+xrep_revalidate_allocbt(
+ struct xfs_scrub *sc)
+{
+ __u32 old_type = sc->sm->sm_type;
+ int error;
+
+ /*
+ * We must update sm_type temporarily so that the tree-to-tree cross
+ * reference checks will work in the correct direction, and also so
+ * that tracing will report correctly if there are more errors.
+ */
+ sc->sm->sm_type = XFS_SCRUB_TYPE_BNOBT;
+ error = xchk_allocbt(sc);
+ if (error)
+ goto out;
+
+ sc->sm->sm_type = XFS_SCRUB_TYPE_CNTBT;
+ error = xchk_allocbt(sc);
+out:
+ sc->sm->sm_type = old_type;
+ return error;
+}
diff --git a/fs/xfs/scrub/attr.c b/fs/xfs/scrub/attr.c
index 6c16d9530cca..83c7feb38714 100644
--- a/fs/xfs/scrub/attr.c
+++ b/fs/xfs/scrub/attr.c
@@ -527,28 +527,23 @@ xchk_xattr_check_sf(
struct xfs_scrub *sc)
{
struct xchk_xattr_buf *ab = sc->buf;
- struct xfs_attr_shortform *sf;
- struct xfs_attr_sf_entry *sfe;
+ struct xfs_ifork *ifp = &sc->ip->i_af;
+ struct xfs_attr_sf_hdr *sf = ifp->if_data;
+ struct xfs_attr_sf_entry *sfe = xfs_attr_sf_firstentry(sf);
struct xfs_attr_sf_entry *next;
- struct xfs_ifork *ifp;
- unsigned char *end;
+ unsigned char *end = ifp->if_data + ifp->if_bytes;
int i;
int error = 0;
- ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
-
bitmap_zero(ab->usedmap, ifp->if_bytes);
- sf = (struct xfs_attr_shortform *)sc->ip->i_af.if_u1.if_data;
- end = (unsigned char *)ifp->if_u1.if_data + ifp->if_bytes;
- xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(sf->hdr));
+ xchk_xattr_set_map(sc, ab->usedmap, 0, sizeof(*sf));
- sfe = &sf->list[0];
if ((unsigned char *)sfe > end) {
xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
return 0;
}
- for (i = 0; i < sf->hdr.count; i++) {
+ for (i = 0; i < sf->count; i++) {
unsigned char *name = sfe->nameval;
unsigned char *value = &sfe->nameval[sfe->namelen];
diff --git a/fs/xfs/scrub/bitmap.c b/fs/xfs/scrub/bitmap.c
index e0c89a9a0ca0..1449bb5262d9 100644
--- a/fs/xfs/scrub/bitmap.c
+++ b/fs/xfs/scrub/bitmap.c
@@ -16,7 +16,9 @@
#include <linux/interval_tree_generic.h>
-struct xbitmap_node {
+/* u64 bitmap */
+
+struct xbitmap64_node {
struct rb_node bn_rbnode;
/* First set bit of this interval and subtree. */
@@ -39,72 +41,72 @@ struct xbitmap_node {
* forward-declare them anyway for clarity.
*/
static inline void
-xbitmap_tree_insert(struct xbitmap_node *node, struct rb_root_cached *root);
+xbitmap64_tree_insert(struct xbitmap64_node *node, struct rb_root_cached *root);
static inline void
-xbitmap_tree_remove(struct xbitmap_node *node, struct rb_root_cached *root);
+xbitmap64_tree_remove(struct xbitmap64_node *node, struct rb_root_cached *root);
-static inline struct xbitmap_node *
-xbitmap_tree_iter_first(struct rb_root_cached *root, uint64_t start,
+static inline struct xbitmap64_node *
+xbitmap64_tree_iter_first(struct rb_root_cached *root, uint64_t start,
uint64_t last);
-static inline struct xbitmap_node *
-xbitmap_tree_iter_next(struct xbitmap_node *node, uint64_t start,
+static inline struct xbitmap64_node *
+xbitmap64_tree_iter_next(struct xbitmap64_node *node, uint64_t start,
uint64_t last);
-INTERVAL_TREE_DEFINE(struct xbitmap_node, bn_rbnode, uint64_t,
- __bn_subtree_last, START, LAST, static inline, xbitmap_tree)
+INTERVAL_TREE_DEFINE(struct xbitmap64_node, bn_rbnode, uint64_t,
+ __bn_subtree_last, START, LAST, static inline, xbitmap64_tree)
/* Iterate each interval of a bitmap. Do not change the bitmap. */
-#define for_each_xbitmap_extent(bn, bitmap) \
+#define for_each_xbitmap64_extent(bn, bitmap) \
for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \
- struct xbitmap_node, bn_rbnode); \
+ struct xbitmap64_node, bn_rbnode); \
(bn) != NULL; \
(bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \
- struct xbitmap_node, bn_rbnode))
+ struct xbitmap64_node, bn_rbnode))
/* Clear a range of this bitmap. */
int
-xbitmap_clear(
- struct xbitmap *bitmap,
+xbitmap64_clear(
+ struct xbitmap64 *bitmap,
uint64_t start,
uint64_t len)
{
- struct xbitmap_node *bn;
- struct xbitmap_node *new_bn;
+ struct xbitmap64_node *bn;
+ struct xbitmap64_node *new_bn;
uint64_t last = start + len - 1;
- while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last))) {
+ while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last))) {
if (bn->bn_start < start && bn->bn_last > last) {
uint64_t old_last = bn->bn_last;
/* overlaps with the entire clearing range */
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
bn->bn_last = start - 1;
- xbitmap_tree_insert(bn, &bitmap->xb_root);
+ xbitmap64_tree_insert(bn, &bitmap->xb_root);
/* add an extent */
- new_bn = kmalloc(sizeof(struct xbitmap_node),
+ new_bn = kmalloc(sizeof(struct xbitmap64_node),
XCHK_GFP_FLAGS);
if (!new_bn)
return -ENOMEM;
new_bn->bn_start = last + 1;
new_bn->bn_last = old_last;
- xbitmap_tree_insert(new_bn, &bitmap->xb_root);
+ xbitmap64_tree_insert(new_bn, &bitmap->xb_root);
} else if (bn->bn_start < start) {
/* overlaps with the left side of the clearing range */
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
bn->bn_last = start - 1;
- xbitmap_tree_insert(bn, &bitmap->xb_root);
+ xbitmap64_tree_insert(bn, &bitmap->xb_root);
} else if (bn->bn_last > last) {
/* overlaps with the right side of the clearing range */
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
bn->bn_start = last + 1;
- xbitmap_tree_insert(bn, &bitmap->xb_root);
+ xbitmap64_tree_insert(bn, &bitmap->xb_root);
break;
} else {
/* in the middle of the clearing range */
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
kfree(bn);
}
}
@@ -114,59 +116,59 @@ xbitmap_clear(
/* Set a range of this bitmap. */
int
-xbitmap_set(
- struct xbitmap *bitmap,
+xbitmap64_set(
+ struct xbitmap64 *bitmap,
uint64_t start,
uint64_t len)
{
- struct xbitmap_node *left;
- struct xbitmap_node *right;
+ struct xbitmap64_node *left;
+ struct xbitmap64_node *right;
uint64_t last = start + len - 1;
int error;
/* Is this whole range already set? */
- left = xbitmap_tree_iter_first(&bitmap->xb_root, start, last);
+ left = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last);
if (left && left->bn_start <= start && left->bn_last >= last)
return 0;
/* Clear out everything in the range we want to set. */
- error = xbitmap_clear(bitmap, start, len);
+ error = xbitmap64_clear(bitmap, start, len);
if (error)
return error;
/* Do we have a left-adjacent extent? */
- left = xbitmap_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
+ left = xbitmap64_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
ASSERT(!left || left->bn_last + 1 == start);
/* Do we have a right-adjacent extent? */
- right = xbitmap_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
+ right = xbitmap64_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
ASSERT(!right || right->bn_start == last + 1);
if (left && right) {
/* combine left and right adjacent extent */
- xbitmap_tree_remove(left, &bitmap->xb_root);
- xbitmap_tree_remove(right, &bitmap->xb_root);
+ xbitmap64_tree_remove(left, &bitmap->xb_root);
+ xbitmap64_tree_remove(right, &bitmap->xb_root);
left->bn_last = right->bn_last;
- xbitmap_tree_insert(left, &bitmap->xb_root);
+ xbitmap64_tree_insert(left, &bitmap->xb_root);
kfree(right);
} else if (left) {
/* combine with left extent */
- xbitmap_tree_remove(left, &bitmap->xb_root);
+ xbitmap64_tree_remove(left, &bitmap->xb_root);
left->bn_last = last;
- xbitmap_tree_insert(left, &bitmap->xb_root);
+ xbitmap64_tree_insert(left, &bitmap->xb_root);
} else if (right) {
/* combine with right extent */
- xbitmap_tree_remove(right, &bitmap->xb_root);
+ xbitmap64_tree_remove(right, &bitmap->xb_root);
right->bn_start = start;
- xbitmap_tree_insert(right, &bitmap->xb_root);
+ xbitmap64_tree_insert(right, &bitmap->xb_root);
} else {
/* add an extent */
- left = kmalloc(sizeof(struct xbitmap_node), XCHK_GFP_FLAGS);
+ left = kmalloc(sizeof(struct xbitmap64_node), XCHK_GFP_FLAGS);
if (!left)
return -ENOMEM;
left->bn_start = start;
left->bn_last = last;
- xbitmap_tree_insert(left, &bitmap->xb_root);
+ xbitmap64_tree_insert(left, &bitmap->xb_root);
}
return 0;
@@ -174,21 +176,21 @@ xbitmap_set(
/* Free everything related to this bitmap. */
void
-xbitmap_destroy(
- struct xbitmap *bitmap)
+xbitmap64_destroy(
+ struct xbitmap64 *bitmap)
{
- struct xbitmap_node *bn;
+ struct xbitmap64_node *bn;
- while ((bn = xbitmap_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) {
- xbitmap_tree_remove(bn, &bitmap->xb_root);
+ while ((bn = xbitmap64_tree_iter_first(&bitmap->xb_root, 0, -1ULL))) {
+ xbitmap64_tree_remove(bn, &bitmap->xb_root);
kfree(bn);
}
}
/* Set up a per-AG block bitmap. */
void
-xbitmap_init(
- struct xbitmap *bitmap)
+xbitmap64_init(
+ struct xbitmap64 *bitmap)
{
bitmap->xb_root = RB_ROOT_CACHED;
}
@@ -208,18 +210,18 @@ xbitmap_init(
* This is the logical equivalent of bitmap &= ~sub.
*/
int
-xbitmap_disunion(
- struct xbitmap *bitmap,
- struct xbitmap *sub)
+xbitmap64_disunion(
+ struct xbitmap64 *bitmap,
+ struct xbitmap64 *sub)
{
- struct xbitmap_node *bn;
+ struct xbitmap64_node *bn;
int error;
- if (xbitmap_empty(bitmap) || xbitmap_empty(sub))
+ if (xbitmap64_empty(bitmap) || xbitmap64_empty(sub))
return 0;
- for_each_xbitmap_extent(bn, sub) {
- error = xbitmap_clear(bitmap, bn->bn_start,
+ for_each_xbitmap64_extent(bn, sub) {
+ error = xbitmap64_clear(bitmap, bn->bn_start,
bn->bn_last - bn->bn_start + 1);
if (error)
return error;
@@ -228,88 +230,273 @@ xbitmap_disunion(
return 0;
}
+/* How many bits are set in this bitmap? */
+uint64_t
+xbitmap64_hweight(
+ struct xbitmap64 *bitmap)
+{
+ struct xbitmap64_node *bn;
+ uint64_t ret = 0;
+
+ for_each_xbitmap64_extent(bn, bitmap)
+ ret += bn->bn_last - bn->bn_start + 1;
+
+ return ret;
+}
+
+/* Call a function for every run of set bits in this bitmap. */
+int
+xbitmap64_walk(
+ struct xbitmap64 *bitmap,
+ xbitmap64_walk_fn fn,
+ void *priv)
+{
+ struct xbitmap64_node *bn;
+ int error = 0;
+
+ for_each_xbitmap64_extent(bn, bitmap) {
+ error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv);
+ if (error)
+ break;
+ }
+
+ return error;
+}
+
+/* Does this bitmap have no bits set at all? */
+bool
+xbitmap64_empty(
+ struct xbitmap64 *bitmap)
+{
+ return bitmap->xb_root.rb_root.rb_node == NULL;
+}
+
+/* Is the start of the range set or clear? And for how long? */
+bool
+xbitmap64_test(
+ struct xbitmap64 *bitmap,
+ uint64_t start,
+ uint64_t *len)
+{
+ struct xbitmap64_node *bn;
+ uint64_t last = start + *len - 1;
+
+ bn = xbitmap64_tree_iter_first(&bitmap->xb_root, start, last);
+ if (!bn)
+ return false;
+ if (bn->bn_start <= start) {
+ if (bn->bn_last < last)
+ *len = bn->bn_last - start + 1;
+ return true;
+ }
+ *len = bn->bn_start - start;
+ return false;
+}
+
+/* u32 bitmap */
+
+struct xbitmap32_node {
+ struct rb_node bn_rbnode;
+
+ /* First set bit of this interval and subtree. */
+ uint32_t bn_start;
+
+ /* Last set bit of this interval. */
+ uint32_t bn_last;
+
+ /* Last set bit of this subtree. Do not touch this. */
+ uint32_t __bn_subtree_last;
+};
+
+/* Define our own interval tree type with uint32_t parameters. */
+
/*
- * Record all btree blocks seen while iterating all records of a btree.
- *
- * We know that the btree query_all function starts at the left edge and walks
- * towards the right edge of the tree. Therefore, we know that we can walk up
- * the btree cursor towards the root; if the pointer for a given level points
- * to the first record/key in that block, we haven't seen this block before;
- * and therefore we need to remember that we saw this block in the btree.
- *
- * So if our btree is:
- *
- * 4
- * / | \
- * 1 2 3
- *
- * Pretend for this example that each leaf block has 100 btree records. For
- * the first btree record, we'll observe that bc_levels[0].ptr == 1, so we
- * record that we saw block 1. Then we observe that bc_levels[1].ptr == 1, so
- * we record block 4. The list is [1, 4].
- *
- * For the second btree record, we see that bc_levels[0].ptr == 2, so we exit
- * the loop. The list remains [1, 4].
- *
- * For the 101st btree record, we've moved onto leaf block 2. Now
- * bc_levels[0].ptr == 1 again, so we record that we saw block 2. We see that
- * bc_levels[1].ptr == 2, so we exit the loop. The list is now [1, 4, 2].
- *
- * For the 102nd record, bc_levels[0].ptr == 2, so we continue.
- *
- * For the 201st record, we've moved on to leaf block 3.
- * bc_levels[0].ptr == 1, so we add 3 to the list. Now it is [1, 4, 2, 3].
- *
- * For the 300th record we just exit, with the list being [1, 4, 2, 3].
+ * These functions are defined by the INTERVAL_TREE_DEFINE macro, but we'll
+ * forward-declare them anyway for clarity.
*/
+static inline void
+xbitmap32_tree_insert(struct xbitmap32_node *node, struct rb_root_cached *root);
-/* Mark a btree block to the agblock bitmap. */
-STATIC int
-xagb_bitmap_visit_btblock(
- struct xfs_btree_cur *cur,
- int level,
- void *priv)
+static inline void
+xbitmap32_tree_remove(struct xbitmap32_node *node, struct rb_root_cached *root);
+
+static inline struct xbitmap32_node *
+xbitmap32_tree_iter_first(struct rb_root_cached *root, uint32_t start,
+ uint32_t last);
+
+static inline struct xbitmap32_node *
+xbitmap32_tree_iter_next(struct xbitmap32_node *node, uint32_t start,
+ uint32_t last);
+
+INTERVAL_TREE_DEFINE(struct xbitmap32_node, bn_rbnode, uint32_t,
+ __bn_subtree_last, START, LAST, static inline, xbitmap32_tree)
+
+/* Iterate each interval of a bitmap. Do not change the bitmap. */
+#define for_each_xbitmap32_extent(bn, bitmap) \
+ for ((bn) = rb_entry_safe(rb_first(&(bitmap)->xb_root.rb_root), \
+ struct xbitmap32_node, bn_rbnode); \
+ (bn) != NULL; \
+ (bn) = rb_entry_safe(rb_next(&(bn)->bn_rbnode), \
+ struct xbitmap32_node, bn_rbnode))
+
+/* Clear a range of this bitmap. */
+int
+xbitmap32_clear(
+ struct xbitmap32 *bitmap,
+ uint32_t start,
+ uint32_t len)
{
- struct xagb_bitmap *bitmap = priv;
- struct xfs_buf *bp;
- xfs_fsblock_t fsbno;
- xfs_agblock_t agbno;
+ struct xbitmap32_node *bn;
+ struct xbitmap32_node *new_bn;
+ uint32_t last = start + len - 1;
- xfs_btree_get_block(cur, level, &bp);
- if (!bp)
- return 0;
+ while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last))) {
+ if (bn->bn_start < start && bn->bn_last > last) {
+ uint32_t old_last = bn->bn_last;
- fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, xfs_buf_daddr(bp));
- agbno = XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno);
+ /* overlaps with the entire clearing range */
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ bn->bn_last = start - 1;
+ xbitmap32_tree_insert(bn, &bitmap->xb_root);
- return xagb_bitmap_set(bitmap, agbno, 1);
+ /* add an extent */
+ new_bn = kmalloc(sizeof(struct xbitmap32_node),
+ XCHK_GFP_FLAGS);
+ if (!new_bn)
+ return -ENOMEM;
+ new_bn->bn_start = last + 1;
+ new_bn->bn_last = old_last;
+ xbitmap32_tree_insert(new_bn, &bitmap->xb_root);
+ } else if (bn->bn_start < start) {
+ /* overlaps with the left side of the clearing range */
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ bn->bn_last = start - 1;
+ xbitmap32_tree_insert(bn, &bitmap->xb_root);
+ } else if (bn->bn_last > last) {
+ /* overlaps with the right side of the clearing range */
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ bn->bn_start = last + 1;
+ xbitmap32_tree_insert(bn, &bitmap->xb_root);
+ break;
+ } else {
+ /* in the middle of the clearing range */
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ kfree(bn);
+ }
+ }
+
+ return 0;
}
-/* Mark all (per-AG) btree blocks in the agblock bitmap. */
+/* Set a range of this bitmap. */
int
-xagb_bitmap_set_btblocks(
- struct xagb_bitmap *bitmap,
- struct xfs_btree_cur *cur)
+xbitmap32_set(
+ struct xbitmap32 *bitmap,
+ uint32_t start,
+ uint32_t len)
{
- return xfs_btree_visit_blocks(cur, xagb_bitmap_visit_btblock,
- XFS_BTREE_VISIT_ALL, bitmap);
+ struct xbitmap32_node *left;
+ struct xbitmap32_node *right;
+ uint32_t last = start + len - 1;
+ int error;
+
+ /* Is this whole range already set? */
+ left = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last);
+ if (left && left->bn_start <= start && left->bn_last >= last)
+ return 0;
+
+ /* Clear out everything in the range we want to set. */
+ error = xbitmap32_clear(bitmap, start, len);
+ if (error)
+ return error;
+
+ /* Do we have a left-adjacent extent? */
+ left = xbitmap32_tree_iter_first(&bitmap->xb_root, start - 1, start - 1);
+ ASSERT(!left || left->bn_last + 1 == start);
+
+ /* Do we have a right-adjacent extent? */
+ right = xbitmap32_tree_iter_first(&bitmap->xb_root, last + 1, last + 1);
+ ASSERT(!right || right->bn_start == last + 1);
+
+ if (left && right) {
+ /* combine left and right adjacent extent */
+ xbitmap32_tree_remove(left, &bitmap->xb_root);
+ xbitmap32_tree_remove(right, &bitmap->xb_root);
+ left->bn_last = right->bn_last;
+ xbitmap32_tree_insert(left, &bitmap->xb_root);
+ kfree(right);
+ } else if (left) {
+ /* combine with left extent */
+ xbitmap32_tree_remove(left, &bitmap->xb_root);
+ left->bn_last = last;
+ xbitmap32_tree_insert(left, &bitmap->xb_root);
+ } else if (right) {
+ /* combine with right extent */
+ xbitmap32_tree_remove(right, &bitmap->xb_root);
+ right->bn_start = start;
+ xbitmap32_tree_insert(right, &bitmap->xb_root);
+ } else {
+ /* add an extent */
+ left = kmalloc(sizeof(struct xbitmap32_node), XCHK_GFP_FLAGS);
+ if (!left)
+ return -ENOMEM;
+ left->bn_start = start;
+ left->bn_last = last;
+ xbitmap32_tree_insert(left, &bitmap->xb_root);
+ }
+
+ return 0;
+}
+
+/* Free everything related to this bitmap. */
+void
+xbitmap32_destroy(
+ struct xbitmap32 *bitmap)
+{
+ struct xbitmap32_node *bn;
+
+ while ((bn = xbitmap32_tree_iter_first(&bitmap->xb_root, 0, -1U))) {
+ xbitmap32_tree_remove(bn, &bitmap->xb_root);
+ kfree(bn);
+ }
+}
+
+/* Set up a per-AG block bitmap. */
+void
+xbitmap32_init(
+ struct xbitmap32 *bitmap)
+{
+ bitmap->xb_root = RB_ROOT_CACHED;
}
/*
- * Record all the buffers pointed to by the btree cursor. Callers already
- * engaged in a btree walk should call this function to capture the list of
- * blocks going from the leaf towards the root.
+ * Remove all the blocks mentioned in @sub from the extents in @bitmap.
+ *
+ * The intent is that callers will iterate the rmapbt for all of its records
+ * for a given owner to generate @bitmap; and iterate all the blocks of the
+ * metadata structures that are not being rebuilt and have the same rmapbt
+ * owner to generate @sub. This routine subtracts all the extents
+ * mentioned in sub from all the extents linked in @bitmap, which leaves
+ * @bitmap as the list of blocks that are not accounted for, which we assume
+ * are the dead blocks of the old metadata structure. The blocks mentioned in
+ * @bitmap can be reaped.
+ *
+ * This is the logical equivalent of bitmap &= ~sub.
*/
int
-xagb_bitmap_set_btcur_path(
- struct xagb_bitmap *bitmap,
- struct xfs_btree_cur *cur)
+xbitmap32_disunion(
+ struct xbitmap32 *bitmap,
+ struct xbitmap32 *sub)
{
- int i;
+ struct xbitmap32_node *bn;
int error;
- for (i = 0; i < cur->bc_nlevels && cur->bc_levels[i].ptr == 1; i++) {
- error = xagb_bitmap_visit_btblock(cur, i, bitmap);
+ if (xbitmap32_empty(bitmap) || xbitmap32_empty(sub))
+ return 0;
+
+ for_each_xbitmap32_extent(bn, sub) {
+ error = xbitmap32_clear(bitmap, bn->bn_start,
+ bn->bn_last - bn->bn_start + 1);
if (error)
return error;
}
@@ -318,14 +505,14 @@ xagb_bitmap_set_btcur_path(
}
/* How many bits are set in this bitmap? */
-uint64_t
-xbitmap_hweight(
- struct xbitmap *bitmap)
+uint32_t
+xbitmap32_hweight(
+ struct xbitmap32 *bitmap)
{
- struct xbitmap_node *bn;
- uint64_t ret = 0;
+ struct xbitmap32_node *bn;
+ uint32_t ret = 0;
- for_each_xbitmap_extent(bn, bitmap)
+ for_each_xbitmap32_extent(bn, bitmap)
ret += bn->bn_last - bn->bn_start + 1;
return ret;
@@ -333,15 +520,15 @@ xbitmap_hweight(
/* Call a function for every run of set bits in this bitmap. */
int
-xbitmap_walk(
- struct xbitmap *bitmap,
- xbitmap_walk_fn fn,
+xbitmap32_walk(
+ struct xbitmap32 *bitmap,
+ xbitmap32_walk_fn fn,
void *priv)
{
- struct xbitmap_node *bn;
+ struct xbitmap32_node *bn;
int error = 0;
- for_each_xbitmap_extent(bn, bitmap) {
+ for_each_xbitmap32_extent(bn, bitmap) {
error = fn(bn->bn_start, bn->bn_last - bn->bn_start + 1, priv);
if (error)
break;
@@ -352,23 +539,23 @@ xbitmap_walk(
/* Does this bitmap have no bits set at all? */
bool
-xbitmap_empty(
- struct xbitmap *bitmap)
+xbitmap32_empty(
+ struct xbitmap32 *bitmap)
{
return bitmap->xb_root.rb_root.rb_node == NULL;
}
/* Is the start of the range set or clear? And for how long? */
bool
-xbitmap_test(
- struct xbitmap *bitmap,
- uint64_t start,
- uint64_t *len)
+xbitmap32_test(
+ struct xbitmap32 *bitmap,
+ uint32_t start,
+ uint32_t *len)
{
- struct xbitmap_node *bn;
- uint64_t last = start + *len - 1;
+ struct xbitmap32_node *bn;
+ uint32_t last = start + *len - 1;
- bn = xbitmap_tree_iter_first(&bitmap->xb_root, start, last);
+ bn = xbitmap32_tree_iter_first(&bitmap->xb_root, start, last);
if (!bn)
return false;
if (bn->bn_start <= start) {
diff --git a/fs/xfs/scrub/bitmap.h b/fs/xfs/scrub/bitmap.h
index 4fe58bad6734..2df8911606d6 100644
--- a/fs/xfs/scrub/bitmap.h
+++ b/fs/xfs/scrub/bitmap.h
@@ -6,17 +6,19 @@
#ifndef __XFS_SCRUB_BITMAP_H__
#define __XFS_SCRUB_BITMAP_H__
-struct xbitmap {
+/* u64 bitmap */
+
+struct xbitmap64 {
struct rb_root_cached xb_root;
};
-void xbitmap_init(struct xbitmap *bitmap);
-void xbitmap_destroy(struct xbitmap *bitmap);
+void xbitmap64_init(struct xbitmap64 *bitmap);
+void xbitmap64_destroy(struct xbitmap64 *bitmap);
-int xbitmap_clear(struct xbitmap *bitmap, uint64_t start, uint64_t len);
-int xbitmap_set(struct xbitmap *bitmap, uint64_t start, uint64_t len);
-int xbitmap_disunion(struct xbitmap *bitmap, struct xbitmap *sub);
-uint64_t xbitmap_hweight(struct xbitmap *bitmap);
+int xbitmap64_clear(struct xbitmap64 *bitmap, uint64_t start, uint64_t len);
+int xbitmap64_set(struct xbitmap64 *bitmap, uint64_t start, uint64_t len);
+int xbitmap64_disunion(struct xbitmap64 *bitmap, struct xbitmap64 *sub);
+uint64_t xbitmap64_hweight(struct xbitmap64 *bitmap);
/*
* Return codes for the bitmap iterator functions are 0 to continue iterating,
@@ -25,84 +27,39 @@ uint64_t xbitmap_hweight(struct xbitmap *bitmap);
* iteration, because neither bitmap iterator ever generates that error code on
* its own. Callers must not modify the bitmap while walking it.
*/
-typedef int (*xbitmap_walk_fn)(uint64_t start, uint64_t len, void *priv);
-int xbitmap_walk(struct xbitmap *bitmap, xbitmap_walk_fn fn,
+typedef int (*xbitmap64_walk_fn)(uint64_t start, uint64_t len, void *priv);
+int xbitmap64_walk(struct xbitmap64 *bitmap, xbitmap64_walk_fn fn,
void *priv);
-bool xbitmap_empty(struct xbitmap *bitmap);
-bool xbitmap_test(struct xbitmap *bitmap, uint64_t start, uint64_t *len);
+bool xbitmap64_empty(struct xbitmap64 *bitmap);
+bool xbitmap64_test(struct xbitmap64 *bitmap, uint64_t start, uint64_t *len);
-/* Bitmaps, but for type-checked for xfs_agblock_t */
+/* u32 bitmap */
-struct xagb_bitmap {
- struct xbitmap agbitmap;
+struct xbitmap32 {
+ struct rb_root_cached xb_root;
};
-static inline void xagb_bitmap_init(struct xagb_bitmap *bitmap)
-{
- xbitmap_init(&bitmap->agbitmap);
-}
-
-static inline void xagb_bitmap_destroy(struct xagb_bitmap *bitmap)
-{
- xbitmap_destroy(&bitmap->agbitmap);
-}
-
-static inline int xagb_bitmap_clear(struct xagb_bitmap *bitmap,
- xfs_agblock_t start, xfs_extlen_t len)
-{
- return xbitmap_clear(&bitmap->agbitmap, start, len);
-}
-static inline int xagb_bitmap_set(struct xagb_bitmap *bitmap,
- xfs_agblock_t start, xfs_extlen_t len)
-{
- return xbitmap_set(&bitmap->agbitmap, start, len);
-}
-
-static inline bool
-xagb_bitmap_test(
- struct xagb_bitmap *bitmap,
- xfs_agblock_t start,
- xfs_extlen_t *len)
-{
- uint64_t biglen = *len;
- bool ret;
-
- ret = xbitmap_test(&bitmap->agbitmap, start, &biglen);
-
- if (start + biglen >= UINT_MAX) {
- ASSERT(0);
- biglen = UINT_MAX - start;
- }
-
- *len = biglen;
- return ret;
-}
-
-static inline int xagb_bitmap_disunion(struct xagb_bitmap *bitmap,
- struct xagb_bitmap *sub)
-{
- return xbitmap_disunion(&bitmap->agbitmap, &sub->agbitmap);
-}
+void xbitmap32_init(struct xbitmap32 *bitmap);
+void xbitmap32_destroy(struct xbitmap32 *bitmap);
-static inline uint32_t xagb_bitmap_hweight(struct xagb_bitmap *bitmap)
-{
- return xbitmap_hweight(&bitmap->agbitmap);
-}
-static inline bool xagb_bitmap_empty(struct xagb_bitmap *bitmap)
-{
- return xbitmap_empty(&bitmap->agbitmap);
-}
+int xbitmap32_clear(struct xbitmap32 *bitmap, uint32_t start, uint32_t len);
+int xbitmap32_set(struct xbitmap32 *bitmap, uint32_t start, uint32_t len);
+int xbitmap32_disunion(struct xbitmap32 *bitmap, struct xbitmap32 *sub);
+uint32_t xbitmap32_hweight(struct xbitmap32 *bitmap);
-static inline int xagb_bitmap_walk(struct xagb_bitmap *bitmap,
- xbitmap_walk_fn fn, void *priv)
-{
- return xbitmap_walk(&bitmap->agbitmap, fn, priv);
-}
+/*
+ * Return codes for the bitmap iterator functions are 0 to continue iterating,
+ * and non-zero to stop iterating. Any non-zero value will be passed up to the
+ * iteration caller. The special value -ECANCELED can be used to stop
+ * iteration, because neither bitmap iterator ever generates that error code on
+ * its own. Callers must not modify the bitmap while walking it.
+ */
+typedef int (*xbitmap32_walk_fn)(uint32_t start, uint32_t len, void *priv);
+int xbitmap32_walk(struct xbitmap32 *bitmap, xbitmap32_walk_fn fn,
+ void *priv);
-int xagb_bitmap_set_btblocks(struct xagb_bitmap *bitmap,
- struct xfs_btree_cur *cur);
-int xagb_bitmap_set_btcur_path(struct xagb_bitmap *bitmap,
- struct xfs_btree_cur *cur);
+bool xbitmap32_empty(struct xbitmap32 *bitmap);
+bool xbitmap32_test(struct xbitmap32 *bitmap, uint32_t start, uint32_t *len);
#endif /* __XFS_SCRUB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c
index 06d8c1996a33..b169cddde6da 100644
--- a/fs/xfs/scrub/bmap.c
+++ b/fs/xfs/scrub/bmap.c
@@ -19,9 +19,11 @@
#include "xfs_bmap_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/btree.h"
+#include "scrub/health.h"
#include "xfs_ag.h"
/* Set us up with an inode's bmap. */
@@ -48,9 +50,18 @@ xchk_setup_inode_bmap(
if (S_ISREG(VFS_I(sc->ip)->i_mode) &&
sc->sm->sm_type != XFS_SCRUB_TYPE_BMBTA) {
struct address_space *mapping = VFS_I(sc->ip)->i_mapping;
+ bool is_repair = xchk_could_repair(sc);
xchk_ilock(sc, XFS_MMAPLOCK_EXCL);
+ /* Break all our leases, we're going to mess with things. */
+ if (is_repair) {
+ error = xfs_break_layouts(VFS_I(sc->ip),
+ &sc->ilock_flags, BREAK_WRITE);
+ if (error)
+ goto out;
+ }
+
inode_dio_wait(VFS_I(sc->ip));
/*
@@ -71,6 +82,15 @@ xchk_setup_inode_bmap(
error = filemap_fdatawait_keep_errors(mapping);
if (error && (error != -ENOSPC && error != -EIO))
goto out;
+
+ /* Drop the page cache if we're repairing block mappings. */
+ if (is_repair) {
+ error = invalidate_inode_pages2(
+ VFS_I(sc->ip)->i_mapping);
+ if (error)
+ goto out;
+ }
+
}
/* Got the inode, lock it and we're ready to go. */
@@ -78,6 +98,10 @@ xchk_setup_inode_bmap(
if (error)
goto out;
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ goto out;
+
xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
/* scrub teardown will unlock and release the inode */
@@ -633,6 +657,82 @@ xchk_bmap_check_ag_rmaps(
}
/*
+ * Decide if we want to scan the reverse mappings to determine if the attr
+ * fork /really/ has zero space mappings.
+ */
+STATIC bool
+xchk_bmap_check_empty_attrfork(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *ifp = &ip->i_af;
+
+ /*
+ * If the dinode repair found a bad attr fork, it will reset the fork
+ * to extents format with zero records and wait for the this scrubber
+ * to reconstruct the block mappings. If the fork is not in this
+ * state, then the fork cannot have been zapped.
+ */
+ if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0)
+ return false;
+
+ /*
+ * Files can have an attr fork in EXTENTS format with zero records for
+ * several reasons:
+ *
+ * a) an attr set created a fork but ran out of space
+ * b) attr replace deleted an old attr but failed during the set step
+ * c) the data fork was in btree format when all attrs were deleted, so
+ * the fork was left in place
+ * d) the inode repair code zapped the fork
+ *
+ * Only in case (d) do we want to scan the rmapbt to see if we need to
+ * rebuild the attr fork. The fork zap code clears all DAC permission
+ * bits and zeroes the uid and gid, so avoid the scan if any of those
+ * three conditions are not met.
+ */
+ if ((VFS_I(ip)->i_mode & 0777) != 0)
+ return false;
+ if (!uid_eq(VFS_I(ip)->i_uid, GLOBAL_ROOT_UID))
+ return false;
+ if (!gid_eq(VFS_I(ip)->i_gid, GLOBAL_ROOT_GID))
+ return false;
+
+ return true;
+}
+
+/*
+ * Decide if we want to scan the reverse mappings to determine if the data
+ * fork /really/ has zero space mappings.
+ */
+STATIC bool
+xchk_bmap_check_empty_datafork(
+ struct xfs_inode *ip)
+{
+ struct xfs_ifork *ifp = &ip->i_df;
+
+ /* Don't support realtime rmap checks yet. */
+ if (XFS_IS_REALTIME_INODE(ip))
+ return false;
+
+ /*
+ * If the dinode repair found a bad data fork, it will reset the fork
+ * to extents format with zero records and wait for the this scrubber
+ * to reconstruct the block mappings. If the fork is not in this
+ * state, then the fork cannot have been zapped.
+ */
+ if (ifp->if_format != XFS_DINODE_FMT_EXTENTS || ifp->if_nextents != 0)
+ return false;
+
+ /*
+ * If we encounter an empty data fork along with evidence that the fork
+ * might not really be empty, we need to scan the reverse mappings to
+ * decide if we're going to rebuild the fork. Data forks with nonzero
+ * file size are scanned.
+ */
+ return i_size_read(VFS_I(ip)) != 0;
+}
+
+/*
* Decide if we want to walk every rmap btree in the fs to make sure that each
* rmap for this file fork has corresponding bmbt entries.
*/
@@ -641,7 +741,6 @@ xchk_bmap_want_check_rmaps(
struct xchk_bmap_info *info)
{
struct xfs_scrub *sc = info->sc;
- struct xfs_ifork *ifp;
if (!xfs_has_rmapbt(sc->mp))
return false;
@@ -650,28 +749,10 @@ xchk_bmap_want_check_rmaps(
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return false;
- /* Don't support realtime rmap checks yet. */
- if (info->is_rt)
- return false;
-
- /*
- * The inode repair code zaps broken inode forks by resetting them back
- * to EXTENTS format and zero extent records. If we encounter a fork
- * in this state along with evidence that the fork isn't supposed to be
- * empty, we need to scan the reverse mappings to decide if we're going
- * to rebuild the fork. Data forks with nonzero file size are scanned.
- * xattr forks are never empty of content, so they are always scanned.
- */
- ifp = xfs_ifork_ptr(sc->ip, info->whichfork);
- if (ifp->if_format == XFS_DINODE_FMT_EXTENTS && ifp->if_nextents == 0) {
- if (info->whichfork == XFS_DATA_FORK &&
- i_size_read(VFS_I(sc->ip)) == 0)
- return false;
-
- return true;
- }
+ if (info->whichfork == XFS_ATTR_FORK)
+ return xchk_bmap_check_empty_attrfork(sc->ip);
- return false;
+ return xchk_bmap_check_empty_datafork(sc->ip);
}
/* Make sure each rmap has a corresponding bmbt entry. */
@@ -939,7 +1020,20 @@ int
xchk_bmap_data(
struct xfs_scrub *sc)
{
- return xchk_bmap(sc, XFS_DATA_FORK);
+ int error;
+
+ if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTD_ZAPPED)) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ error = xchk_bmap(sc, XFS_DATA_FORK);
+ if (error)
+ return error;
+
+ /* If the data fork is clean, it is clearly not zapped. */
+ xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTD_ZAPPED);
+ return 0;
}
/* Scrub an inode's attr fork. */
@@ -947,7 +1041,27 @@ int
xchk_bmap_attr(
struct xfs_scrub *sc)
{
- return xchk_bmap(sc, XFS_ATTR_FORK);
+ int error;
+
+ /*
+ * If the attr fork has been zapped, it's possible that forkoff was
+ * reset to zero and hence sc->ip->i_afp is NULL. We don't want the
+ * NULL ifp check in xchk_bmap to conclude that the attr fork is ok,
+ * so short circuit that logic by setting the corruption flag and
+ * returning immediately.
+ */
+ if (xchk_file_looks_zapped(sc, XFS_SICK_INO_BMBTA_ZAPPED)) {
+ xchk_ino_set_corrupt(sc, sc->ip->i_ino);
+ return 0;
+ }
+
+ error = xchk_bmap(sc, XFS_ATTR_FORK);
+ if (error)
+ return error;
+
+ /* If the attr fork is clean, it is clearly not zapped. */
+ xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_BMBTA_ZAPPED);
+ return 0;
}
/* Scrub an inode's CoW fork. */
diff --git a/fs/xfs/scrub/bmap_repair.c b/fs/xfs/scrub/bmap_repair.c
new file mode 100644
index 000000000000..a4bb89fdd510
--- /dev/null
+++ b/fs/xfs/scrub/bmap_repair.c
@@ -0,0 +1,867 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_quota.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "xfs_reflink.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/fsb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Inode Fork Block Mapping (BMBT) Repair
+ * ======================================
+ *
+ * Gather all the rmap records for the inode and fork we're fixing, reset the
+ * incore fork, then recreate the btree.
+ */
+
+enum reflink_scan_state {
+ RLS_IRRELEVANT = -1, /* not applicable to this file */
+ RLS_UNKNOWN, /* shared extent scans required */
+ RLS_SET_IFLAG, /* iflag must be set */
+};
+
+struct xrep_bmap {
+ /* Old bmbt blocks */
+ struct xfsb_bitmap old_bmbt_blocks;
+
+ /* New fork. */
+ struct xrep_newbt new_bmapbt;
+
+ /* List of new bmap records. */
+ struct xfarray *bmap_records;
+
+ struct xfs_scrub *sc;
+
+ /* How many blocks did we find allocated to this file? */
+ xfs_rfsblock_t nblocks;
+
+ /* How many bmbt blocks did we find for this fork? */
+ xfs_rfsblock_t old_bmbt_block_count;
+
+ /* get_records()'s position in the free space record array. */
+ xfarray_idx_t array_cur;
+
+ /* How many real (non-hole, non-delalloc) mappings do we have? */
+ uint64_t real_mappings;
+
+ /* Which fork are we fixing? */
+ int whichfork;
+
+ /* What d the REFLINK flag be set when the repair is over? */
+ enum reflink_scan_state reflink_scan;
+
+ /* Do we allow unwritten extents? */
+ bool allow_unwritten;
+};
+
+/* Is this space extent shared? Flag the inode if it is. */
+STATIC int
+xrep_bmap_discover_shared(
+ struct xrep_bmap *rb,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount)
+{
+ struct xfs_scrub *sc = rb->sc;
+ xfs_agblock_t agbno;
+ xfs_agblock_t fbno;
+ xfs_extlen_t flen;
+ int error;
+
+ agbno = XFS_FSB_TO_AGBNO(sc->mp, startblock);
+ error = xfs_refcount_find_shared(sc->sa.refc_cur, agbno, blockcount,
+ &fbno, &flen, false);
+ if (error)
+ return error;
+
+ if (fbno != NULLAGBLOCK)
+ rb->reflink_scan = RLS_SET_IFLAG;
+
+ return 0;
+}
+
+/* Remember this reverse-mapping as a series of bmap records. */
+STATIC int
+xrep_bmap_from_rmap(
+ struct xrep_bmap *rb,
+ xfs_fileoff_t startoff,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount,
+ bool unwritten)
+{
+ struct xfs_bmbt_irec irec = {
+ .br_startoff = startoff,
+ .br_startblock = startblock,
+ .br_state = unwritten ? XFS_EXT_UNWRITTEN : XFS_EXT_NORM,
+ };
+ struct xfs_bmbt_rec rbe;
+ struct xfs_scrub *sc = rb->sc;
+ int error = 0;
+
+ /*
+ * If we're repairing the data fork of a non-reflinked regular file on
+ * a reflink filesystem, we need to figure out if this space extent is
+ * shared.
+ */
+ if (rb->reflink_scan == RLS_UNKNOWN && !unwritten) {
+ error = xrep_bmap_discover_shared(rb, startblock, blockcount);
+ if (error)
+ return error;
+ }
+
+ do {
+ xfs_failaddr_t fa;
+
+ irec.br_blockcount = min_t(xfs_filblks_t, blockcount,
+ XFS_MAX_BMBT_EXTLEN);
+
+ fa = xfs_bmap_validate_extent(sc->ip, rb->whichfork, &irec);
+ if (fa)
+ return -EFSCORRUPTED;
+
+ xfs_bmbt_disk_set_all(&rbe, &irec);
+
+ trace_xrep_bmap_found(sc->ip, rb->whichfork, &irec);
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ error = xfarray_append(rb->bmap_records, &rbe);
+ if (error)
+ return error;
+
+ rb->real_mappings++;
+
+ irec.br_startblock += irec.br_blockcount;
+ irec.br_startoff += irec.br_blockcount;
+ blockcount -= irec.br_blockcount;
+ } while (blockcount > 0);
+
+ return 0;
+}
+
+/* Check for any obvious errors or conflicts in the file mapping. */
+STATIC int
+xrep_bmap_check_fork_rmap(
+ struct xrep_bmap *rb,
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec)
+{
+ struct xfs_scrub *sc = rb->sc;
+ enum xbtree_recpacking outcome;
+ int error;
+
+ /*
+ * Data extents for rt files are never stored on the data device, but
+ * everything else (xattrs, bmbt blocks) can be.
+ */
+ if (XFS_IS_REALTIME_INODE(sc->ip) &&
+ !(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)))
+ return -EFSCORRUPTED;
+
+ /* Check that this is within the AG. */
+ if (!xfs_verify_agbext(cur->bc_ag.pag, rec->rm_startblock,
+ rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ /* Check the file offset range. */
+ if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+ !xfs_verify_fileext(sc->mp, rec->rm_offset, rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ /* No contradictory flags. */
+ if ((rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)) &&
+ (rec->rm_flags & XFS_RMAP_UNWRITTEN))
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rm_startblock,
+ rec->rm_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ /* Must not be an inode chunk. */
+ error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
+ rec->rm_startblock, rec->rm_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Record extents that belong to this inode's fork. */
+STATIC int
+xrep_bmap_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_bmap *rb = priv;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_fsblock_t fsbno;
+ int error = 0;
+
+ if (xchk_should_terminate(rb->sc, &error))
+ return error;
+
+ if (rec->rm_owner != rb->sc->ip->i_ino)
+ return 0;
+
+ error = xrep_bmap_check_fork_rmap(rb, cur, rec);
+ if (error)
+ return error;
+
+ /*
+ * Record all blocks allocated to this file even if the extent isn't
+ * for the fork we're rebuilding so that we can reset di_nblocks later.
+ */
+ rb->nblocks += rec->rm_blockcount;
+
+ /* If this rmap isn't for the fork we want, we're done. */
+ if (rb->whichfork == XFS_DATA_FORK &&
+ (rec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return 0;
+ if (rb->whichfork == XFS_ATTR_FORK &&
+ !(rec->rm_flags & XFS_RMAP_ATTR_FORK))
+ return 0;
+
+ /* Reject unwritten extents if we don't allow those. */
+ if ((rec->rm_flags & XFS_RMAP_UNWRITTEN) && !rb->allow_unwritten)
+ return -EFSCORRUPTED;
+
+ fsbno = XFS_AGB_TO_FSB(mp, cur->bc_ag.pag->pag_agno,
+ rec->rm_startblock);
+
+ if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) {
+ rb->old_bmbt_block_count += rec->rm_blockcount;
+ return xfsb_bitmap_set(&rb->old_bmbt_blocks, fsbno,
+ rec->rm_blockcount);
+ }
+
+ return xrep_bmap_from_rmap(rb, rec->rm_offset, fsbno,
+ rec->rm_blockcount,
+ rec->rm_flags & XFS_RMAP_UNWRITTEN);
+}
+
+/*
+ * Compare two block mapping records. We want to sort in order of increasing
+ * file offset.
+ */
+static int
+xrep_bmap_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_bmbt_rec *ba = a;
+ const struct xfs_bmbt_rec *bb = b;
+ xfs_fileoff_t ao = xfs_bmbt_disk_get_startoff(ba);
+ xfs_fileoff_t bo = xfs_bmbt_disk_get_startoff(bb);
+
+ if (ao > bo)
+ return 1;
+ else if (ao < bo)
+ return -1;
+ return 0;
+}
+
+/*
+ * Sort the bmap extents by fork offset or else the records will be in the
+ * wrong order. Ensure there are no overlaps in the file offset ranges.
+ */
+STATIC int
+xrep_bmap_sort_records(
+ struct xrep_bmap *rb)
+{
+ struct xfs_bmbt_irec irec;
+ xfs_fileoff_t next_off = 0;
+ xfarray_idx_t array_cur;
+ int error;
+
+ error = xfarray_sort(rb->bmap_records, xrep_bmap_extent_cmp,
+ XFARRAY_SORT_KILLABLE);
+ if (error)
+ return error;
+
+ foreach_xfarray_idx(rb->bmap_records, array_cur) {
+ struct xfs_bmbt_rec rec;
+
+ if (xchk_should_terminate(rb->sc, &error))
+ return error;
+
+ error = xfarray_load(rb->bmap_records, array_cur, &rec);
+ if (error)
+ return error;
+
+ xfs_bmbt_disk_get_all(&rec, &irec);
+
+ if (irec.br_startoff < next_off)
+ return -EFSCORRUPTED;
+
+ next_off = irec.br_startoff + irec.br_blockcount;
+ }
+
+ return 0;
+}
+
+/* Scan one AG for reverse mappings that we can turn into extent maps. */
+STATIC int
+xrep_bmap_scan_ag(
+ struct xrep_bmap *rb,
+ struct xfs_perag *pag)
+{
+ struct xfs_scrub *sc = rb->sc;
+ int error;
+
+ error = xrep_ag_init(sc, pag, &sc->sa);
+ if (error)
+ return error;
+
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_bmap_walk_rmap, rb);
+ xchk_ag_free(sc, &sc->sa);
+ return error;
+}
+
+/* Find the delalloc extents from the old incore extent tree. */
+STATIC int
+xrep_bmap_find_delalloc(
+ struct xrep_bmap *rb)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_rec rbe;
+ struct xfs_inode *ip = rb->sc->ip;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, rb->whichfork);
+ int error = 0;
+
+ /*
+ * Skip this scan if we don't expect to find delayed allocation
+ * reservations in this fork.
+ */
+ if (rb->whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0)
+ return 0;
+
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ if (!isnullstartblock(irec.br_startblock))
+ continue;
+
+ xfs_bmbt_disk_set_all(&rbe, &irec);
+
+ trace_xrep_bmap_found(ip, rb->whichfork, &irec);
+
+ if (xchk_should_terminate(rb->sc, &error))
+ return error;
+
+ error = xfarray_append(rb->bmap_records, &rbe);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/*
+ * Collect block mappings for this fork of this inode and decide if we have
+ * enough space to rebuild. Caller is responsible for cleaning up the list if
+ * anything goes wrong.
+ */
+STATIC int
+xrep_bmap_find_mappings(
+ struct xrep_bmap *rb)
+{
+ struct xfs_scrub *sc = rb->sc;
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ int error = 0;
+
+ /* Iterate the rmaps for extents. */
+ for_each_perag(sc->mp, agno, pag) {
+ error = xrep_bmap_scan_ag(rb, pag);
+ if (error) {
+ xfs_perag_rele(pag);
+ return error;
+ }
+ }
+
+ return xrep_bmap_find_delalloc(rb);
+}
+
+/* Retrieve real extent mappings for bulk loading the bmap btree. */
+STATIC int
+xrep_bmap_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_bmbt_rec rec;
+ struct xfs_bmbt_irec *irec = &cur->bc_rec.b;
+ struct xrep_bmap *rb = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ do {
+ error = xfarray_load(rb->bmap_records, rb->array_cur++,
+ &rec);
+ if (error)
+ return error;
+
+ xfs_bmbt_disk_get_all(&rec, irec);
+ } while (isnullstartblock(irec->br_startblock));
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_bmap_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_bmap *rb = priv;
+
+ return xrep_newbt_claim_block(cur, &rb->new_bmapbt, ptr);
+}
+
+/* Figure out how much space we need to create the incore btree root block. */
+STATIC size_t
+xrep_bmap_iroot_size(
+ struct xfs_btree_cur *cur,
+ unsigned int level,
+ unsigned int nr_this_level,
+ void *priv)
+{
+ ASSERT(level > 0);
+
+ return XFS_BMAP_BROOT_SPACE_CALC(cur->bc_mp, nr_this_level);
+}
+
+/* Update the inode counters. */
+STATIC int
+xrep_bmap_reset_counters(
+ struct xrep_bmap *rb)
+{
+ struct xfs_scrub *sc = rb->sc;
+ struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake;
+ int64_t delta;
+
+ if (rb->reflink_scan == RLS_SET_IFLAG)
+ sc->ip->i_diflags2 |= XFS_DIFLAG2_REFLINK;
+
+ /*
+ * Update the inode block counts to reflect the extents we found in the
+ * rmapbt.
+ */
+ delta = ifake->if_blocks - rb->old_bmbt_block_count;
+ sc->ip->i_nblocks = rb->nblocks + delta;
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+
+ /*
+ * Adjust the quota counts by the difference in size between the old
+ * and new bmbt.
+ */
+ xfs_trans_mod_dquot_byino(sc->tp, sc->ip, XFS_TRANS_DQ_BCOUNT, delta);
+ return 0;
+}
+
+/*
+ * Create a new iext tree and load it with block mappings. If the inode is
+ * in extents format, that's all we need to do to commit the new mappings.
+ * If it is in btree format, this takes care of preloading the incore tree.
+ */
+STATIC int
+xrep_bmap_extents_load(
+ struct xrep_bmap *rb)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec irec;
+ struct xfs_ifork *ifp = rb->new_bmapbt.ifake.if_fork;
+ xfarray_idx_t array_cur;
+ int error;
+
+ ASSERT(ifp->if_bytes == 0);
+
+ /* Add all the mappings (incl. delalloc) to the incore extent tree. */
+ xfs_iext_first(ifp, &icur);
+ foreach_xfarray_idx(rb->bmap_records, array_cur) {
+ struct xfs_bmbt_rec rec;
+
+ error = xfarray_load(rb->bmap_records, array_cur, &rec);
+ if (error)
+ return error;
+
+ xfs_bmbt_disk_get_all(&rec, &irec);
+
+ xfs_iext_insert_raw(ifp, &icur, &irec);
+ if (!isnullstartblock(irec.br_startblock))
+ ifp->if_nextents++;
+
+ xfs_iext_next(ifp, &icur);
+ }
+
+ return xrep_ino_ensure_extent_count(rb->sc, rb->whichfork,
+ ifp->if_nextents);
+}
+
+/*
+ * Reserve new btree blocks, bulk load the bmap records into the ondisk btree,
+ * and load the incore extent tree.
+ */
+STATIC int
+xrep_bmap_btree_load(
+ struct xrep_bmap *rb,
+ struct xfs_btree_cur *bmap_cur)
+{
+ struct xfs_scrub *sc = rb->sc;
+ int error;
+
+ /* Compute how many blocks we'll need. */
+ error = xfs_btree_bload_compute_geometry(bmap_cur,
+ &rb->new_bmapbt.bload, rb->real_mappings);
+ if (error)
+ return error;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * Guess how many blocks we're going to need to rebuild an entire bmap
+ * from the number of extents we found, and pump up our transaction to
+ * have sufficient block reservation. We're allowed to exceed file
+ * quota to repair inconsistent metadata.
+ */
+ error = xfs_trans_reserve_more_inode(sc->tp, sc->ip,
+ rb->new_bmapbt.bload.nr_blocks, 0, true);
+ if (error)
+ return error;
+
+ /* Reserve the space we'll need for the new btree. */
+ error = xrep_newbt_alloc_blocks(&rb->new_bmapbt,
+ rb->new_bmapbt.bload.nr_blocks);
+ if (error)
+ return error;
+
+ /* Add all observed bmap records. */
+ rb->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(bmap_cur, &rb->new_bmapbt.bload, rb);
+ if (error)
+ return error;
+
+ /*
+ * Load the new bmap records into the new incore extent tree to
+ * preserve delalloc reservations for regular files. The directory
+ * code loads the extent tree during xfs_dir_open and assumes
+ * thereafter that it remains loaded, so we must not violate that
+ * assumption.
+ */
+ return xrep_bmap_extents_load(rb);
+}
+
+/*
+ * Use the collected bmap information to stage a new bmap fork. If this is
+ * successful we'll return with the new fork information logged to the repair
+ * transaction but not yet committed. The caller must ensure that the inode
+ * is joined to the transaction; the inode will be joined to a clean
+ * transaction when the function returns.
+ */
+STATIC int
+xrep_bmap_build_new_fork(
+ struct xrep_bmap *rb)
+{
+ struct xfs_owner_info oinfo;
+ struct xfs_scrub *sc = rb->sc;
+ struct xfs_btree_cur *bmap_cur;
+ struct xbtree_ifakeroot *ifake = &rb->new_bmapbt.ifake;
+ int error;
+
+ error = xrep_bmap_sort_records(rb);
+ if (error)
+ return error;
+
+ /*
+ * Prepare to construct the new fork by initializing the new btree
+ * structure and creating a fake ifork in the ifakeroot structure.
+ */
+ xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork);
+ error = xrep_newbt_init_inode(&rb->new_bmapbt, sc, rb->whichfork,
+ &oinfo);
+ if (error)
+ return error;
+
+ rb->new_bmapbt.bload.get_records = xrep_bmap_get_records;
+ rb->new_bmapbt.bload.claim_block = xrep_bmap_claim_block;
+ rb->new_bmapbt.bload.iroot_size = xrep_bmap_iroot_size;
+ bmap_cur = xfs_bmbt_stage_cursor(sc->mp, sc->ip, ifake);
+
+ /*
+ * Figure out the size and format of the new fork, then fill it with
+ * all the bmap records we've found. Join the inode to the transaction
+ * so that we can roll the transaction while holding the inode locked.
+ */
+ if (rb->real_mappings <= XFS_IFORK_MAXEXT(sc->ip, rb->whichfork)) {
+ ifake->if_fork->if_format = XFS_DINODE_FMT_EXTENTS;
+ error = xrep_bmap_extents_load(rb);
+ } else {
+ ifake->if_fork->if_format = XFS_DINODE_FMT_BTREE;
+ error = xrep_bmap_btree_load(rb, bmap_cur);
+ }
+ if (error)
+ goto err_cur;
+
+ /*
+ * Install the new fork in the inode. After this point the old mapping
+ * data are no longer accessible and the new tree is live. We delete
+ * the cursor immediately after committing the staged root because the
+ * staged fork might be in extents format.
+ */
+ xfs_bmbt_commit_staged_btree(bmap_cur, sc->tp, rb->whichfork);
+ xfs_btree_del_cursor(bmap_cur, 0);
+
+ /* Reset the inode counters now that we've changed the fork. */
+ error = xrep_bmap_reset_counters(rb);
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rb->new_bmapbt);
+ if (error)
+ return error;
+
+ return xrep_roll_trans(sc);
+
+err_cur:
+ if (bmap_cur)
+ xfs_btree_del_cursor(bmap_cur, error);
+err_newbt:
+ xrep_newbt_cancel(&rb->new_bmapbt);
+ return error;
+}
+
+/*
+ * Now that we've logged the new inode btree, invalidate all of the old blocks
+ * and free them, if there were any.
+ */
+STATIC int
+xrep_bmap_remove_old_tree(
+ struct xrep_bmap *rb)
+{
+ struct xfs_scrub *sc = rb->sc;
+ struct xfs_owner_info oinfo;
+
+ /* Free the old bmbt blocks if they're not in use. */
+ xfs_rmap_ino_bmbt_owner(&oinfo, sc->ip->i_ino, rb->whichfork);
+ return xrep_reap_fsblocks(sc, &rb->old_bmbt_blocks, &oinfo);
+}
+
+/* Check for garbage inputs. Returns -ECANCELED if there's nothing to do. */
+STATIC int
+xrep_bmap_check_inputs(
+ struct xfs_scrub *sc,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, whichfork);
+
+ ASSERT(whichfork == XFS_DATA_FORK || whichfork == XFS_ATTR_FORK);
+
+ if (!xfs_has_rmapbt(sc->mp))
+ return -EOPNOTSUPP;
+
+ /* No fork means nothing to rebuild. */
+ if (!ifp)
+ return -ECANCELED;
+
+ /*
+ * We only know how to repair extent mappings, which is to say that we
+ * only support extents and btree fork format. Repairs to a local
+ * format fork require a higher level repair function, so we do not
+ * have any work to do here.
+ */
+ switch (ifp->if_format) {
+ case XFS_DINODE_FMT_DEV:
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_UUID:
+ return -ECANCELED;
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ break;
+ default:
+ return -EFSCORRUPTED;
+ }
+
+ if (whichfork == XFS_ATTR_FORK)
+ return 0;
+
+ /* Only files, symlinks, and directories get to have data forks. */
+ switch (VFS_I(sc->ip)->i_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ /* ok */
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Don't know how to rebuild realtime data forks. */
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ return -EOPNOTSUPP;
+
+ return 0;
+}
+
+/* Set up the initial state of the reflink scan. */
+static inline enum reflink_scan_state
+xrep_bmap_init_reflink_scan(
+ struct xfs_scrub *sc,
+ int whichfork)
+{
+ /* cannot share on non-reflink filesystem */
+ if (!xfs_has_reflink(sc->mp))
+ return RLS_IRRELEVANT;
+
+ /* preserve flag if it's already set */
+ if (xfs_is_reflink_inode(sc->ip))
+ return RLS_SET_IFLAG;
+
+ /* can only share regular files */
+ if (!S_ISREG(VFS_I(sc->ip)->i_mode))
+ return RLS_IRRELEVANT;
+
+ /* cannot share attr fork extents */
+ if (whichfork != XFS_DATA_FORK)
+ return RLS_IRRELEVANT;
+
+ /* cannot share realtime extents */
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ return RLS_IRRELEVANT;
+
+ return RLS_UNKNOWN;
+}
+
+/* Repair an inode fork. */
+int
+xrep_bmap(
+ struct xfs_scrub *sc,
+ int whichfork,
+ bool allow_unwritten)
+{
+ struct xrep_bmap *rb;
+ char *descr;
+ unsigned int max_bmbt_recs;
+ bool large_extcount;
+ int error = 0;
+
+ error = xrep_bmap_check_inputs(sc, whichfork);
+ if (error == -ECANCELED)
+ return 0;
+ if (error)
+ return error;
+
+ rb = kzalloc(sizeof(struct xrep_bmap), XCHK_GFP_FLAGS);
+ if (!rb)
+ return -ENOMEM;
+ rb->sc = sc;
+ rb->whichfork = whichfork;
+ rb->reflink_scan = xrep_bmap_init_reflink_scan(sc, whichfork);
+ rb->allow_unwritten = allow_unwritten;
+
+ /* Set up enough storage to handle the max records for this fork. */
+ large_extcount = xfs_has_large_extent_counts(sc->mp);
+ max_bmbt_recs = xfs_iext_max_nextents(large_extcount, whichfork);
+ descr = xchk_xfile_ino_descr(sc, "%s fork mapping records",
+ whichfork == XFS_DATA_FORK ? "data" : "attr");
+ error = xfarray_create(descr, max_bmbt_recs,
+ sizeof(struct xfs_bmbt_rec), &rb->bmap_records);
+ kfree(descr);
+ if (error)
+ goto out_rb;
+
+ /* Collect all reverse mappings for this fork's extents. */
+ xfsb_bitmap_init(&rb->old_bmbt_blocks);
+ error = xrep_bmap_find_mappings(rb);
+ if (error)
+ goto out_bitmap;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Rebuild the bmap information. */
+ error = xrep_bmap_build_new_fork(rb);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old tree. */
+ error = xrep_bmap_remove_old_tree(rb);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xfsb_bitmap_destroy(&rb->old_bmbt_blocks);
+ xfarray_destroy(rb->bmap_records);
+out_rb:
+ kfree(rb);
+ return error;
+}
+
+/* Repair an inode's data fork. */
+int
+xrep_bmap_data(
+ struct xfs_scrub *sc)
+{
+ return xrep_bmap(sc, XFS_DATA_FORK, true);
+}
+
+/* Repair an inode's attr fork. */
+int
+xrep_bmap_attr(
+ struct xfs_scrub *sc)
+{
+ return xrep_bmap(sc, XFS_ATTR_FORK, false);
+}
diff --git a/fs/xfs/scrub/common.c b/fs/xfs/scrub/common.c
index de24532fe083..81f2b96bb5a7 100644
--- a/fs/xfs/scrub/common.c
+++ b/fs/xfs/scrub/common.c
@@ -25,6 +25,7 @@
#include "xfs_trans_priv.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
+#include "xfs_dir2_priv.h"
#include "xfs_attr.h"
#include "xfs_reflink.h"
#include "xfs_ag.h"
@@ -604,6 +605,7 @@ xchk_ag_free(
struct xchk_ag *sa)
{
xchk_ag_btcur_free(sa);
+ xrep_reset_perag_resv(sc);
if (sa->agf_bp) {
xfs_trans_brelse(sc->tp, sa->agf_bp);
sa->agf_bp = NULL;
@@ -733,6 +735,8 @@ xchk_iget(
xfs_ino_t inum,
struct xfs_inode **ipp)
{
+ ASSERT(sc->tp != NULL);
+
return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
}
@@ -816,6 +820,26 @@ again:
return 0;
}
+#ifdef CONFIG_XFS_QUOTA
+/*
+ * Try to attach dquots to this inode if we think we might want to repair it.
+ * Callers must not hold any ILOCKs. If the dquots are broken and cannot be
+ * attached, a quotacheck will be scheduled.
+ */
+int
+xchk_ino_dqattach(
+ struct xfs_scrub *sc)
+{
+ ASSERT(sc->tp != NULL);
+ ASSERT(sc->ip != NULL);
+
+ if (!xchk_could_repair(sc))
+ return 0;
+
+ return xrep_ino_dqattach(sc);
+}
+#endif
+
/* Install an inode that we opened by handle for scrubbing. */
int
xchk_install_handle_inode(
@@ -882,8 +906,8 @@ xchk_iget_for_scrubbing(
if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
return -ENOENT;
- /* Try a regular untrusted iget. */
- error = xchk_iget(sc, sc->sm->sm_ino, &ip);
+ /* Try a safe untrusted iget. */
+ error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
if (!error)
return xchk_install_handle_inode(sc, ip);
if (error == -ENOENT)
@@ -1027,6 +1051,11 @@ xchk_setup_inode_contents(
error = xchk_trans_alloc(sc, resblks);
if (error)
goto out;
+
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ goto out;
+
xchk_ilock(sc, XFS_ILOCK_EXCL);
out:
/* scrub teardown will unlock and release the inode for us */
@@ -1132,6 +1161,7 @@ xchk_metadata_inode_subtype(
unsigned int scrub_type)
{
__u32 smtype = sc->sm->sm_type;
+ unsigned int sick_mask = sc->sick_mask;
int error;
sc->sm->sm_type = scrub_type;
@@ -1149,6 +1179,7 @@ xchk_metadata_inode_subtype(
break;
}
+ sc->sick_mask = sick_mask;
sc->sm->sm_type = smtype;
return error;
}
diff --git a/fs/xfs/scrub/common.h b/fs/xfs/scrub/common.h
index cabdc0e16838..da09580b454a 100644
--- a/fs/xfs/scrub/common.h
+++ b/fs/xfs/scrub/common.h
@@ -103,9 +103,15 @@ xchk_setup_rtsummary(struct xfs_scrub *sc)
}
#endif
#ifdef CONFIG_XFS_QUOTA
+int xchk_ino_dqattach(struct xfs_scrub *sc);
int xchk_setup_quota(struct xfs_scrub *sc);
#else
static inline int
+xchk_ino_dqattach(struct xfs_scrub *sc)
+{
+ return 0;
+}
+static inline int
xchk_setup_quota(struct xfs_scrub *sc)
{
return -ENOENT;
@@ -151,6 +157,11 @@ void xchk_iunlock(struct xfs_scrub *sc, unsigned int ilock_flags);
void xchk_buffer_recheck(struct xfs_scrub *sc, struct xfs_buf *bp);
+/*
+ * Grab the inode at @inum. The caller must have created a scrub transaction
+ * so that we can confirm the inumber by walking the inobt and not deadlock on
+ * a loop in the inobt.
+ */
int xchk_iget(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp);
int xchk_iget_agi(struct xfs_scrub *sc, xfs_ino_t inum,
struct xfs_buf **agi_bpp, struct xfs_inode **ipp);
@@ -158,6 +169,26 @@ void xchk_irele(struct xfs_scrub *sc, struct xfs_inode *ip);
int xchk_install_handle_inode(struct xfs_scrub *sc, struct xfs_inode *ip);
/*
+ * Safe version of (untrusted) xchk_iget that uses an empty transaction to
+ * avoid deadlocking on loops in the inobt. This should only be used in a
+ * scrub or repair setup routine, and only prior to grabbing a transaction.
+ */
+static inline int
+xchk_iget_safe(struct xfs_scrub *sc, xfs_ino_t inum, struct xfs_inode **ipp)
+{
+ int error;
+
+ ASSERT(sc->tp == NULL);
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+ error = xchk_iget(sc, inum, ipp);
+ xchk_trans_cancel(sc);
+ return error;
+}
+
+/*
* Don't bother cross-referencing if we already found corruption or cross
* referencing discrepancies.
*/
@@ -167,6 +198,8 @@ static inline bool xchk_skip_xref(struct xfs_scrub_metadata *sm)
XFS_SCRUB_OFLAG_XCORRUPT);
}
+bool xchk_dir_looks_zapped(struct xfs_inode *dp);
+
#ifdef CONFIG_XFS_ONLINE_REPAIR
/* Decide if a repair is required. */
static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
@@ -175,8 +208,21 @@ static inline bool xchk_needs_repair(const struct xfs_scrub_metadata *sm)
XFS_SCRUB_OFLAG_XCORRUPT |
XFS_SCRUB_OFLAG_PREEN);
}
+
+/*
+ * "Should we prepare for a repair?"
+ *
+ * Return true if the caller permits us to repair metadata and we're not
+ * setting up for a post-repair evaluation.
+ */
+static inline bool xchk_could_repair(const struct xfs_scrub *sc)
+{
+ return (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
+ !(sc->flags & XREP_ALREADY_FIXED);
+}
#else
# define xchk_needs_repair(sc) (false)
+# define xchk_could_repair(sc) (false)
#endif /* CONFIG_XFS_ONLINE_REPAIR */
int xchk_metadata_inode_forks(struct xfs_scrub *sc);
@@ -188,6 +234,16 @@ int xchk_metadata_inode_forks(struct xfs_scrub *sc);
#define xchk_xfile_descr(sc, fmt, ...) \
kasprintf(XCHK_GFP_FLAGS, "XFS (%s): " fmt, \
(sc)->mp->m_super->s_id, ##__VA_ARGS__)
+#define xchk_xfile_ag_descr(sc, fmt, ...) \
+ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): AG 0x%x " fmt, \
+ (sc)->mp->m_super->s_id, \
+ (sc)->sa.pag ? (sc)->sa.pag->pag_agno : (sc)->sm->sm_agno, \
+ ##__VA_ARGS__)
+#define xchk_xfile_ino_descr(sc, fmt, ...) \
+ kasprintf(XCHK_GFP_FLAGS, "XFS (%s): inode 0x%llx " fmt, \
+ (sc)->mp->m_super->s_id, \
+ (sc)->ip ? (sc)->ip->i_ino : (sc)->sm->sm_ino, \
+ ##__VA_ARGS__)
/*
* Setting up a hook to wait for intents to drain is costly -- we have to take
diff --git a/fs/xfs/scrub/cow_repair.c b/fs/xfs/scrub/cow_repair.c
new file mode 100644
index 000000000000..1e82c727af8e
--- /dev/null
+++ b/fs/xfs/scrub/cow_repair.c
@@ -0,0 +1,614 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_rmap.h"
+#include "xfs_refcount.h"
+#include "xfs_quota.h"
+#include "xfs_ialloc.h"
+#include "xfs_ag.h"
+#include "xfs_error.h"
+#include "xfs_errortag.h"
+#include "xfs_icache.h"
+#include "xfs_refcount_btree.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/off_bitmap.h"
+#include "scrub/fsb_bitmap.h"
+#include "scrub/reap.h"
+
+/*
+ * CoW Fork Mapping Repair
+ * =======================
+ *
+ * Although CoW staging extents are owned by incore CoW inode forks, on disk
+ * they are owned by the refcount btree. The ondisk metadata does not record
+ * any ownership information, which limits what we can do to repair the
+ * mappings in the CoW fork. At most, we can replace ifork mappings that lack
+ * an entry in the refcount btree or are described by a reverse mapping record
+ * whose owner is not OWN_COW.
+ *
+ * Replacing extents is also tricky -- we can't touch written CoW fork extents
+ * since they are undergoing writeback, and delalloc extents do not require
+ * repair since they only exist incore. Hence the most we can do is find the
+ * bad parts of unwritten mappings, allocate a replacement set of blocks, and
+ * replace the incore mapping. We use the regular reaping process to unmap
+ * or free the discarded blocks, as appropriate.
+ */
+struct xrep_cow {
+ struct xfs_scrub *sc;
+
+ /* Bitmap of file offset ranges that need replacing. */
+ struct xoff_bitmap bad_fileoffs;
+
+ /* Bitmap of fsblocks that were removed from the CoW fork. */
+ struct xfsb_bitmap old_cowfork_fsblocks;
+
+ /* CoW fork mappings used to scan for bad CoW staging extents. */
+ struct xfs_bmbt_irec irec;
+
+ /* refcount btree block number of irec.br_startblock */
+ unsigned int irec_startbno;
+
+ /* refcount btree block number of the next refcount record we expect */
+ unsigned int next_bno;
+};
+
+/* CoW staging extent. */
+struct xrep_cow_extent {
+ xfs_fsblock_t fsbno;
+ xfs_extlen_t len;
+};
+
+/*
+ * Mark the part of the file range that corresponds to the given physical
+ * space. Caller must ensure that the physical range is within xc->irec.
+ */
+STATIC int
+xrep_cow_mark_file_range(
+ struct xrep_cow *xc,
+ xfs_fsblock_t startblock,
+ xfs_filblks_t blockcount)
+{
+ xfs_fileoff_t startoff;
+
+ startoff = xc->irec.br_startoff +
+ (startblock - xc->irec.br_startblock);
+
+ trace_xrep_cow_mark_file_range(xc->sc->ip, startblock, startoff,
+ blockcount);
+
+ return xoff_bitmap_set(&xc->bad_fileoffs, startoff, blockcount);
+}
+
+/*
+ * Trim @src to fit within the CoW fork mapping being examined, and put the
+ * result in @dst.
+ */
+static inline void
+xrep_cow_trim_refcount(
+ struct xrep_cow *xc,
+ struct xfs_refcount_irec *dst,
+ const struct xfs_refcount_irec *src)
+{
+ unsigned int adj;
+
+ memcpy(dst, src, sizeof(*dst));
+
+ if (dst->rc_startblock < xc->irec_startbno) {
+ adj = xc->irec_startbno - dst->rc_startblock;
+ dst->rc_blockcount -= adj;
+ dst->rc_startblock += adj;
+ }
+
+ if (dst->rc_startblock + dst->rc_blockcount >
+ xc->irec_startbno + xc->irec.br_blockcount) {
+ adj = (dst->rc_startblock + dst->rc_blockcount) -
+ (xc->irec_startbno + xc->irec.br_blockcount);
+ dst->rc_blockcount -= adj;
+ }
+}
+
+/* Mark any shared CoW staging extents. */
+STATIC int
+xrep_cow_mark_shared_staging(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *rec,
+ void *priv)
+{
+ struct xrep_cow *xc = priv;
+ struct xfs_refcount_irec rrec;
+ xfs_fsblock_t fsbno;
+
+ if (!xfs_refcount_check_domain(rec) ||
+ rec->rc_domain != XFS_REFC_DOMAIN_SHARED)
+ return -EFSCORRUPTED;
+
+ xrep_cow_trim_refcount(xc, &rrec, rec);
+
+ fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
+ rrec.rc_startblock);
+ return xrep_cow_mark_file_range(xc, fsbno, rrec.rc_blockcount);
+}
+
+/*
+ * Mark any portion of the CoW fork file offset range where there is not a CoW
+ * staging extent record in the refcountbt, and keep a record of where we did
+ * find correct refcountbt records. Staging records are always cleaned out at
+ * mount time, so any two inodes trying to map the same staging area would have
+ * already taken the fs down due to refcount btree verifier errors. Hence this
+ * inode should be the sole creator of the staging extent records ondisk.
+ */
+STATIC int
+xrep_cow_mark_missing_staging(
+ struct xfs_btree_cur *cur,
+ const struct xfs_refcount_irec *rec,
+ void *priv)
+{
+ struct xrep_cow *xc = priv;
+ struct xfs_refcount_irec rrec;
+ int error;
+
+ if (!xfs_refcount_check_domain(rec) ||
+ rec->rc_domain != XFS_REFC_DOMAIN_COW)
+ return -EFSCORRUPTED;
+
+ xrep_cow_trim_refcount(xc, &rrec, rec);
+
+ if (xc->next_bno >= rrec.rc_startblock)
+ goto next;
+
+ error = xrep_cow_mark_file_range(xc,
+ XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno,
+ xc->next_bno),
+ rrec.rc_startblock - xc->next_bno);
+ if (error)
+ return error;
+
+next:
+ xc->next_bno = rrec.rc_startblock + rrec.rc_blockcount;
+ return 0;
+}
+
+/*
+ * Mark any area that does not correspond to a CoW staging rmap. These are
+ * cross-linked areas that must be avoided.
+ */
+STATIC int
+xrep_cow_mark_missing_staging_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_cow *xc = priv;
+ xfs_fsblock_t fsbno;
+ xfs_agblock_t rec_bno;
+ xfs_extlen_t rec_len;
+ unsigned int adj;
+
+ if (rec->rm_owner == XFS_RMAP_OWN_COW)
+ return 0;
+
+ rec_bno = rec->rm_startblock;
+ rec_len = rec->rm_blockcount;
+ if (rec_bno < xc->irec_startbno) {
+ adj = xc->irec_startbno - rec_bno;
+ rec_len -= adj;
+ rec_bno += adj;
+ }
+
+ if (rec_bno + rec_len > xc->irec_startbno + xc->irec.br_blockcount) {
+ adj = (rec_bno + rec_len) -
+ (xc->irec_startbno + xc->irec.br_blockcount);
+ rec_len -= adj;
+ }
+
+ fsbno = XFS_AGB_TO_FSB(xc->sc->mp, cur->bc_ag.pag->pag_agno, rec_bno);
+ return xrep_cow_mark_file_range(xc, fsbno, rec_len);
+}
+
+/*
+ * Find any part of the CoW fork mapping that isn't a single-owner CoW staging
+ * extent and mark the corresponding part of the file range in the bitmap.
+ */
+STATIC int
+xrep_cow_find_bad(
+ struct xrep_cow *xc)
+{
+ struct xfs_refcount_irec rc_low = { 0 };
+ struct xfs_refcount_irec rc_high = { 0 };
+ struct xfs_rmap_irec rm_low = { 0 };
+ struct xfs_rmap_irec rm_high = { 0 };
+ struct xfs_perag *pag;
+ struct xfs_scrub *sc = xc->sc;
+ xfs_agnumber_t agno;
+ int error;
+
+ agno = XFS_FSB_TO_AGNO(sc->mp, xc->irec.br_startblock);
+ xc->irec_startbno = XFS_FSB_TO_AGBNO(sc->mp, xc->irec.br_startblock);
+
+ pag = xfs_perag_get(sc->mp, agno);
+ if (!pag)
+ return -EFSCORRUPTED;
+
+ error = xrep_ag_init(sc, pag, &sc->sa);
+ if (error)
+ goto out_pag;
+
+ /* Mark any CoW fork extents that are shared. */
+ rc_low.rc_startblock = xc->irec_startbno;
+ rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_SHARED;
+ error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
+ xrep_cow_mark_shared_staging, xc);
+ if (error)
+ goto out_sa;
+
+ /* Make sure there are CoW staging extents for the whole mapping. */
+ rc_low.rc_startblock = xc->irec_startbno;
+ rc_high.rc_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ rc_low.rc_domain = rc_high.rc_domain = XFS_REFC_DOMAIN_COW;
+ xc->next_bno = xc->irec_startbno;
+ error = xfs_refcount_query_range(sc->sa.refc_cur, &rc_low, &rc_high,
+ xrep_cow_mark_missing_staging, xc);
+ if (error)
+ goto out_sa;
+
+ if (xc->next_bno < xc->irec_startbno + xc->irec.br_blockcount) {
+ error = xrep_cow_mark_file_range(xc,
+ XFS_AGB_TO_FSB(sc->mp, pag->pag_agno,
+ xc->next_bno),
+ xc->irec_startbno + xc->irec.br_blockcount -
+ xc->next_bno);
+ if (error)
+ goto out_sa;
+ }
+
+ /* Mark any area has an rmap that isn't a COW staging extent. */
+ rm_low.rm_startblock = xc->irec_startbno;
+ memset(&rm_high, 0xFF, sizeof(rm_high));
+ rm_high.rm_startblock = xc->irec_startbno + xc->irec.br_blockcount - 1;
+ error = xfs_rmap_query_range(sc->sa.rmap_cur, &rm_low, &rm_high,
+ xrep_cow_mark_missing_staging_rmap, xc);
+ if (error)
+ goto out_sa;
+
+ /*
+ * If userspace is forcing us to rebuild the CoW fork or someone turned
+ * on the debugging knob, replace everything in the CoW fork.
+ */
+ if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD) ||
+ XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) {
+ error = xrep_cow_mark_file_range(xc, xc->irec.br_startblock,
+ xc->irec.br_blockcount);
+ if (error)
+ return error;
+ }
+
+out_sa:
+ xchk_ag_free(sc, &sc->sa);
+out_pag:
+ xfs_perag_put(pag);
+ return 0;
+}
+
+/*
+ * Allocate a replacement CoW staging extent of up to the given number of
+ * blocks, and fill out the mapping.
+ */
+STATIC int
+xrep_cow_alloc(
+ struct xfs_scrub *sc,
+ xfs_extlen_t maxlen,
+ struct xrep_cow_extent *repl)
+{
+ struct xfs_alloc_arg args = {
+ .tp = sc->tp,
+ .mp = sc->mp,
+ .oinfo = XFS_RMAP_OINFO_SKIP_UPDATE,
+ .minlen = 1,
+ .maxlen = maxlen,
+ .prod = 1,
+ .resv = XFS_AG_RESV_NONE,
+ .datatype = XFS_ALLOC_USERDATA,
+ };
+ int error;
+
+ error = xfs_trans_reserve_more(sc->tp, maxlen, 0);
+ if (error)
+ return error;
+
+ error = xfs_alloc_vextent_start_ag(&args,
+ XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino));
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ xfs_refcount_alloc_cow_extent(sc->tp, args.fsbno, args.len);
+
+ repl->fsbno = args.fsbno;
+ repl->len = args.len;
+ return 0;
+}
+
+/*
+ * Look up the current CoW fork mapping so that we only allocate enough to
+ * replace a single mapping. If we don't find a mapping that covers the start
+ * of the file range, or we find a delalloc or written extent, something is
+ * seriously wrong, since we didn't drop the ILOCK.
+ */
+static inline int
+xrep_cow_find_mapping(
+ struct xrep_cow *xc,
+ struct xfs_iext_cursor *icur,
+ xfs_fileoff_t startoff,
+ struct xfs_bmbt_irec *got)
+{
+ struct xfs_inode *ip = xc->sc->ip;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
+
+ if (!xfs_iext_lookup_extent(ip, ifp, startoff, icur, got))
+ goto bad;
+
+ if (got->br_startoff > startoff)
+ goto bad;
+
+ if (got->br_blockcount == 0)
+ goto bad;
+
+ if (isnullstartblock(got->br_startblock))
+ goto bad;
+
+ if (xfs_bmap_is_written_extent(got))
+ goto bad;
+
+ return 0;
+bad:
+ ASSERT(0);
+ return -EFSCORRUPTED;
+}
+
+#define REPLACE_LEFT_SIDE (1U << 0)
+#define REPLACE_RIGHT_SIDE (1U << 1)
+
+/*
+ * Given a CoW fork mapping @got and a replacement mapping @repl, remap the
+ * beginning of @got with the space described by @rep.
+ */
+static inline void
+xrep_cow_replace_mapping(
+ struct xfs_inode *ip,
+ struct xfs_iext_cursor *icur,
+ const struct xfs_bmbt_irec *got,
+ const struct xrep_cow_extent *repl)
+{
+ struct xfs_bmbt_irec new = *got; /* struct copy */
+
+ ASSERT(repl->len > 0);
+ ASSERT(!isnullstartblock(got->br_startblock));
+
+ trace_xrep_cow_replace_mapping(ip, got, repl->fsbno, repl->len);
+
+ if (got->br_blockcount == repl->len) {
+ /*
+ * The new extent is a complete replacement for the existing
+ * extent. Update the COW fork record.
+ */
+ new.br_startblock = repl->fsbno;
+ xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
+ return;
+ }
+
+ /*
+ * The new extent can replace the beginning of the COW fork record.
+ * Move the left side of @got upwards, then insert the new record.
+ */
+ new.br_startoff += repl->len;
+ new.br_startblock += repl->len;
+ new.br_blockcount -= repl->len;
+ xfs_iext_update_extent(ip, BMAP_COWFORK, icur, &new);
+
+ new.br_startoff = got->br_startoff;
+ new.br_startblock = repl->fsbno;
+ new.br_blockcount = repl->len;
+ xfs_iext_insert(ip, icur, &new, BMAP_COWFORK);
+}
+
+/*
+ * Replace the unwritten CoW staging extent backing the given file range with a
+ * new space extent that isn't as problematic.
+ */
+STATIC int
+xrep_cow_replace_range(
+ struct xrep_cow *xc,
+ xfs_fileoff_t startoff,
+ xfs_extlen_t *blockcount)
+{
+ struct xfs_iext_cursor icur;
+ struct xrep_cow_extent repl;
+ struct xfs_bmbt_irec got;
+ struct xfs_scrub *sc = xc->sc;
+ xfs_fileoff_t nextoff;
+ xfs_extlen_t alloc_len;
+ int error;
+
+ /*
+ * Put the existing CoW fork mapping in @got. If @got ends before
+ * @rep, truncate @rep so we only replace one extent mapping at a time.
+ */
+ error = xrep_cow_find_mapping(xc, &icur, startoff, &got);
+ if (error)
+ return error;
+ nextoff = min(startoff + *blockcount,
+ got.br_startoff + got.br_blockcount);
+
+ /*
+ * Allocate a replacement extent. If we don't fill all the blocks,
+ * shorten the quantity that will be deleted in this step.
+ */
+ alloc_len = min_t(xfs_fileoff_t, XFS_MAX_BMBT_EXTLEN,
+ nextoff - startoff);
+ error = xrep_cow_alloc(sc, alloc_len, &repl);
+ if (error)
+ return error;
+
+ /*
+ * Replace the old mapping with the new one, and commit the metadata
+ * changes made so far.
+ */
+ xrep_cow_replace_mapping(sc->ip, &icur, &got, &repl);
+
+ xfs_inode_set_cowblocks_tag(sc->ip);
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ return error;
+
+ /* Note the old CoW staging extents; we'll reap them all later. */
+ error = xfsb_bitmap_set(&xc->old_cowfork_fsblocks, got.br_startblock,
+ repl.len);
+ if (error)
+ return error;
+
+ *blockcount = repl.len;
+ return 0;
+}
+
+/*
+ * Replace a bad part of an unwritten CoW staging extent with a fresh delalloc
+ * reservation.
+ */
+STATIC int
+xrep_cow_replace(
+ uint64_t startoff,
+ uint64_t blockcount,
+ void *priv)
+{
+ struct xrep_cow *xc = priv;
+ int error = 0;
+
+ while (blockcount > 0) {
+ xfs_extlen_t len = min_t(xfs_filblks_t, blockcount,
+ XFS_MAX_BMBT_EXTLEN);
+
+ error = xrep_cow_replace_range(xc, startoff, &len);
+ if (error)
+ break;
+
+ blockcount -= len;
+ startoff += len;
+ }
+
+ return error;
+}
+
+/*
+ * Repair an inode's CoW fork. The CoW fork is an in-core structure, so
+ * there's no btree to rebuid. Instead, we replace any mappings that are
+ * cross-linked or lack ondisk CoW fork records in the refcount btree.
+ */
+int
+xrep_bmap_cow(
+ struct xfs_scrub *sc)
+{
+ struct xrep_cow *xc;
+ struct xfs_iext_cursor icur;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(sc->ip, XFS_COW_FORK);
+ int error;
+
+ if (!xfs_has_rmapbt(sc->mp) || !xfs_has_reflink(sc->mp))
+ return -EOPNOTSUPP;
+
+ if (!ifp)
+ return 0;
+
+ /* realtime files aren't supported yet */
+ if (XFS_IS_REALTIME_INODE(sc->ip))
+ return -EOPNOTSUPP;
+
+ /*
+ * If we're somehow not in extents format, then reinitialize it to
+ * an empty extent mapping fork and exit.
+ */
+ if (ifp->if_format != XFS_DINODE_FMT_EXTENTS) {
+ ifp->if_format = XFS_DINODE_FMT_EXTENTS;
+ ifp->if_nextents = 0;
+ return 0;
+ }
+
+ xc = kzalloc(sizeof(struct xrep_cow), XCHK_GFP_FLAGS);
+ if (!xc)
+ return -ENOMEM;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ xc->sc = sc;
+ xoff_bitmap_init(&xc->bad_fileoffs);
+ xfsb_bitmap_init(&xc->old_cowfork_fsblocks);
+
+ for_each_xfs_iext(ifp, &icur, &xc->irec) {
+ if (xchk_should_terminate(sc, &error))
+ goto out_bitmap;
+
+ /*
+ * delalloc reservations only exist incore, so there is no
+ * ondisk metadata that we can examine. Hence we leave them
+ * alone.
+ */
+ if (isnullstartblock(xc->irec.br_startblock))
+ continue;
+
+ /*
+ * COW fork extents are only in the written state if writeback
+ * is actively writing to disk. We cannot restart the write
+ * at a different disk address since we've already issued the
+ * IO, so we leave these alone and hope for the best.
+ */
+ if (xfs_bmap_is_written_extent(&xc->irec))
+ continue;
+
+ error = xrep_cow_find_bad(xc);
+ if (error)
+ goto out_bitmap;
+ }
+
+ /* Replace any bad unwritten mappings with fresh reservations. */
+ error = xoff_bitmap_walk(&xc->bad_fileoffs, xrep_cow_replace, xc);
+ if (error)
+ goto out_bitmap;
+
+ /*
+ * Reap as many of the old CoW blocks as we can. They are owned ondisk
+ * by the refcount btree, not the inode, so it is correct to treat them
+ * like inode metadata.
+ */
+ error = xrep_reap_fsblocks(sc, &xc->old_cowfork_fsblocks,
+ &XFS_RMAP_OINFO_COW);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xfsb_bitmap_destroy(&xc->old_cowfork_fsblocks);
+ xoff_bitmap_destroy(&xc->bad_fileoffs);
+ kmem_free(xc);
+ return error;
+}
diff --git a/fs/xfs/scrub/dir.c b/fs/xfs/scrub/dir.c
index 0b491784b759..d86ab51af928 100644
--- a/fs/xfs/scrub/dir.c
+++ b/fs/xfs/scrub/dir.c
@@ -15,10 +15,12 @@
#include "xfs_icache.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
+#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/dabtree.h"
#include "scrub/readdir.h"
+#include "scrub/health.h"
/* Set us up to scrub directories. */
int
@@ -760,6 +762,11 @@ xchk_directory(
if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
return -ENOENT;
+ if (xchk_file_looks_zapped(sc, XFS_SICK_INO_DIR_ZAPPED)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
/* Plausible size? */
if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) {
xchk_ino_set_corrupt(sc, sc->ip->i_ino);
@@ -784,7 +791,36 @@ xchk_directory(
/* Look up every name in this directory by hash. */
error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, NULL);
- if (error == -ECANCELED)
- error = 0;
- return error;
+ if (error && error != -ECANCELED)
+ return error;
+
+ /* If the dir is clean, it is clearly not zapped. */
+ xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_DIR_ZAPPED);
+ return 0;
+}
+
+/*
+ * Decide if this directory has been zapped to satisfy the inode and ifork
+ * verifiers. Checking and repairing should be postponed until the directory
+ * is fixed.
+ */
+bool
+xchk_dir_looks_zapped(
+ struct xfs_inode *dp)
+{
+ /* Repair zapped this dir's data fork a short time ago */
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return true;
+
+ /*
+ * If the dinode repair found a bad data fork, it will reset the fork
+ * to extents format with zero records and wait for the bmapbtd
+ * scrubber to reconstruct the block mappings. Directories always
+ * contain some content, so this is a clear sign of a zapped directory.
+ * The state checked by xfs_ifork_zapped is not persisted, so this is
+ * the secondary strategy if repairs are interrupted by a crash or an
+ * unmount.
+ */
+ return dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS &&
+ dp->i_df.if_nextents == 0;
}
diff --git a/fs/xfs/scrub/dqiterate.c b/fs/xfs/scrub/dqiterate.c
new file mode 100644
index 000000000000..20c4daedd48d
--- /dev/null
+++ b/fs/xfs/scrub/dqiterate.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_bit.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_bmap.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/quota.h"
+#include "scrub/trace.h"
+
+/* Initialize a dquot iteration cursor. */
+void
+xchk_dqiter_init(
+ struct xchk_dqiter *cursor,
+ struct xfs_scrub *sc,
+ xfs_dqtype_t dqtype)
+{
+ cursor->sc = sc;
+ cursor->bmap.br_startoff = NULLFILEOFF;
+ cursor->dqtype = dqtype & XFS_DQTYPE_REC_MASK;
+ cursor->quota_ip = xfs_quota_inode(sc->mp, cursor->dqtype);
+ cursor->id = 0;
+}
+
+/*
+ * Ensure that the cached data fork mapping for the dqiter cursor is fresh and
+ * covers the dquot pointed to by the scan cursor.
+ */
+STATIC int
+xchk_dquot_iter_revalidate_bmap(
+ struct xchk_dqiter *cursor)
+{
+ struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip,
+ XFS_DATA_FORK);
+ xfs_fileoff_t fileoff;
+ xfs_dqid_t this_id = cursor->id;
+ int nmaps = 1;
+ int error;
+
+ fileoff = this_id / qi->qi_dqperchunk;
+
+ /*
+ * If we have a mapping for cursor->id and it's still fresh, there's
+ * no need to reread the bmbt.
+ */
+ if (cursor->bmap.br_startoff != NULLFILEOFF &&
+ cursor->if_seq == ifp->if_seq &&
+ cursor->bmap.br_startoff + cursor->bmap.br_blockcount > fileoff)
+ return 0;
+
+ /* Look up the data fork mapping for the dquot id of interest. */
+ error = xfs_bmapi_read(cursor->quota_ip, fileoff,
+ XFS_MAX_FILEOFF - fileoff, &cursor->bmap, &nmaps, 0);
+ if (error)
+ return error;
+ if (!nmaps) {
+ ASSERT(nmaps > 0);
+ return -EFSCORRUPTED;
+ }
+ if (cursor->bmap.br_startoff > fileoff) {
+ ASSERT(cursor->bmap.br_startoff == fileoff);
+ return -EFSCORRUPTED;
+ }
+
+ cursor->if_seq = ifp->if_seq;
+ trace_xchk_dquot_iter_revalidate_bmap(cursor, cursor->id);
+ return 0;
+}
+
+/* Advance the dqiter cursor to the next non-sparse region of the quota file. */
+STATIC int
+xchk_dquot_iter_advance_bmap(
+ struct xchk_dqiter *cursor,
+ uint64_t *next_ondisk_id)
+{
+ struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo;
+ struct xfs_ifork *ifp = xfs_ifork_ptr(cursor->quota_ip,
+ XFS_DATA_FORK);
+ xfs_fileoff_t fileoff;
+ uint64_t next_id;
+ int nmaps = 1;
+ int error;
+
+ /* Find the dquot id for the next non-hole mapping. */
+ do {
+ fileoff = cursor->bmap.br_startoff + cursor->bmap.br_blockcount;
+ if (fileoff > XFS_DQ_ID_MAX / qi->qi_dqperchunk) {
+ /* The hole goes beyond the max dquot id, we're done */
+ *next_ondisk_id = -1ULL;
+ return 0;
+ }
+
+ error = xfs_bmapi_read(cursor->quota_ip, fileoff,
+ XFS_MAX_FILEOFF - fileoff, &cursor->bmap,
+ &nmaps, 0);
+ if (error)
+ return error;
+ if (!nmaps) {
+ /* Must have reached the end of the mappings. */
+ *next_ondisk_id = -1ULL;
+ return 0;
+ }
+ if (cursor->bmap.br_startoff > fileoff) {
+ ASSERT(cursor->bmap.br_startoff == fileoff);
+ return -EFSCORRUPTED;
+ }
+ } while (!xfs_bmap_is_real_extent(&cursor->bmap));
+
+ next_id = cursor->bmap.br_startoff * qi->qi_dqperchunk;
+ if (next_id > XFS_DQ_ID_MAX) {
+ /* The hole goes beyond the max dquot id, we're done */
+ *next_ondisk_id = -1ULL;
+ return 0;
+ }
+
+ /* Propose jumping forward to the dquot in the next allocated block. */
+ *next_ondisk_id = next_id;
+ cursor->if_seq = ifp->if_seq;
+ trace_xchk_dquot_iter_advance_bmap(cursor, *next_ondisk_id);
+ return 0;
+}
+
+/*
+ * Find the id of the next highest incore dquot. Normally this will correspond
+ * exactly with the quota file block mappings, but repair might have erased a
+ * mapping because it was crosslinked; in that case, we need to re-allocate the
+ * space so that we can reset q_blkno.
+ */
+STATIC void
+xchk_dquot_iter_advance_incore(
+ struct xchk_dqiter *cursor,
+ uint64_t *next_incore_id)
+{
+ struct xfs_quotainfo *qi = cursor->sc->mp->m_quotainfo;
+ struct radix_tree_root *tree = xfs_dquot_tree(qi, cursor->dqtype);
+ struct xfs_dquot *dq;
+ unsigned int nr_found;
+
+ *next_incore_id = -1ULL;
+
+ mutex_lock(&qi->qi_tree_lock);
+ nr_found = radix_tree_gang_lookup(tree, (void **)&dq, cursor->id, 1);
+ if (nr_found)
+ *next_incore_id = dq->q_id;
+ mutex_unlock(&qi->qi_tree_lock);
+
+ trace_xchk_dquot_iter_advance_incore(cursor, *next_incore_id);
+}
+
+/*
+ * Walk all incore dquots of this filesystem. Caller must set *@cursorp to
+ * zero before the first call, and must not hold the quota file ILOCK.
+ * Returns 1 and a valid *@dqpp; 0 and *@dqpp == NULL when there are no more
+ * dquots to iterate; or a negative errno.
+ */
+int
+xchk_dquot_iter(
+ struct xchk_dqiter *cursor,
+ struct xfs_dquot **dqpp)
+{
+ struct xfs_mount *mp = cursor->sc->mp;
+ struct xfs_dquot *dq = NULL;
+ uint64_t next_ondisk, next_incore = -1ULL;
+ unsigned int lock_mode;
+ int error = 0;
+
+ if (cursor->id > XFS_DQ_ID_MAX)
+ return 0;
+ next_ondisk = cursor->id;
+
+ /* Revalidate and/or advance the cursor. */
+ lock_mode = xfs_ilock_data_map_shared(cursor->quota_ip);
+ error = xchk_dquot_iter_revalidate_bmap(cursor);
+ if (!error && !xfs_bmap_is_real_extent(&cursor->bmap))
+ error = xchk_dquot_iter_advance_bmap(cursor, &next_ondisk);
+ xfs_iunlock(cursor->quota_ip, lock_mode);
+ if (error)
+ return error;
+
+ if (next_ondisk > cursor->id)
+ xchk_dquot_iter_advance_incore(cursor, &next_incore);
+
+ /* Pick the next dquot in the sequence and return it. */
+ cursor->id = min(next_ondisk, next_incore);
+ if (cursor->id > XFS_DQ_ID_MAX)
+ return 0;
+
+ trace_xchk_dquot_iter(cursor, cursor->id);
+
+ error = xfs_qm_dqget(mp, cursor->id, cursor->dqtype, false, &dq);
+ if (error)
+ return error;
+
+ cursor->id = dq->q_id + 1;
+ *dqpp = dq;
+ return 1;
+}
diff --git a/fs/xfs/scrub/fsb_bitmap.h b/fs/xfs/scrub/fsb_bitmap.h
new file mode 100644
index 000000000000..40b462c1dd0d
--- /dev/null
+++ b/fs/xfs/scrub/fsb_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_FSB_BITMAP_H__
+#define __XFS_SCRUB_FSB_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_fsblock_t */
+
+struct xfsb_bitmap {
+ struct xbitmap64 fsbitmap;
+};
+
+static inline void xfsb_bitmap_init(struct xfsb_bitmap *bitmap)
+{
+ xbitmap64_init(&bitmap->fsbitmap);
+}
+
+static inline void xfsb_bitmap_destroy(struct xfsb_bitmap *bitmap)
+{
+ xbitmap64_destroy(&bitmap->fsbitmap);
+}
+
+static inline int xfsb_bitmap_set(struct xfsb_bitmap *bitmap,
+ xfs_fsblock_t start, xfs_filblks_t len)
+{
+ return xbitmap64_set(&bitmap->fsbitmap, start, len);
+}
+
+static inline int xfsb_bitmap_walk(struct xfsb_bitmap *bitmap,
+ xbitmap64_walk_fn fn, void *priv)
+{
+ return xbitmap64_walk(&bitmap->fsbitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_FSB_BITMAP_H__ */
diff --git a/fs/xfs/scrub/health.c b/fs/xfs/scrub/health.c
index 5e2b09ed6e29..531006910ca9 100644
--- a/fs/xfs/scrub/health.c
+++ b/fs/xfs/scrub/health.c
@@ -10,8 +10,6 @@
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
-#include "xfs_trans_resv.h"
-#include "xfs_mount.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "scrub/scrub.h"
@@ -118,6 +116,38 @@ xchk_health_mask_for_scrub_type(
}
/*
+ * If the scrub state is clean, add @mask to the scrub sick mask to clear
+ * additional sick flags from the metadata object's sick state.
+ */
+void
+xchk_mark_healthy_if_clean(
+ struct xfs_scrub *sc,
+ unsigned int mask)
+{
+ if (!(sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT |
+ XFS_SCRUB_OFLAG_XCORRUPT)))
+ sc->sick_mask |= mask;
+}
+
+/*
+ * If we're scrubbing a piece of file metadata for the first time, does it look
+ * like it has been zapped? Skip the check if we just repaired the metadata
+ * and are revalidating it.
+ */
+bool
+xchk_file_looks_zapped(
+ struct xfs_scrub *sc,
+ unsigned int mask)
+{
+ ASSERT((mask & ~XFS_SICK_INO_ZAPPED) == 0);
+
+ if (sc->flags & XREP_ALREADY_FIXED)
+ return false;
+
+ return xfs_inode_has_sickness(sc->ip, mask);
+}
+
+/*
* Update filesystem health assessments based on what we found and did.
*
* If the scrubber finds errors, we mark sick whatever's mentioned in
diff --git a/fs/xfs/scrub/health.h b/fs/xfs/scrub/health.h
index 66a273f8585b..a731b2467399 100644
--- a/fs/xfs/scrub/health.h
+++ b/fs/xfs/scrub/health.h
@@ -10,5 +10,7 @@ unsigned int xchk_health_mask_for_scrub_type(__u32 scrub_type);
void xchk_update_health(struct xfs_scrub *sc);
bool xchk_ag_btree_healthy_enough(struct xfs_scrub *sc, struct xfs_perag *pag,
xfs_btnum_t btnum);
+void xchk_mark_healthy_if_clean(struct xfs_scrub *sc, unsigned int mask);
+bool xchk_file_looks_zapped(struct xfs_scrub *sc, unsigned int mask);
#endif /* __XFS_SCRUB_HEALTH_H__ */
diff --git a/fs/xfs/scrub/ialloc.c b/fs/xfs/scrub/ialloc.c
index fb7bbf47ae5d..a720fc62262a 100644
--- a/fs/xfs/scrub/ialloc.c
+++ b/fs/xfs/scrub/ialloc.c
@@ -585,7 +585,7 @@ xchk_iallocbt_rec(
uint16_t holemask;
xfs_inobt_btrec_to_irec(mp, rec, &irec);
- if (xfs_inobt_check_irec(bs->cur, &irec) != NULL) {
+ if (xfs_inobt_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
@@ -708,11 +708,10 @@ xchk_iallocbt_xref_rmap_inodes(
xchk_btree_xref_set_corrupt(sc, sc->sa.rmap_cur, 0);
}
-/* Scrub the inode btrees for some AG. */
-STATIC int
+/* Scrub one of the inode btrees for some AG. */
+int
xchk_iallocbt(
- struct xfs_scrub *sc,
- xfs_btnum_t which)
+ struct xfs_scrub *sc)
{
struct xfs_btree_cur *cur;
struct xchk_iallocbt iabt = {
@@ -720,9 +719,23 @@ xchk_iallocbt(
.next_startino = NULLAGINO,
.next_cluster_ino = NULLAGINO,
};
+ xfs_btnum_t which;
int error;
- cur = which == XFS_BTNUM_INO ? sc->sa.ino_cur : sc->sa.fino_cur;
+ switch (sc->sm->sm_type) {
+ case XFS_SCRUB_TYPE_INOBT:
+ cur = sc->sa.ino_cur;
+ which = XFS_BTNUM_INO;
+ break;
+ case XFS_SCRUB_TYPE_FINOBT:
+ cur = sc->sa.fino_cur;
+ which = XFS_BTNUM_FINO;
+ break;
+ default:
+ ASSERT(0);
+ return -EIO;
+ }
+
error = xchk_btree(sc, cur, xchk_iallocbt_rec, &XFS_RMAP_OINFO_INOBT,
&iabt);
if (error)
@@ -743,20 +756,6 @@ xchk_iallocbt(
return error;
}
-int
-xchk_inobt(
- struct xfs_scrub *sc)
-{
- return xchk_iallocbt(sc, XFS_BTNUM_INO);
-}
-
-int
-xchk_finobt(
- struct xfs_scrub *sc)
-{
- return xchk_iallocbt(sc, XFS_BTNUM_FINO);
-}
-
/* See if an inode btree has (or doesn't have) an inode chunk record. */
static inline void
xchk_xref_inode_check(
diff --git a/fs/xfs/scrub/ialloc_repair.c b/fs/xfs/scrub/ialloc_repair.c
new file mode 100644
index 000000000000..b3f7182dd2f5
--- /dev/null
+++ b/fs/xfs/scrub/ialloc_repair.c
@@ -0,0 +1,884 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_icache.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_log.h"
+#include "xfs_trans_priv.h"
+#include "xfs_error.h"
+#include "xfs_health.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Inode Btree Repair
+ * ==================
+ *
+ * A quick refresher of inode btrees on a v5 filesystem:
+ *
+ * - Inode records are read into memory in units of 'inode clusters'. However
+ * many inodes fit in a cluster buffer is the smallest number of inodes that
+ * can be allocated or freed. Clusters are never smaller than one fs block
+ * though they can span multiple blocks. The size (in fs blocks) is
+ * computed with xfs_icluster_size_fsb(). The fs block alignment of a
+ * cluster is computed with xfs_ialloc_cluster_alignment().
+ *
+ * - Each inode btree record can describe a single 'inode chunk'. The chunk
+ * size is defined to be 64 inodes. If sparse inodes are enabled, every
+ * inobt record must be aligned to the chunk size; if not, every record must
+ * be aligned to the start of a cluster. It is possible to construct an XFS
+ * geometry where one inobt record maps to multiple inode clusters; it is
+ * also possible to construct a geometry where multiple inobt records map to
+ * different parts of one inode cluster.
+ *
+ * - If sparse inodes are not enabled, the smallest unit of allocation for
+ * inode records is enough to contain one inode chunk's worth of inodes.
+ *
+ * - If sparse inodes are enabled, the holemask field will be active. Each
+ * bit of the holemask represents 4 potential inodes; if set, the
+ * corresponding space does *not* contain inodes and must be left alone.
+ * Clusters cannot be smaller than 4 inodes. The smallest unit of allocation
+ * of inode records is one inode cluster.
+ *
+ * So what's the rebuild algorithm?
+ *
+ * Iterate the reverse mapping records looking for OWN_INODES and OWN_INOBT
+ * records. The OWN_INOBT records are the old inode btree blocks and will be
+ * cleared out after we've rebuilt the tree. Each possible inode cluster
+ * within an OWN_INODES record will be read in; for each possible inobt record
+ * associated with that cluster, compute the freemask calculated from the
+ * i_mode data in the inode chunk. For sparse inodes the holemask will be
+ * calculated by creating the properly aligned inobt record and punching out
+ * any chunk that's missing. Inode allocations and frees grab the AGI first,
+ * so repair protects itself from concurrent access by locking the AGI.
+ *
+ * Once we've reconstructed all the inode records, we can create new inode
+ * btree roots and reload the btrees. We rebuild both inode trees at the same
+ * time because they have the same rmap owner and it would be more complex to
+ * figure out if the other tree isn't in need of a rebuild and which OWN_INOBT
+ * blocks it owns. We have all the data we need to build both, so dump
+ * everything and start over.
+ *
+ * We use the prefix 'xrep_ibt' because we rebuild both inode btrees at once.
+ */
+
+struct xrep_ibt {
+ /* Record under construction. */
+ struct xfs_inobt_rec_incore rie;
+
+ /* new inobt information */
+ struct xrep_newbt new_inobt;
+
+ /* new finobt information */
+ struct xrep_newbt new_finobt;
+
+ /* Old inode btree blocks we found in the rmap. */
+ struct xagb_bitmap old_iallocbt_blocks;
+
+ /* Reconstructed inode records. */
+ struct xfarray *inode_records;
+
+ struct xfs_scrub *sc;
+
+ /* Number of inodes assigned disk space. */
+ unsigned int icount;
+
+ /* Number of inodes in use. */
+ unsigned int iused;
+
+ /* Number of finobt records needed. */
+ unsigned int finobt_recs;
+
+ /* get_records()'s position in the inode record array. */
+ xfarray_idx_t array_cur;
+};
+
+/*
+ * Is this inode in use? If the inode is in memory we can tell from i_mode,
+ * otherwise we have to check di_mode in the on-disk buffer. We only care
+ * that the high (i.e. non-permission) bits of _mode are zero. This should be
+ * safe because repair keeps all AG headers locked until the end, and process
+ * trying to perform an inode allocation/free must lock the AGI.
+ *
+ * @cluster_ag_base is the inode offset of the cluster within the AG.
+ * @cluster_bp is the cluster buffer.
+ * @cluster_index is the inode offset within the inode cluster.
+ */
+STATIC int
+xrep_ibt_check_ifree(
+ struct xrep_ibt *ri,
+ xfs_agino_t cluster_ag_base,
+ struct xfs_buf *cluster_bp,
+ unsigned int cluster_index,
+ bool *inuse)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_dinode *dip;
+ xfs_ino_t fsino;
+ xfs_agino_t agino;
+ xfs_agnumber_t agno = ri->sc->sa.pag->pag_agno;
+ unsigned int cluster_buf_base;
+ unsigned int offset;
+ int error;
+
+ agino = cluster_ag_base + cluster_index;
+ fsino = XFS_AGINO_TO_INO(mp, agno, agino);
+
+ /* Inode uncached or half assembled, read disk buffer */
+ cluster_buf_base = XFS_INO_TO_OFFSET(mp, cluster_ag_base);
+ offset = (cluster_buf_base + cluster_index) * mp->m_sb.sb_inodesize;
+ if (offset >= BBTOB(cluster_bp->b_length))
+ return -EFSCORRUPTED;
+ dip = xfs_buf_offset(cluster_bp, offset);
+ if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)
+ return -EFSCORRUPTED;
+
+ if (dip->di_version >= 3 && be64_to_cpu(dip->di_ino) != fsino)
+ return -EFSCORRUPTED;
+
+ /* Will the in-core inode tell us if it's in use? */
+ error = xchk_inode_is_allocated(sc, agino, inuse);
+ if (!error)
+ return 0;
+
+ *inuse = dip->di_mode != 0;
+ return 0;
+}
+
+/* Stash the accumulated inobt record for rebuilding. */
+STATIC int
+xrep_ibt_stash(
+ struct xrep_ibt *ri)
+{
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ ri->rie.ir_freecount = xfs_inobt_rec_freecount(&ri->rie);
+ if (xfs_inobt_check_irec(ri->sc->sa.pag, &ri->rie) != NULL)
+ return -EFSCORRUPTED;
+
+ if (ri->rie.ir_freecount > 0)
+ ri->finobt_recs++;
+
+ trace_xrep_ibt_found(ri->sc->mp, ri->sc->sa.pag->pag_agno, &ri->rie);
+
+ error = xfarray_append(ri->inode_records, &ri->rie);
+ if (error)
+ return error;
+
+ ri->rie.ir_startino = NULLAGINO;
+ return 0;
+}
+
+/*
+ * Given an extent of inodes and an inode cluster buffer, calculate the
+ * location of the corresponding inobt record (creating it if necessary),
+ * then update the parts of the holemask and freemask of that record that
+ * correspond to the inode extent we were given.
+ *
+ * @cluster_ir_startino is the AG inode number of an inobt record that we're
+ * proposing to create for this inode cluster. If sparse inodes are enabled,
+ * we must round down to a chunk boundary to find the actual sparse record.
+ * @cluster_bp is the buffer of the inode cluster.
+ * @nr_inodes is the number of inodes to check from the cluster.
+ */
+STATIC int
+xrep_ibt_cluster_record(
+ struct xrep_ibt *ri,
+ xfs_agino_t cluster_ir_startino,
+ struct xfs_buf *cluster_bp,
+ unsigned int nr_inodes)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agino_t ir_startino;
+ unsigned int cluster_base;
+ unsigned int cluster_index;
+ int error = 0;
+
+ ir_startino = cluster_ir_startino;
+ if (xfs_has_sparseinodes(mp))
+ ir_startino = rounddown(ir_startino, XFS_INODES_PER_CHUNK);
+ cluster_base = cluster_ir_startino - ir_startino;
+
+ /*
+ * If the accumulated inobt record doesn't map this cluster, add it to
+ * the list and reset it.
+ */
+ if (ri->rie.ir_startino != NULLAGINO &&
+ ri->rie.ir_startino + XFS_INODES_PER_CHUNK <= ir_startino) {
+ error = xrep_ibt_stash(ri);
+ if (error)
+ return error;
+ }
+
+ if (ri->rie.ir_startino == NULLAGINO) {
+ ri->rie.ir_startino = ir_startino;
+ ri->rie.ir_free = XFS_INOBT_ALL_FREE;
+ ri->rie.ir_holemask = 0xFFFF;
+ ri->rie.ir_count = 0;
+ }
+
+ /* Record the whole cluster. */
+ ri->icount += nr_inodes;
+ ri->rie.ir_count += nr_inodes;
+ ri->rie.ir_holemask &= ~xfs_inobt_maskn(
+ cluster_base / XFS_INODES_PER_HOLEMASK_BIT,
+ nr_inodes / XFS_INODES_PER_HOLEMASK_BIT);
+
+ /* Which inodes within this cluster are free? */
+ for (cluster_index = 0; cluster_index < nr_inodes; cluster_index++) {
+ bool inuse = false;
+
+ error = xrep_ibt_check_ifree(ri, cluster_ir_startino,
+ cluster_bp, cluster_index, &inuse);
+ if (error)
+ return error;
+ if (!inuse)
+ continue;
+ ri->iused++;
+ ri->rie.ir_free &= ~XFS_INOBT_MASK(cluster_base +
+ cluster_index);
+ }
+ return 0;
+}
+
+/*
+ * For each inode cluster covering the physical extent recorded by the rmapbt,
+ * we must calculate the properly aligned startino of that cluster, then
+ * iterate each cluster to fill in used and filled masks appropriately. We
+ * then use the (startino, used, filled) information to construct the
+ * appropriate inode records.
+ */
+STATIC int
+xrep_ibt_process_cluster(
+ struct xrep_ibt *ri,
+ xfs_agblock_t cluster_bno)
+{
+ struct xfs_imap imap;
+ struct xfs_buf *cluster_bp;
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agino_t cluster_ag_base;
+ xfs_agino_t irec_index;
+ unsigned int nr_inodes;
+ int error;
+
+ nr_inodes = min_t(unsigned int, igeo->inodes_per_cluster,
+ XFS_INODES_PER_CHUNK);
+
+ /*
+ * Grab the inode cluster buffer. This is safe to do with a broken
+ * inobt because imap_to_bp directly maps the buffer without touching
+ * either inode btree.
+ */
+ imap.im_blkno = XFS_AGB_TO_DADDR(mp, sc->sa.pag->pag_agno, cluster_bno);
+ imap.im_len = XFS_FSB_TO_BB(mp, igeo->blocks_per_cluster);
+ imap.im_boffset = 0;
+ error = xfs_imap_to_bp(mp, sc->tp, &imap, &cluster_bp);
+ if (error)
+ return error;
+
+ /*
+ * Record the contents of each possible inobt record mapping this
+ * cluster.
+ */
+ cluster_ag_base = XFS_AGB_TO_AGINO(mp, cluster_bno);
+ for (irec_index = 0;
+ irec_index < igeo->inodes_per_cluster;
+ irec_index += XFS_INODES_PER_CHUNK) {
+ error = xrep_ibt_cluster_record(ri,
+ cluster_ag_base + irec_index, cluster_bp,
+ nr_inodes);
+ if (error)
+ break;
+
+ }
+
+ xfs_trans_brelse(sc->tp, cluster_bp);
+ return error;
+}
+
+/* Check for any obvious conflicts in the inode chunk extent. */
+STATIC int
+xrep_ibt_check_inode_ext(
+ struct xfs_scrub *sc,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agino_t agino;
+ enum xbtree_recpacking outcome;
+ int error;
+
+ /* Inode records must be within the AG. */
+ if (!xfs_verify_agbext(sc->sa.pag, agbno, len))
+ return -EFSCORRUPTED;
+
+ /* The entire record must align to the inode cluster size. */
+ if (!IS_ALIGNED(agbno, igeo->blocks_per_cluster) ||
+ !IS_ALIGNED(agbno + len, igeo->blocks_per_cluster))
+ return -EFSCORRUPTED;
+
+ /*
+ * The entire record must also adhere to the inode cluster alignment
+ * size if sparse inodes are not enabled.
+ */
+ if (!xfs_has_sparseinodes(mp) &&
+ (!IS_ALIGNED(agbno, igeo->cluster_align) ||
+ !IS_ALIGNED(agbno + len, igeo->cluster_align)))
+ return -EFSCORRUPTED;
+
+ /*
+ * On a sparse inode fs, this cluster could be part of a sparse chunk.
+ * Sparse clusters must be aligned to sparse chunk alignment.
+ */
+ if (xfs_has_sparseinodes(mp) &&
+ (!IS_ALIGNED(agbno, mp->m_sb.sb_spino_align) ||
+ !IS_ALIGNED(agbno + len, mp->m_sb.sb_spino_align)))
+ return -EFSCORRUPTED;
+
+ /* Make sure the entire range of blocks are valid AG inodes. */
+ agino = XFS_AGB_TO_AGINO(mp, agbno);
+ if (!xfs_verify_agino(sc->sa.pag, agino))
+ return -EFSCORRUPTED;
+
+ agino = XFS_AGB_TO_AGINO(mp, agbno + len) - 1;
+ if (!xfs_verify_agino(sc->sa.pag, agino))
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, agbno, len, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Found a fragment of the old inode btrees; dispose of them later. */
+STATIC int
+xrep_ibt_record_old_btree_blocks(
+ struct xrep_ibt *ri,
+ const struct xfs_rmap_irec *rec)
+{
+ if (!xfs_verify_agbext(ri->sc->sa.pag, rec->rm_startblock,
+ rec->rm_blockcount))
+ return -EFSCORRUPTED;
+
+ return xagb_bitmap_set(&ri->old_iallocbt_blocks, rec->rm_startblock,
+ rec->rm_blockcount);
+}
+
+/* Record extents that belong to inode cluster blocks. */
+STATIC int
+xrep_ibt_record_inode_blocks(
+ struct xrep_ibt *ri,
+ const struct xfs_rmap_irec *rec)
+{
+ struct xfs_mount *mp = ri->sc->mp;
+ struct xfs_ino_geometry *igeo = M_IGEO(mp);
+ xfs_agblock_t cluster_base;
+ int error;
+
+ error = xrep_ibt_check_inode_ext(ri->sc, rec->rm_startblock,
+ rec->rm_blockcount);
+ if (error)
+ return error;
+
+ trace_xrep_ibt_walk_rmap(mp, ri->sc->sa.pag->pag_agno,
+ rec->rm_startblock, rec->rm_blockcount, rec->rm_owner,
+ rec->rm_offset, rec->rm_flags);
+
+ /*
+ * Record the free/hole masks for each inode cluster that could be
+ * mapped by this rmap record.
+ */
+ for (cluster_base = 0;
+ cluster_base < rec->rm_blockcount;
+ cluster_base += igeo->blocks_per_cluster) {
+ error = xrep_ibt_process_cluster(ri,
+ rec->rm_startblock + cluster_base);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+STATIC int
+xrep_ibt_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_ibt *ri = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ switch (rec->rm_owner) {
+ case XFS_RMAP_OWN_INOBT:
+ return xrep_ibt_record_old_btree_blocks(ri, rec);
+ case XFS_RMAP_OWN_INODES:
+ return xrep_ibt_record_inode_blocks(ri, rec);
+ }
+ return 0;
+}
+
+/*
+ * Iterate all reverse mappings to find the inodes (OWN_INODES) and the inode
+ * btrees (OWN_INOBT). Figure out if we have enough free space to reconstruct
+ * the inode btrees. The caller must clean up the lists if anything goes
+ * wrong.
+ */
+STATIC int
+xrep_ibt_find_inodes(
+ struct xrep_ibt *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ int error;
+
+ ri->rie.ir_startino = NULLAGINO;
+
+ /* Collect all reverse mappings for inode blocks. */
+ xrep_ag_btcur_init(sc, &sc->sa);
+ error = xfs_rmap_query_all(sc->sa.rmap_cur, xrep_ibt_walk_rmap, ri);
+ xchk_ag_btcur_free(&sc->sa);
+ if (error)
+ return error;
+
+ /* If we have a record ready to go, add it to the array. */
+ if (ri->rie.ir_startino != NULLAGINO)
+ return xrep_ibt_stash(ri);
+
+ return 0;
+}
+
+/* Update the AGI counters. */
+STATIC int
+xrep_ibt_reset_counters(
+ struct xrep_ibt *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_agi *agi = sc->sa.agi_bp->b_addr;
+ unsigned int freecount = ri->icount - ri->iused;
+
+ /* Trigger inode count recalculation */
+ xfs_force_summary_recalc(sc->mp);
+
+ /*
+ * The AGI header contains extra information related to the inode
+ * btrees, so we must update those fields here.
+ */
+ agi->agi_count = cpu_to_be32(ri->icount);
+ agi->agi_freecount = cpu_to_be32(freecount);
+ xfs_ialloc_log_agi(sc->tp, sc->sa.agi_bp,
+ XFS_AGI_COUNT | XFS_AGI_FREECOUNT);
+
+ /* Reinitialize with the values we just logged. */
+ return xrep_reinit_pagi(sc);
+}
+
+/* Retrieve finobt data for bulk load. */
+STATIC int
+xrep_fibt_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i;
+ struct xrep_ibt *ri = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ do {
+ error = xfarray_load(ri->inode_records,
+ ri->array_cur++, irec);
+ } while (error == 0 && xfs_inobt_rec_freecount(irec) == 0);
+ if (error)
+ return error;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Retrieve inobt data for bulk load. */
+STATIC int
+xrep_ibt_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_inobt_rec_incore *irec = &cur->bc_rec.i;
+ struct xrep_ibt *ri = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ error = xfarray_load(ri->inode_records, ri->array_cur++, irec);
+ if (error)
+ return error;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new inobt blocks to the bulk loader. */
+STATIC int
+xrep_ibt_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_ibt *ri = priv;
+
+ return xrep_newbt_claim_block(cur, &ri->new_inobt, ptr);
+}
+
+/* Feed one of the new finobt blocks to the bulk loader. */
+STATIC int
+xrep_fibt_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_ibt *ri = priv;
+
+ return xrep_newbt_claim_block(cur, &ri->new_finobt, ptr);
+}
+
+/* Make sure the records do not overlap in inumber address space. */
+STATIC int
+xrep_ibt_check_overlap(
+ struct xrep_ibt *ri)
+{
+ struct xfs_inobt_rec_incore irec;
+ xfarray_idx_t cur;
+ xfs_agino_t next_agino = 0;
+ int error = 0;
+
+ foreach_xfarray_idx(ri->inode_records, cur) {
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ error = xfarray_load(ri->inode_records, cur, &irec);
+ if (error)
+ return error;
+
+ if (irec.ir_startino < next_agino)
+ return -EFSCORRUPTED;
+
+ next_agino = irec.ir_startino + XFS_INODES_PER_CHUNK;
+ }
+
+ return error;
+}
+
+/* Build new inode btrees and dispose of the old one. */
+STATIC int
+xrep_ibt_build_new_trees(
+ struct xrep_ibt *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_btree_cur *ino_cur;
+ struct xfs_btree_cur *fino_cur = NULL;
+ xfs_fsblock_t fsbno;
+ bool need_finobt;
+ int error;
+
+ need_finobt = xfs_has_finobt(sc->mp);
+
+ /*
+ * Create new btrees for staging all the inobt records we collected
+ * earlier. The records were collected in order of increasing agino,
+ * so we do not have to sort them. Ensure there are no overlapping
+ * records.
+ */
+ error = xrep_ibt_check_overlap(ri);
+ if (error)
+ return error;
+
+ /*
+ * The new inode btrees will not be rooted in the AGI until we've
+ * successfully rebuilt the tree.
+ *
+ * Start by setting up the inobt staging cursor.
+ */
+ fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
+ XFS_IBT_BLOCK(sc->mp)),
+ xrep_newbt_init_ag(&ri->new_inobt, sc, &XFS_RMAP_OINFO_INOBT, fsbno,
+ XFS_AG_RESV_NONE);
+ ri->new_inobt.bload.claim_block = xrep_ibt_claim_block;
+ ri->new_inobt.bload.get_records = xrep_ibt_get_records;
+
+ ino_cur = xfs_inobt_stage_cursor(sc->sa.pag, &ri->new_inobt.afake,
+ XFS_BTNUM_INO);
+ error = xfs_btree_bload_compute_geometry(ino_cur, &ri->new_inobt.bload,
+ xfarray_length(ri->inode_records));
+ if (error)
+ goto err_inocur;
+
+ /* Set up finobt staging cursor. */
+ if (need_finobt) {
+ enum xfs_ag_resv_type resv = XFS_AG_RESV_METADATA;
+
+ if (sc->mp->m_finobt_nores)
+ resv = XFS_AG_RESV_NONE;
+
+ fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
+ XFS_FIBT_BLOCK(sc->mp)),
+ xrep_newbt_init_ag(&ri->new_finobt, sc, &XFS_RMAP_OINFO_INOBT,
+ fsbno, resv);
+ ri->new_finobt.bload.claim_block = xrep_fibt_claim_block;
+ ri->new_finobt.bload.get_records = xrep_fibt_get_records;
+
+ fino_cur = xfs_inobt_stage_cursor(sc->sa.pag,
+ &ri->new_finobt.afake, XFS_BTNUM_FINO);
+ error = xfs_btree_bload_compute_geometry(fino_cur,
+ &ri->new_finobt.bload, ri->finobt_recs);
+ if (error)
+ goto err_finocur;
+ }
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_finocur;
+
+ /* Reserve all the space we need to build the new btrees. */
+ error = xrep_newbt_alloc_blocks(&ri->new_inobt,
+ ri->new_inobt.bload.nr_blocks);
+ if (error)
+ goto err_finocur;
+
+ if (need_finobt) {
+ error = xrep_newbt_alloc_blocks(&ri->new_finobt,
+ ri->new_finobt.bload.nr_blocks);
+ if (error)
+ goto err_finocur;
+ }
+
+ /* Add all inobt records. */
+ ri->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(ino_cur, &ri->new_inobt.bload, ri);
+ if (error)
+ goto err_finocur;
+
+ /* Add all finobt records. */
+ if (need_finobt) {
+ ri->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(fino_cur, &ri->new_finobt.bload, ri);
+ if (error)
+ goto err_finocur;
+ }
+
+ /*
+ * Install the new btrees in the AG header. After this point the old
+ * btrees are no longer accessible and the new trees are live.
+ */
+ xfs_inobt_commit_staged_btree(ino_cur, sc->tp, sc->sa.agi_bp);
+ xfs_btree_del_cursor(ino_cur, 0);
+
+ if (fino_cur) {
+ xfs_inobt_commit_staged_btree(fino_cur, sc->tp, sc->sa.agi_bp);
+ xfs_btree_del_cursor(fino_cur, 0);
+ }
+
+ /* Reset the AGI counters now that we've changed the inode roots. */
+ error = xrep_ibt_reset_counters(ri);
+ if (error)
+ goto err_finobt;
+
+ /* Free unused blocks and bitmap. */
+ if (need_finobt) {
+ error = xrep_newbt_commit(&ri->new_finobt);
+ if (error)
+ goto err_inobt;
+ }
+ error = xrep_newbt_commit(&ri->new_inobt);
+ if (error)
+ return error;
+
+ return xrep_roll_ag_trans(sc);
+
+err_finocur:
+ if (need_finobt)
+ xfs_btree_del_cursor(fino_cur, error);
+err_inocur:
+ xfs_btree_del_cursor(ino_cur, error);
+err_finobt:
+ if (need_finobt)
+ xrep_newbt_cancel(&ri->new_finobt);
+err_inobt:
+ xrep_newbt_cancel(&ri->new_inobt);
+ return error;
+}
+
+/*
+ * Now that we've logged the roots of the new btrees, invalidate all of the
+ * old blocks and free them.
+ */
+STATIC int
+xrep_ibt_remove_old_trees(
+ struct xrep_ibt *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ int error;
+
+ /*
+ * Free the old inode btree blocks if they're not in use. It's ok to
+ * reap with XFS_AG_RESV_NONE even if the finobt had a per-AG
+ * reservation because we reset the reservation before releasing the
+ * AGI and AGF header buffer locks.
+ */
+ error = xrep_reap_agblocks(sc, &ri->old_iallocbt_blocks,
+ &XFS_RMAP_OINFO_INOBT, XFS_AG_RESV_NONE);
+ if (error)
+ return error;
+
+ /*
+ * If the finobt is enabled and has a per-AG reservation, make sure we
+ * reinitialize the per-AG reservations.
+ */
+ if (xfs_has_finobt(sc->mp) && !sc->mp->m_finobt_nores)
+ sc->flags |= XREP_RESET_PERAG_RESV;
+
+ return 0;
+}
+
+/* Repair both inode btrees. */
+int
+xrep_iallocbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_ibt *ri;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ xfs_agino_t first_agino, last_agino;
+ int error = 0;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+
+ ri = kzalloc(sizeof(struct xrep_ibt), XCHK_GFP_FLAGS);
+ if (!ri)
+ return -ENOMEM;
+ ri->sc = sc;
+
+ /* We rebuild both inode btrees. */
+ sc->sick_mask = XFS_SICK_AG_INOBT | XFS_SICK_AG_FINOBT;
+
+ /* Set up enough storage to handle an AG with nothing but inodes. */
+ xfs_agino_range(mp, sc->sa.pag->pag_agno, &first_agino, &last_agino);
+ last_agino /= XFS_INODES_PER_CHUNK;
+ descr = xchk_xfile_ag_descr(sc, "inode index records");
+ error = xfarray_create(descr, last_agino,
+ sizeof(struct xfs_inobt_rec_incore),
+ &ri->inode_records);
+ kfree(descr);
+ if (error)
+ goto out_ri;
+
+ /* Collect the inode data and find the old btree blocks. */
+ xagb_bitmap_init(&ri->old_iallocbt_blocks);
+ error = xrep_ibt_find_inodes(ri);
+ if (error)
+ goto out_bitmap;
+
+ /* Rebuild the inode indexes. */
+ error = xrep_ibt_build_new_trees(ri);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old tree. */
+ error = xrep_ibt_remove_old_trees(ri);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xagb_bitmap_destroy(&ri->old_iallocbt_blocks);
+ xfarray_destroy(ri->inode_records);
+out_ri:
+ kfree(ri);
+ return error;
+}
+
+/* Make sure both btrees are ok after we've rebuilt them. */
+int
+xrep_revalidate_iallocbt(
+ struct xfs_scrub *sc)
+{
+ __u32 old_type = sc->sm->sm_type;
+ int error;
+
+ /*
+ * We must update sm_type temporarily so that the tree-to-tree cross
+ * reference checks will work in the correct direction, and also so
+ * that tracing will report correctly if there are more errors.
+ */
+ sc->sm->sm_type = XFS_SCRUB_TYPE_INOBT;
+ error = xchk_iallocbt(sc);
+ if (error)
+ goto out;
+
+ if (xfs_has_finobt(sc->mp)) {
+ sc->sm->sm_type = XFS_SCRUB_TYPE_FINOBT;
+ error = xchk_iallocbt(sc);
+ }
+
+out:
+ sc->sm->sm_type = old_type;
+ return error;
+}
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 889f556bc98f..6e2fe2d6250b 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -25,6 +25,7 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/trace.h"
+#include "scrub/repair.h"
/* Prepare the attached inode for scrubbing. */
static inline int
@@ -39,6 +40,10 @@ xchk_prepare_iscrub(
if (error)
return error;
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ return error;
+
xchk_ilock(sc, XFS_ILOCK_EXCL);
return 0;
}
@@ -95,8 +100,8 @@ xchk_setup_inode(
if (!xfs_verify_ino(sc->mp, sc->sm->sm_ino))
return -ENOENT;
- /* Try a regular untrusted iget. */
- error = xchk_iget(sc, sc->sm->sm_ino, &ip);
+ /* Try a safe untrusted iget. */
+ error = xchk_iget_safe(sc, sc->sm->sm_ino, &ip);
if (!error)
return xchk_install_handle_iscrub(sc, ip);
if (error == -ENOENT)
@@ -181,8 +186,11 @@ xchk_setup_inode(
* saying the inode is allocated and the icache being unable to load
* the inode until we can flag the corruption in xchk_inode. The
* scrub function has to note the corruption, since we're not really
- * supposed to do that from the setup function.
+ * supposed to do that from the setup function. Save the mapping to
+ * make repairs to the ondisk inode buffer.
*/
+ if (xchk_could_repair(sc))
+ xrep_setup_inode(sc, &imap);
return 0;
out_cancel:
@@ -338,6 +346,10 @@ xchk_inode_flags2(
if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp))
goto bad;
+ /* no large extent counts without the filesystem feature */
+ if ((flags2 & XFS_DIFLAG2_NREXT64) && !xfs_has_large_extent_counts(mp))
+ goto bad;
+
return;
bad:
xchk_ino_set_corrupt(sc, ino);
@@ -548,7 +560,7 @@ xchk_dinode(
}
/* di_forkoff */
- if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize)
+ if (XFS_DFORK_BOFF(dip) >= mp->m_sb.sb_inodesize)
xchk_ino_set_corrupt(sc, ino);
if (naextents != 0 && dip->di_forkoff == 0)
xchk_ino_set_corrupt(sc, ino);
diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c
new file mode 100644
index 000000000000..0ca62d59f84a
--- /dev/null
+++ b/fs/xfs/scrub/inode_repair.c
@@ -0,0 +1,1525 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_icache.h"
+#include "xfs_inode_buf.h"
+#include "xfs_inode_fork.h"
+#include "xfs_ialloc.h"
+#include "xfs_da_format.h"
+#include "xfs_reflink.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_bmap_util.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_quota_defs.h"
+#include "xfs_quota.h"
+#include "xfs_ag.h"
+#include "xfs_rtbitmap.h"
+#include "xfs_attr_leaf.h"
+#include "xfs_log_priv.h"
+#include "xfs_health.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/*
+ * Inode Record Repair
+ * ===================
+ *
+ * Roughly speaking, inode problems can be classified based on whether or not
+ * they trip the dinode verifiers. If those trip, then we won't be able to
+ * xfs_iget ourselves the inode.
+ *
+ * Therefore, the xrep_dinode_* functions fix anything that will cause the
+ * inode buffer verifier or the dinode verifier. The xrep_inode_* functions
+ * fix things on live incore inodes. The inode repair functions make decisions
+ * with security and usability implications when reviving a file:
+ *
+ * - Files with zero di_mode or a garbage di_mode are converted to regular file
+ * that only root can read. This file may not actually contain user data,
+ * if the file was not previously a regular file. Setuid and setgid bits
+ * are cleared.
+ *
+ * - Zero-size directories can be truncated to look empty. It is necessary to
+ * run the bmapbtd and directory repair functions to fully rebuild the
+ * directory.
+ *
+ * - Zero-size symbolic link targets can be truncated to '?'. It is necessary
+ * to run the bmapbtd and symlink repair functions to salvage the symlink.
+ *
+ * - Invalid extent size hints will be removed.
+ *
+ * - Quotacheck will be scheduled if we repaired an inode that was so badly
+ * damaged that the ondisk inode had to be rebuilt.
+ *
+ * - Invalid user, group, or project IDs (aka -1U) will be reset to zero.
+ * Setuid and setgid bits are cleared.
+ *
+ * - Data and attr forks are reset to extents format with zero extents if the
+ * fork data is inconsistent. It is necessary to run the bmapbtd or bmapbta
+ * repair functions to recover the space mapping.
+ *
+ * - ACLs will not be recovered if the attr fork is zapped or the extended
+ * attribute structure itself requires salvaging.
+ *
+ * - If the attr fork is zapped, the user and group ids are reset to root and
+ * the setuid and setgid bits are removed.
+ */
+
+/*
+ * All the information we need to repair the ondisk inode if we can't iget the
+ * incore inode. We don't allocate this buffer unless we're going to perform
+ * a repair to the ondisk inode cluster buffer.
+ */
+struct xrep_inode {
+ /* Inode mapping that we saved from the initial lookup attempt. */
+ struct xfs_imap imap;
+
+ struct xfs_scrub *sc;
+
+ /* Blocks in use on the data device by data extents or bmbt blocks. */
+ xfs_rfsblock_t data_blocks;
+
+ /* Blocks in use on the rt device. */
+ xfs_rfsblock_t rt_blocks;
+
+ /* Blocks in use by the attr fork. */
+ xfs_rfsblock_t attr_blocks;
+
+ /* Number of data device extents for the data fork. */
+ xfs_extnum_t data_extents;
+
+ /*
+ * Number of realtime device extents for the data fork. If
+ * data_extents and rt_extents indicate that the data fork has extents
+ * on both devices, we'll just back away slowly.
+ */
+ xfs_extnum_t rt_extents;
+
+ /* Number of (data device) extents for the attr fork. */
+ xfs_aextnum_t attr_extents;
+
+ /* Sick state to set after zapping parts of the inode. */
+ unsigned int ino_sick_mask;
+
+ /* Must we remove all access from this file? */
+ bool zap_acls;
+};
+
+/*
+ * Setup function for inode repair. @imap contains the ondisk inode mapping
+ * information so that we can correct the ondisk inode cluster buffer if
+ * necessary to make iget work.
+ */
+int
+xrep_setup_inode(
+ struct xfs_scrub *sc,
+ const struct xfs_imap *imap)
+{
+ struct xrep_inode *ri;
+
+ sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS);
+ if (!sc->buf)
+ return -ENOMEM;
+
+ ri = sc->buf;
+ memcpy(&ri->imap, imap, sizeof(struct xfs_imap));
+ ri->sc = sc;
+ return 0;
+}
+
+/*
+ * Make sure this ondisk inode can pass the inode buffer verifier. This is
+ * not the same as the dinode verifier.
+ */
+STATIC void
+xrep_dinode_buf_core(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp,
+ unsigned int ioffset)
+{
+ struct xfs_dinode *dip = xfs_buf_offset(bp, ioffset);
+ struct xfs_trans *tp = sc->tp;
+ struct xfs_mount *mp = sc->mp;
+ xfs_agino_t agino;
+ bool crc_ok = false;
+ bool magic_ok = false;
+ bool unlinked_ok = false;
+
+ agino = be32_to_cpu(dip->di_next_unlinked);
+
+ if (xfs_verify_agino_or_null(bp->b_pag, agino))
+ unlinked_ok = true;
+
+ if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+ xfs_dinode_good_version(mp, dip->di_version))
+ magic_ok = true;
+
+ if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
+ XFS_DINODE_CRC_OFF))
+ crc_ok = true;
+
+ if (magic_ok && unlinked_ok && crc_ok)
+ return;
+
+ if (!magic_ok) {
+ dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ dip->di_version = 3;
+ }
+ if (!unlinked_ok)
+ dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
+ xfs_dinode_calc_crc(mp, dip);
+ xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
+ xfs_trans_log_buf(tp, bp, ioffset,
+ ioffset + sizeof(struct xfs_dinode) - 1);
+}
+
+/* Make sure this inode cluster buffer can pass the inode buffer verifier. */
+STATIC void
+xrep_dinode_buf(
+ struct xfs_scrub *sc,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = sc->mp;
+ int i;
+ int ni;
+
+ ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+ for (i = 0; i < ni; i++)
+ xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog);
+}
+
+/* Reinitialize things that never change in an inode. */
+STATIC void
+xrep_dinode_header(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip)
+{
+ trace_xrep_dinode_header(sc, dip);
+
+ dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ if (!xfs_dinode_good_version(sc->mp, dip->di_version))
+ dip->di_version = 3;
+ dip->di_ino = cpu_to_be64(sc->sm->sm_ino);
+ uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid);
+ dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
+}
+
+/* Turn di_mode into /something/ recognizable. */
+STATIC void
+xrep_dinode_mode(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ uint16_t mode = be16_to_cpu(dip->di_mode);
+
+ trace_xrep_dinode_mode(sc, dip);
+
+ if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
+ return;
+
+ /* bad mode, so we set it to a file that only root can read */
+ mode = S_IFREG;
+ dip->di_mode = cpu_to_be16(mode);
+ dip->di_uid = 0;
+ dip->di_gid = 0;
+ ri->zap_acls = true;
+}
+
+/* Fix any conflicting flags that the verifiers complain about. */
+STATIC void
+xrep_dinode_flags(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ bool isrt)
+{
+ struct xfs_mount *mp = sc->mp;
+ uint64_t flags2 = be64_to_cpu(dip->di_flags2);
+ uint16_t flags = be16_to_cpu(dip->di_flags);
+ uint16_t mode = be16_to_cpu(dip->di_mode);
+
+ trace_xrep_dinode_flags(sc, dip);
+
+ if (isrt)
+ flags |= XFS_DIFLAG_REALTIME;
+ else
+ flags &= ~XFS_DIFLAG_REALTIME;
+
+ /*
+ * For regular files on a reflink filesystem, set the REFLINK flag to
+ * protect shared extents. A later stage will actually check those
+ * extents and clear the flag if possible.
+ */
+ if (xfs_has_reflink(mp) && S_ISREG(mode))
+ flags2 |= XFS_DIFLAG2_REFLINK;
+ else
+ flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
+ if (flags & XFS_DIFLAG_REALTIME)
+ flags2 &= ~XFS_DIFLAG2_REFLINK;
+ if (!xfs_has_bigtime(mp))
+ flags2 &= ~XFS_DIFLAG2_BIGTIME;
+ if (!xfs_has_large_extent_counts(mp))
+ flags2 &= ~XFS_DIFLAG2_NREXT64;
+ if (flags2 & XFS_DIFLAG2_NREXT64)
+ dip->di_nrext64_pad = 0;
+ else if (dip->di_version >= 3)
+ dip->di_v3_pad = 0;
+ dip->di_flags = cpu_to_be16(flags);
+ dip->di_flags2 = cpu_to_be64(flags2);
+}
+
+/*
+ * Blow out symlink; now it points nowhere. We don't have to worry about
+ * incore state because this inode is failing the verifiers.
+ */
+STATIC void
+xrep_dinode_zap_symlink(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ char *p;
+
+ trace_xrep_dinode_zap_symlink(sc, dip);
+
+ dip->di_format = XFS_DINODE_FMT_LOCAL;
+ dip->di_size = cpu_to_be64(1);
+ p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ *p = '?';
+ ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED;
+}
+
+/*
+ * Blow out dir, make the parent point to the root. In the future repair will
+ * reconstruct this directory for us. Note that there's no in-core directory
+ * inode because the sf verifier tripped, so we don't have to worry about the
+ * dentry cache.
+ */
+STATIC void
+xrep_dinode_zap_dir(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_dir2_sf_hdr *sfp;
+ int i8count;
+
+ trace_xrep_dinode_zap_dir(sc, dip);
+
+ dip->di_format = XFS_DINODE_FMT_LOCAL;
+ i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
+ sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ sfp->count = 0;
+ sfp->i8count = i8count;
+ xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino);
+ dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
+ ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED;
+}
+
+/* Make sure we don't have a garbage file size. */
+STATIC void
+xrep_dinode_size(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ uint64_t size = be64_to_cpu(dip->di_size);
+ uint16_t mode = be16_to_cpu(dip->di_mode);
+
+ trace_xrep_dinode_size(sc, dip);
+
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ /* di_size can't be nonzero for special files */
+ dip->di_size = 0;
+ break;
+ case S_IFREG:
+ /* Regular files can't be larger than 2^63-1 bytes. */
+ dip->di_size = cpu_to_be64(size & ~(1ULL << 63));
+ break;
+ case S_IFLNK:
+ /*
+ * Truncate ridiculously oversized symlinks. If the size is
+ * zero, reset it to point to the current directory. Both of
+ * these conditions trigger dinode verifier errors, so there
+ * is no in-core state to reset.
+ */
+ if (size > XFS_SYMLINK_MAXLEN)
+ dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
+ else if (size == 0)
+ xrep_dinode_zap_symlink(ri, dip);
+ break;
+ case S_IFDIR:
+ /*
+ * Directories can't have a size larger than 32G. If the size
+ * is zero, reset it to an empty directory. Both of these
+ * conditions trigger dinode verifier errors, so there is no
+ * in-core state to reset.
+ */
+ if (size > XFS_DIR2_SPACE_SIZE)
+ dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
+ else if (size == 0)
+ xrep_dinode_zap_dir(ri, dip);
+ break;
+ }
+}
+
+/* Fix extent size hints. */
+STATIC void
+xrep_dinode_extsize_hints(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip)
+{
+ struct xfs_mount *mp = sc->mp;
+ uint64_t flags2 = be64_to_cpu(dip->di_flags2);
+ uint16_t flags = be16_to_cpu(dip->di_flags);
+ uint16_t mode = be16_to_cpu(dip->di_mode);
+
+ xfs_failaddr_t fa;
+
+ trace_xrep_dinode_extsize_hints(sc, dip);
+
+ fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
+ mode, flags);
+ if (fa) {
+ dip->di_extsize = 0;
+ dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE |
+ XFS_DIFLAG_EXTSZINHERIT);
+ }
+
+ if (dip->di_version < 3)
+ return;
+
+ fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
+ mode, flags, flags2);
+ if (fa) {
+ dip->di_cowextsize = 0;
+ dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
+ }
+}
+
+/* Count extents and blocks for an inode given an rmap. */
+STATIC int
+xrep_dinode_walk_rmap(
+ struct xfs_btree_cur *cur,
+ const struct xfs_rmap_irec *rec,
+ void *priv)
+{
+ struct xrep_inode *ri = priv;
+ int error = 0;
+
+ if (xchk_should_terminate(ri->sc, &error))
+ return error;
+
+ /* We only care about this inode. */
+ if (rec->rm_owner != ri->sc->sm->sm_ino)
+ return 0;
+
+ if (rec->rm_flags & XFS_RMAP_ATTR_FORK) {
+ ri->attr_blocks += rec->rm_blockcount;
+ if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ ri->attr_extents++;
+
+ return 0;
+ }
+
+ ri->data_blocks += rec->rm_blockcount;
+ if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
+ ri->data_extents++;
+
+ return 0;
+}
+
+/* Count extents and blocks for an inode from all AG rmap data. */
+STATIC int
+xrep_dinode_count_ag_rmaps(
+ struct xrep_inode *ri,
+ struct xfs_perag *pag)
+{
+ struct xfs_btree_cur *cur;
+ struct xfs_buf *agf;
+ int error;
+
+ error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf);
+ if (error)
+ return error;
+
+ cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag);
+ error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri);
+ xfs_btree_del_cursor(cur, error);
+ xfs_trans_brelse(ri->sc->tp, agf);
+ return error;
+}
+
+/* Count extents and blocks for a given inode from all rmap data. */
+STATIC int
+xrep_dinode_count_rmaps(
+ struct xrep_inode *ri)
+{
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+ int error;
+
+ if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp))
+ return -EOPNOTSUPP;
+
+ for_each_perag(ri->sc->mp, agno, pag) {
+ error = xrep_dinode_count_ag_rmaps(ri, pag);
+ if (error) {
+ xfs_perag_rele(pag);
+ return error;
+ }
+ }
+
+ /* Can't have extents on both the rt and the data device. */
+ if (ri->data_extents && ri->rt_extents)
+ return -EFSCORRUPTED;
+
+ trace_xrep_dinode_count_rmaps(ri->sc,
+ ri->data_blocks, ri->rt_blocks, ri->attr_blocks,
+ ri->data_extents, ri->rt_extents, ri->attr_extents);
+ return 0;
+}
+
+/* Return true if this extents-format ifork looks like garbage. */
+STATIC bool
+xrep_dinode_bad_extents_fork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ unsigned int dfork_size,
+ int whichfork)
+{
+ struct xfs_bmbt_irec new;
+ struct xfs_bmbt_rec *dp;
+ xfs_extnum_t nex;
+ bool isrt;
+ unsigned int i;
+
+ nex = xfs_dfork_nextents(dip, whichfork);
+ if (nex > dfork_size / sizeof(struct xfs_bmbt_rec))
+ return true;
+
+ dp = XFS_DFORK_PTR(dip, whichfork);
+
+ isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME);
+ for (i = 0; i < nex; i++, dp++) {
+ xfs_failaddr_t fa;
+
+ xfs_bmbt_disk_get_all(dp, &new);
+ fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork,
+ &new);
+ if (fa)
+ return true;
+ }
+
+ return false;
+}
+
+/* Return true if this btree-format ifork looks like garbage. */
+STATIC bool
+xrep_dinode_bad_bmbt_fork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ unsigned int dfork_size,
+ int whichfork)
+{
+ struct xfs_bmdr_block *dfp;
+ xfs_extnum_t nex;
+ unsigned int i;
+ unsigned int dmxr;
+ unsigned int nrecs;
+ unsigned int level;
+
+ nex = xfs_dfork_nextents(dip, whichfork);
+ if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec))
+ return true;
+
+ if (dfork_size < sizeof(struct xfs_bmdr_block))
+ return true;
+
+ dfp = XFS_DFORK_PTR(dip, whichfork);
+ nrecs = be16_to_cpu(dfp->bb_numrecs);
+ level = be16_to_cpu(dfp->bb_level);
+
+ if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size)
+ return true;
+ if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork))
+ return true;
+
+ dmxr = xfs_bmdr_maxrecs(dfork_size, 0);
+ for (i = 1; i <= nrecs; i++) {
+ struct xfs_bmbt_key *fkp;
+ xfs_bmbt_ptr_t *fpp;
+ xfs_fileoff_t fileoff;
+ xfs_fsblock_t fsbno;
+
+ fkp = XFS_BMDR_KEY_ADDR(dfp, i);
+ fileoff = be64_to_cpu(fkp->br_startoff);
+ if (!xfs_verify_fileoff(sc->mp, fileoff))
+ return true;
+
+ fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr);
+ fsbno = be64_to_cpu(*fpp);
+ if (!xfs_verify_fsbno(sc->mp, fsbno))
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Check the data fork for things that will fail the ifork verifiers or the
+ * ifork formatters.
+ */
+STATIC bool
+xrep_dinode_check_dfork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip,
+ uint16_t mode)
+{
+ void *dfork_ptr;
+ int64_t data_size;
+ unsigned int fmt;
+ unsigned int dfork_size;
+
+ /*
+ * Verifier functions take signed int64_t, so check for bogus negative
+ * values first.
+ */
+ data_size = be64_to_cpu(dip->di_size);
+ if (data_size < 0)
+ return true;
+
+ fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK);
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ if (fmt != XFS_DINODE_FMT_DEV)
+ return true;
+ break;
+ case S_IFREG:
+ if (fmt == XFS_DINODE_FMT_LOCAL)
+ return true;
+ fallthrough;
+ case S_IFLNK:
+ case S_IFDIR:
+ switch (fmt) {
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ break;
+ default:
+ return true;
+ }
+ break;
+ default:
+ return true;
+ }
+
+ dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK);
+ dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+
+ switch (fmt) {
+ case XFS_DINODE_FMT_DEV:
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ /* dir/symlink structure cannot be larger than the fork */
+ if (data_size > dfork_size)
+ return true;
+ /* directory structure must pass verification. */
+ if (S_ISDIR(mode) &&
+ xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL)
+ return true;
+ /* symlink structure must pass verification. */
+ if (S_ISLNK(mode) &&
+ xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL)
+ return true;
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size,
+ XFS_DATA_FORK))
+ return true;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size,
+ XFS_DATA_FORK))
+ return true;
+ break;
+ default:
+ return true;
+ }
+
+ return false;
+}
+
+static void
+xrep_dinode_set_data_nextents(
+ struct xfs_dinode *dip,
+ xfs_extnum_t nextents)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ dip->di_big_nextents = cpu_to_be64(nextents);
+ else
+ dip->di_nextents = cpu_to_be32(nextents);
+}
+
+static void
+xrep_dinode_set_attr_nextents(
+ struct xfs_dinode *dip,
+ xfs_extnum_t nextents)
+{
+ if (xfs_dinode_has_large_extent_counts(dip))
+ dip->di_big_anextents = cpu_to_be32(nextents);
+ else
+ dip->di_anextents = cpu_to_be16(nextents);
+}
+
+/* Reset the data fork to something sane. */
+STATIC void
+xrep_dinode_zap_dfork(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip,
+ uint16_t mode)
+{
+ struct xfs_scrub *sc = ri->sc;
+
+ trace_xrep_dinode_zap_dfork(sc, dip);
+
+ ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED;
+
+ xrep_dinode_set_data_nextents(dip, 0);
+ ri->data_blocks = 0;
+ ri->rt_blocks = 0;
+
+ /* Special files always get reset to DEV */
+ switch (mode & S_IFMT) {
+ case S_IFIFO:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFSOCK:
+ dip->di_format = XFS_DINODE_FMT_DEV;
+ dip->di_size = 0;
+ return;
+ }
+
+ /*
+ * If we have data extents, reset to an empty map and hope the user
+ * will run the bmapbtd checker next.
+ */
+ if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) {
+ dip->di_format = XFS_DINODE_FMT_EXTENTS;
+ return;
+ }
+
+ /* Otherwise, reset the local format to the minimum. */
+ switch (mode & S_IFMT) {
+ case S_IFLNK:
+ xrep_dinode_zap_symlink(ri, dip);
+ break;
+ case S_IFDIR:
+ xrep_dinode_zap_dir(ri, dip);
+ break;
+ }
+}
+
+/*
+ * Check the attr fork for things that will fail the ifork verifiers or the
+ * ifork formatters.
+ */
+STATIC bool
+xrep_dinode_check_afork(
+ struct xfs_scrub *sc,
+ struct xfs_dinode *dip)
+{
+ struct xfs_attr_sf_hdr *afork_ptr;
+ size_t attr_size;
+ unsigned int afork_size;
+
+ if (XFS_DFORK_BOFF(dip) == 0)
+ return dip->di_aformat != XFS_DINODE_FMT_EXTENTS ||
+ xfs_dfork_attr_extents(dip) != 0;
+
+ afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
+ afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
+
+ switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) {
+ case XFS_DINODE_FMT_LOCAL:
+ /* Fork has to be large enough to extract the xattr size. */
+ if (afork_size < sizeof(struct xfs_attr_sf_hdr))
+ return true;
+
+ /* xattr structure cannot be larger than the fork */
+ attr_size = be16_to_cpu(afork_ptr->totsize);
+ if (attr_size > afork_size)
+ return true;
+
+ /* xattr structure must pass verification. */
+ return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL;
+ case XFS_DINODE_FMT_EXTENTS:
+ if (xrep_dinode_bad_extents_fork(sc, dip, afork_size,
+ XFS_ATTR_FORK))
+ return true;
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size,
+ XFS_ATTR_FORK))
+ return true;
+ break;
+ default:
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Reset the attr fork to empty. Since the attr fork could have contained
+ * ACLs, make the file readable only by root.
+ */
+STATIC void
+xrep_dinode_zap_afork(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip,
+ uint16_t mode)
+{
+ struct xfs_scrub *sc = ri->sc;
+
+ trace_xrep_dinode_zap_afork(sc, dip);
+
+ ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED;
+
+ dip->di_aformat = XFS_DINODE_FMT_EXTENTS;
+ xrep_dinode_set_attr_nextents(dip, 0);
+ ri->attr_blocks = 0;
+
+ /*
+ * If the data fork is in btree format, removing the attr fork entirely
+ * might cause verifier failures if the next level down in the bmbt
+ * could now fit in the data fork area.
+ */
+ if (dip->di_format != XFS_DINODE_FMT_BTREE)
+ dip->di_forkoff = 0;
+ dip->di_mode = cpu_to_be16(mode & ~0777);
+ dip->di_uid = 0;
+ dip->di_gid = 0;
+}
+
+/* Make sure the fork offset is a sensible value. */
+STATIC void
+xrep_dinode_ensure_forkoff(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip,
+ uint16_t mode)
+{
+ struct xfs_bmdr_block *bmdr;
+ struct xfs_scrub *sc = ri->sc;
+ xfs_extnum_t attr_extents, data_extents;
+ size_t bmdr_minsz = XFS_BMDR_SPACE_CALC(1);
+ unsigned int lit_sz = XFS_LITINO(sc->mp);
+ unsigned int afork_min, dfork_min;
+
+ trace_xrep_dinode_ensure_forkoff(sc, dip);
+
+ /*
+ * Before calling this function, xrep_dinode_core ensured that both
+ * forks actually fit inside their respective literal areas. If this
+ * was not the case, the fork was reset to FMT_EXTENTS with zero
+ * records. If the rmapbt scan found attr or data fork blocks, this
+ * will be noted in the dinode_stats, and we must leave enough room
+ * for the bmap repair code to reconstruct the mapping structure.
+ *
+ * First, compute the minimum space required for the attr fork.
+ */
+ switch (dip->di_aformat) {
+ case XFS_DINODE_FMT_LOCAL:
+ /*
+ * If we still have a shortform xattr structure at all, that
+ * means the attr fork area was exactly large enough to fit
+ * the sf structure.
+ */
+ afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ attr_extents = xfs_dfork_attr_extents(dip);
+ if (attr_extents) {
+ /*
+ * We must maintain sufficient space to hold the entire
+ * extent map array in the data fork. Note that we
+ * previously zapped the fork if it had no chance of
+ * fitting in the inode.
+ */
+ afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents;
+ } else if (ri->attr_extents > 0) {
+ /*
+ * The attr fork thinks it has zero extents, but we
+ * found some xattr extents. We need to leave enough
+ * empty space here so that the incore attr fork will
+ * get created (and hence trigger the attr fork bmap
+ * repairer).
+ */
+ afork_min = bmdr_minsz;
+ } else {
+ /* No extents on disk or found in rmapbt. */
+ afork_min = 0;
+ }
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ /* Must have space for btree header and key/pointers. */
+ bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
+ afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+ break;
+ default:
+ /* We should never see any other formats. */
+ afork_min = 0;
+ break;
+ }
+
+ /* Compute the minimum space required for the data fork. */
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ dfork_min = sizeof(__be32);
+ break;
+ case XFS_DINODE_FMT_UUID:
+ dfork_min = sizeof(uuid_t);
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ /*
+ * If we still have a shortform data fork at all, that means
+ * the data fork area was large enough to fit whatever was in
+ * there.
+ */
+ dfork_min = be64_to_cpu(dip->di_size);
+ break;
+ case XFS_DINODE_FMT_EXTENTS:
+ data_extents = xfs_dfork_data_extents(dip);
+ if (data_extents) {
+ /*
+ * We must maintain sufficient space to hold the entire
+ * extent map array in the data fork. Note that we
+ * previously zapped the fork if it had no chance of
+ * fitting in the inode.
+ */
+ dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents;
+ } else if (ri->data_extents > 0 || ri->rt_extents > 0) {
+ /*
+ * The data fork thinks it has zero extents, but we
+ * found some data extents. We need to leave enough
+ * empty space here so that the data fork bmap repair
+ * will recover the mappings.
+ */
+ dfork_min = bmdr_minsz;
+ } else {
+ /* No extents on disk or found in rmapbt. */
+ dfork_min = 0;
+ }
+ break;
+ case XFS_DINODE_FMT_BTREE:
+ /* Must have space for btree header and key/pointers. */
+ bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
+ dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
+ break;
+ default:
+ dfork_min = 0;
+ break;
+ }
+
+ /*
+ * Round all values up to the nearest 8 bytes, because that is the
+ * precision of di_forkoff.
+ */
+ afork_min = roundup(afork_min, 8);
+ dfork_min = roundup(dfork_min, 8);
+ bmdr_minsz = roundup(bmdr_minsz, 8);
+
+ ASSERT(dfork_min <= lit_sz);
+ ASSERT(afork_min <= lit_sz);
+
+ /*
+ * If the data fork was zapped and we don't have enough space for the
+ * recovery fork, move the attr fork up.
+ */
+ if (dip->di_format == XFS_DINODE_FMT_EXTENTS &&
+ xfs_dfork_data_extents(dip) == 0 &&
+ (ri->data_extents > 0 || ri->rt_extents > 0) &&
+ bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) {
+ if (bmdr_minsz + afork_min > lit_sz) {
+ /*
+ * The attr for and the stub fork we need to recover
+ * the data fork won't both fit. Zap the attr fork.
+ */
+ xrep_dinode_zap_afork(ri, dip, mode);
+ afork_min = bmdr_minsz;
+ } else {
+ void *before, *after;
+
+ /* Otherwise, just slide the attr fork up. */
+ before = XFS_DFORK_APTR(dip);
+ dip->di_forkoff = bmdr_minsz >> 3;
+ after = XFS_DFORK_APTR(dip);
+ memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp));
+ }
+ }
+
+ /*
+ * If the attr fork was zapped and we don't have enough space for the
+ * recovery fork, move the attr fork down.
+ */
+ if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS &&
+ xfs_dfork_attr_extents(dip) == 0 &&
+ ri->attr_extents > 0 &&
+ bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) {
+ if (dip->di_format == XFS_DINODE_FMT_BTREE) {
+ /*
+ * If the data fork is in btree format then we can't
+ * adjust forkoff because that runs the risk of
+ * violating the extents/btree format transition rules.
+ */
+ } else if (bmdr_minsz + dfork_min > lit_sz) {
+ /*
+ * If we can't move the attr fork, too bad, we lose the
+ * attr fork and leak its blocks.
+ */
+ xrep_dinode_zap_afork(ri, dip, mode);
+ } else {
+ /*
+ * Otherwise, just slide the attr fork down. The attr
+ * fork is empty, so we don't have any old contents to
+ * move here.
+ */
+ dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3;
+ }
+ }
+}
+
+/*
+ * Zap the data/attr forks if we spot anything that isn't going to pass the
+ * ifork verifiers or the ifork formatters, because we need to get the inode
+ * into good enough shape that the higher level repair functions can run.
+ */
+STATIC void
+xrep_dinode_zap_forks(
+ struct xrep_inode *ri,
+ struct xfs_dinode *dip)
+{
+ struct xfs_scrub *sc = ri->sc;
+ xfs_extnum_t data_extents;
+ xfs_extnum_t attr_extents;
+ xfs_filblks_t nblocks;
+ uint16_t mode;
+ bool zap_datafork = false;
+ bool zap_attrfork = ri->zap_acls;
+
+ trace_xrep_dinode_zap_forks(sc, dip);
+
+ mode = be16_to_cpu(dip->di_mode);
+
+ data_extents = xfs_dfork_data_extents(dip);
+ attr_extents = xfs_dfork_attr_extents(dip);
+ nblocks = be64_to_cpu(dip->di_nblocks);
+
+ /* Inode counters don't make sense? */
+ if (data_extents > nblocks)
+ zap_datafork = true;
+ if (attr_extents > nblocks)
+ zap_attrfork = true;
+ if (data_extents + attr_extents > nblocks)
+ zap_datafork = zap_attrfork = true;
+
+ if (!zap_datafork)
+ zap_datafork = xrep_dinode_check_dfork(sc, dip, mode);
+ if (!zap_attrfork)
+ zap_attrfork = xrep_dinode_check_afork(sc, dip);
+
+ /* Zap whatever's bad. */
+ if (zap_attrfork)
+ xrep_dinode_zap_afork(ri, dip, mode);
+ if (zap_datafork)
+ xrep_dinode_zap_dfork(ri, dip, mode);
+ xrep_dinode_ensure_forkoff(ri, dip, mode);
+
+ /*
+ * Zero di_nblocks if we don't have any extents at all to satisfy the
+ * buffer verifier.
+ */
+ data_extents = xfs_dfork_data_extents(dip);
+ attr_extents = xfs_dfork_attr_extents(dip);
+ if (data_extents + attr_extents == 0)
+ dip->di_nblocks = 0;
+}
+
+/* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */
+STATIC int
+xrep_dinode_core(
+ struct xrep_inode *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ struct xfs_buf *bp;
+ struct xfs_dinode *dip;
+ xfs_ino_t ino = sc->sm->sm_ino;
+ int error;
+ int iget_error;
+
+ /* Figure out what this inode had mapped in both forks. */
+ error = xrep_dinode_count_rmaps(ri);
+ if (error)
+ return error;
+
+ /* Read the inode cluster buffer. */
+ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
+ ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
+ NULL);
+ if (error)
+ return error;
+
+ /* Make sure we can pass the inode buffer verifier. */
+ xrep_dinode_buf(sc, bp);
+ bp->b_ops = &xfs_inode_buf_ops;
+
+ /* Fix everything the verifier will complain about. */
+ dip = xfs_buf_offset(bp, ri->imap.im_boffset);
+ xrep_dinode_header(sc, dip);
+ xrep_dinode_mode(ri, dip);
+ xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
+ xrep_dinode_size(ri, dip);
+ xrep_dinode_extsize_hints(sc, dip);
+ xrep_dinode_zap_forks(ri, dip);
+
+ /* Write out the inode. */
+ trace_xrep_dinode_fixed(sc, dip);
+ xfs_dinode_calc_crc(sc->mp, dip);
+ xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
+ xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset,
+ ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1);
+
+ /*
+ * In theory, we've fixed the ondisk inode record enough that we should
+ * be able to load the inode into the cache. Try to iget that inode
+ * now while we hold the AGI and the inode cluster buffer and take the
+ * IOLOCK so that we can continue with repairs without anyone else
+ * accessing the inode. If iget fails, we still need to commit the
+ * changes.
+ */
+ iget_error = xchk_iget(sc, ino, &sc->ip);
+ if (!iget_error)
+ xchk_ilock(sc, XFS_IOLOCK_EXCL);
+
+ /*
+ * Commit the inode cluster buffer updates and drop the AGI buffer that
+ * we've been holding since scrub setup. From here on out, repairs
+ * deal only with the cached inode.
+ */
+ error = xrep_trans_commit(sc);
+ if (error)
+ return error;
+
+ if (iget_error)
+ return iget_error;
+
+ error = xchk_trans_alloc(sc, 0);
+ if (error)
+ return error;
+
+ error = xrep_ino_dqattach(sc);
+ if (error)
+ return error;
+
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ if (ri->ino_sick_mask)
+ xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask);
+ return 0;
+}
+
+/* Fix everything xfs_dinode_verify cares about. */
+STATIC int
+xrep_dinode_problems(
+ struct xrep_inode *ri)
+{
+ struct xfs_scrub *sc = ri->sc;
+ int error;
+
+ error = xrep_dinode_core(ri);
+ if (error)
+ return error;
+
+ /* We had to fix a totally busted inode, schedule quotacheck. */
+ if (XFS_IS_UQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
+ if (XFS_IS_GQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
+ if (XFS_IS_PQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
+
+ return 0;
+}
+
+/*
+ * Fix problems that the verifiers don't care about. In general these are
+ * errors that don't cause problems elsewhere in the kernel that we can easily
+ * detect, so we don't check them all that rigorously.
+ */
+
+/* Make sure block and extent counts are ok. */
+STATIC int
+xrep_inode_blockcounts(
+ struct xfs_scrub *sc)
+{
+ struct xfs_ifork *ifp;
+ xfs_filblks_t count;
+ xfs_filblks_t acount;
+ xfs_extnum_t nextents;
+ int error;
+
+ trace_xrep_inode_blockcounts(sc);
+
+ /* Set data fork counters from the data fork mappings. */
+ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
+ &nextents, &count);
+ if (error)
+ return error;
+ if (xfs_is_reflink_inode(sc->ip)) {
+ /*
+ * data fork blockcount can exceed physical storage if a user
+ * reflinks the same block over and over again.
+ */
+ ;
+ } else if (XFS_IS_REALTIME_INODE(sc->ip)) {
+ if (count >= sc->mp->m_sb.sb_rblocks)
+ return -EFSCORRUPTED;
+ } else {
+ if (count >= sc->mp->m_sb.sb_dblocks)
+ return -EFSCORRUPTED;
+ }
+ error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents);
+ if (error)
+ return error;
+ sc->ip->i_df.if_nextents = nextents;
+
+ /* Set attr fork counters from the attr fork mappings. */
+ ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
+ if (ifp) {
+ error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
+ &nextents, &acount);
+ if (error)
+ return error;
+ if (count >= sc->mp->m_sb.sb_dblocks)
+ return -EFSCORRUPTED;
+ error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK,
+ nextents);
+ if (error)
+ return error;
+ ifp->if_nextents = nextents;
+ } else {
+ acount = 0;
+ }
+
+ sc->ip->i_nblocks = count + acount;
+ return 0;
+}
+
+/* Check for invalid uid/gid/prid. */
+STATIC void
+xrep_inode_ids(
+ struct xfs_scrub *sc)
+{
+ bool dirty = false;
+
+ trace_xrep_inode_ids(sc);
+
+ if (!uid_valid(VFS_I(sc->ip)->i_uid)) {
+ i_uid_write(VFS_I(sc->ip), 0);
+ dirty = true;
+ if (XFS_IS_UQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
+ }
+
+ if (!gid_valid(VFS_I(sc->ip)->i_gid)) {
+ i_gid_write(VFS_I(sc->ip), 0);
+ dirty = true;
+ if (XFS_IS_GQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
+ }
+
+ if (sc->ip->i_projid == -1U) {
+ sc->ip->i_projid = 0;
+ dirty = true;
+ if (XFS_IS_PQUOTA_ON(sc->mp))
+ xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
+ }
+
+ /* strip setuid/setgid if we touched any of the ids */
+ if (dirty)
+ VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
+}
+
+static inline void
+xrep_clamp_timestamp(
+ struct xfs_inode *ip,
+ struct timespec64 *ts)
+{
+ ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC);
+ *ts = timestamp_truncate(*ts, VFS_I(ip));
+}
+
+/* Nanosecond counters can't have more than 1 billion. */
+STATIC void
+xrep_inode_timestamps(
+ struct xfs_inode *ip)
+{
+ struct timespec64 tstamp;
+ struct inode *inode = VFS_I(ip);
+
+ tstamp = inode_get_atime(inode);
+ xrep_clamp_timestamp(ip, &tstamp);
+ inode_set_atime_to_ts(inode, tstamp);
+
+ tstamp = inode_get_mtime(inode);
+ xrep_clamp_timestamp(ip, &tstamp);
+ inode_set_mtime_to_ts(inode, tstamp);
+
+ tstamp = inode_get_ctime(inode);
+ xrep_clamp_timestamp(ip, &tstamp);
+ inode_set_ctime_to_ts(inode, tstamp);
+
+ xrep_clamp_timestamp(ip, &ip->i_crtime);
+}
+
+/* Fix inode flags that don't make sense together. */
+STATIC void
+xrep_inode_flags(
+ struct xfs_scrub *sc)
+{
+ uint16_t mode;
+
+ trace_xrep_inode_flags(sc);
+
+ mode = VFS_I(sc->ip)->i_mode;
+
+ /* Clear junk flags */
+ if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY)
+ sc->ip->i_diflags &= ~XFS_DIFLAG_ANY;
+
+ /* NEWRTBM only applies to realtime bitmaps */
+ if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino)
+ sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM;
+ else
+ sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM;
+
+ /* These only make sense for directories. */
+ if (!S_ISDIR(mode))
+ sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT |
+ XFS_DIFLAG_EXTSZINHERIT |
+ XFS_DIFLAG_PROJINHERIT |
+ XFS_DIFLAG_NOSYMLINKS);
+
+ /* These only make sense for files. */
+ if (!S_ISREG(mode))
+ sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME |
+ XFS_DIFLAG_EXTSIZE);
+
+ /* These only make sense for non-rt files. */
+ if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
+ sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM;
+
+ /* Immutable and append only? Drop the append. */
+ if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) &&
+ (sc->ip->i_diflags & XFS_DIFLAG_APPEND))
+ sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND;
+
+ /* Clear junk flags. */
+ if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY)
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY;
+
+ /* No reflink flag unless we support it and it's a file. */
+ if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode))
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+
+ /* DAX only applies to files and dirs. */
+ if (!(S_ISREG(mode) || S_ISDIR(mode)))
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
+
+ /* No reflink files on the realtime device. */
+ if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+}
+
+/*
+ * Fix size problems with block/node format directories. If we fail to find
+ * the extent list, just bail out and let the bmapbtd repair functions clean
+ * up that mess.
+ */
+STATIC void
+xrep_inode_blockdir_size(
+ struct xfs_scrub *sc)
+{
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t off;
+ int error;
+
+ trace_xrep_inode_blockdir_size(sc);
+
+ error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK);
+ if (error)
+ return;
+
+ /* Find the last block before 32G; this is the dir size. */
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE);
+ if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) {
+ /* zero-extents directory? */
+ return;
+ }
+
+ off = got.br_startoff + got.br_blockcount;
+ sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE,
+ XFS_FSB_TO_B(sc->mp, off));
+}
+
+/* Fix size problems with short format directories. */
+STATIC void
+xrep_inode_sfdir_size(
+ struct xfs_scrub *sc)
+{
+ struct xfs_ifork *ifp;
+
+ trace_xrep_inode_sfdir_size(sc);
+
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ sc->ip->i_disk_size = ifp->if_bytes;
+}
+
+/*
+ * Fix any irregularities in a directory inode's size now that we can iterate
+ * extent maps and access other regular inode data.
+ */
+STATIC void
+xrep_inode_dir_size(
+ struct xfs_scrub *sc)
+{
+ trace_xrep_inode_dir_size(sc);
+
+ switch (sc->ip->i_df.if_format) {
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ xrep_inode_blockdir_size(sc);
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ xrep_inode_sfdir_size(sc);
+ break;
+ }
+}
+
+/* Fix extent size hint problems. */
+STATIC void
+xrep_inode_extsize(
+ struct xfs_scrub *sc)
+{
+ /* Fix misaligned extent size hints on a directory. */
+ if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
+ (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
+ xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) {
+ sc->ip->i_extsize = 0;
+ sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT;
+ }
+}
+
+/* Fix any irregularities in an inode that the verifiers don't catch. */
+STATIC int
+xrep_inode_problems(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ error = xrep_inode_blockcounts(sc);
+ if (error)
+ return error;
+ xrep_inode_timestamps(sc->ip);
+ xrep_inode_flags(sc);
+ xrep_inode_ids(sc);
+ /*
+ * We can now do a better job fixing the size of a directory now that
+ * we can scan the data fork extents than we could in xrep_dinode_size.
+ */
+ if (S_ISDIR(VFS_I(sc->ip)->i_mode))
+ xrep_inode_dir_size(sc);
+ xrep_inode_extsize(sc);
+
+ trace_xrep_inode_fixed(sc);
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ return xrep_roll_trans(sc);
+}
+
+/* Repair an inode's fields. */
+int
+xrep_inode(
+ struct xfs_scrub *sc)
+{
+ int error = 0;
+
+ /*
+ * No inode? That means we failed the _iget verifiers. Repair all
+ * the things that the inode verifiers care about, then retry _iget.
+ */
+ if (!sc->ip) {
+ struct xrep_inode *ri = sc->buf;
+
+ ASSERT(ri != NULL);
+
+ error = xrep_dinode_problems(ri);
+ if (error)
+ return error;
+
+ /* By this point we had better have a working incore inode. */
+ if (!sc->ip)
+ return -EFSCORRUPTED;
+ }
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* If we found corruption of any kind, try to fix it. */
+ if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) ||
+ (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) {
+ error = xrep_inode_problems(sc);
+ if (error)
+ return error;
+ }
+
+ /* See if we can clear the reflink flag. */
+ if (xfs_is_reflink_inode(sc->ip)) {
+ error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
+ if (error)
+ return error;
+ }
+
+ return xrep_defer_finish(sc);
+}
diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c
new file mode 100644
index 000000000000..bb6d980b4fcd
--- /dev/null
+++ b/fs/xfs/scrub/newbt.c
@@ -0,0 +1,559 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_alloc.h"
+#include "xfs_rmap.h"
+#include "xfs_ag.h"
+#include "xfs_defer.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/newbt.h"
+
+/*
+ * Estimate proper slack values for a btree that's being reloaded.
+ *
+ * Under most circumstances, we'll take whatever default loading value the
+ * btree bulk loading code calculates for us. However, there are some
+ * exceptions to this rule:
+ *
+ * (0) If someone turned one of the debug knobs.
+ * (1) If this is a per-AG btree and the AG has less than 10% space free.
+ * (2) If this is an inode btree and the FS has less than 10% space free.
+
+ * In either case, format the new btree blocks almost completely full to
+ * minimize space usage.
+ */
+static void
+xrep_newbt_estimate_slack(
+ struct xrep_newbt *xnr)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xfs_btree_bload *bload = &xnr->bload;
+ uint64_t free;
+ uint64_t sz;
+
+ /*
+ * The xfs_globals values are set to -1 (i.e. take the bload defaults)
+ * unless someone has set them otherwise, so we just pull the values
+ * here.
+ */
+ bload->leaf_slack = xfs_globals.bload_leaf_slack;
+ bload->node_slack = xfs_globals.bload_node_slack;
+
+ if (sc->ops->type == ST_PERAG) {
+ free = sc->sa.pag->pagf_freeblks;
+ sz = xfs_ag_block_count(sc->mp, sc->sa.pag->pag_agno);
+ } else {
+ free = percpu_counter_sum(&sc->mp->m_fdblocks);
+ sz = sc->mp->m_sb.sb_dblocks;
+ }
+
+ /* No further changes if there's more than 10% free space left. */
+ if (free >= div_u64(sz, 10))
+ return;
+
+ /*
+ * We're low on space; load the btrees as tightly as possible. Leave
+ * a couple of open slots in each btree block so that we don't end up
+ * splitting the btrees like crazy after a mount.
+ */
+ if (bload->leaf_slack < 0)
+ bload->leaf_slack = 2;
+ if (bload->node_slack < 0)
+ bload->node_slack = 2;
+}
+
+/* Initialize accounting resources for staging a new AG btree. */
+void
+xrep_newbt_init_ag(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo,
+ xfs_fsblock_t alloc_hint,
+ enum xfs_ag_resv_type resv)
+{
+ memset(xnr, 0, sizeof(struct xrep_newbt));
+ xnr->sc = sc;
+ xnr->oinfo = *oinfo; /* structure copy */
+ xnr->alloc_hint = alloc_hint;
+ xnr->resv = resv;
+ INIT_LIST_HEAD(&xnr->resv_list);
+ xnr->bload.max_dirty = XFS_B_TO_FSBT(sc->mp, 256U << 10); /* 256K */
+ xrep_newbt_estimate_slack(xnr);
+}
+
+/* Initialize accounting resources for staging a new inode fork btree. */
+int
+xrep_newbt_init_inode(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc,
+ int whichfork,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xfs_ifork *ifp;
+
+ ifp = kmem_cache_zalloc(xfs_ifork_cache, XCHK_GFP_FLAGS);
+ if (!ifp)
+ return -ENOMEM;
+
+ xrep_newbt_init_ag(xnr, sc, oinfo,
+ XFS_INO_TO_FSB(sc->mp, sc->ip->i_ino),
+ XFS_AG_RESV_NONE);
+ xnr->ifake.if_fork = ifp;
+ xnr->ifake.if_fork_size = xfs_inode_fork_size(sc->ip, whichfork);
+ return 0;
+}
+
+/*
+ * Initialize accounting resources for staging a new btree. Callers are
+ * expected to add their own reservations (and clean them up) manually.
+ */
+void
+xrep_newbt_init_bare(
+ struct xrep_newbt *xnr,
+ struct xfs_scrub *sc)
+{
+ xrep_newbt_init_ag(xnr, sc, &XFS_RMAP_OINFO_ANY_OWNER, NULLFSBLOCK,
+ XFS_AG_RESV_NONE);
+}
+
+/*
+ * Designate specific blocks to be used to build our new btree. @pag must be
+ * a passive reference.
+ */
+STATIC int
+xrep_newbt_add_blocks(
+ struct xrep_newbt *xnr,
+ struct xfs_perag *pag,
+ const struct xfs_alloc_arg *args)
+{
+ struct xfs_mount *mp = xnr->sc->mp;
+ struct xrep_newbt_resv *resv;
+ int error;
+
+ resv = kmalloc(sizeof(struct xrep_newbt_resv), XCHK_GFP_FLAGS);
+ if (!resv)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&resv->list);
+ resv->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
+ resv->len = args->len;
+ resv->used = 0;
+ resv->pag = xfs_perag_hold(pag);
+
+ if (args->tp) {
+ ASSERT(xnr->oinfo.oi_offset == 0);
+
+ error = xfs_alloc_schedule_autoreap(args, true, &resv->autoreap);
+ if (error)
+ goto out_pag;
+ }
+
+ list_add_tail(&resv->list, &xnr->resv_list);
+ return 0;
+out_pag:
+ xfs_perag_put(resv->pag);
+ kfree(resv);
+ return error;
+}
+
+/*
+ * Add an extent to the new btree reservation pool. Callers are required to
+ * reap this reservation manually if the repair is cancelled. @pag must be a
+ * passive reference.
+ */
+int
+xrep_newbt_add_extent(
+ struct xrep_newbt *xnr,
+ struct xfs_perag *pag,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ struct xfs_mount *mp = xnr->sc->mp;
+ struct xfs_alloc_arg args = {
+ .tp = NULL, /* no autoreap */
+ .oinfo = xnr->oinfo,
+ .fsbno = XFS_AGB_TO_FSB(mp, pag->pag_agno, agbno),
+ .len = len,
+ .resv = xnr->resv,
+ };
+
+ return xrep_newbt_add_blocks(xnr, pag, &args);
+}
+
+/* Don't let our allocation hint take us beyond this AG */
+static inline void
+xrep_newbt_validate_ag_alloc_hint(
+ struct xrep_newbt *xnr)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, xnr->alloc_hint);
+
+ if (agno == sc->sa.pag->pag_agno &&
+ xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
+ return;
+
+ xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno,
+ XFS_AGFL_BLOCK(sc->mp) + 1);
+}
+
+/* Allocate disk space for a new per-AG btree. */
+STATIC int
+xrep_newbt_alloc_ag_blocks(
+ struct xrep_newbt *xnr,
+ uint64_t nr_blocks)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ ASSERT(sc->sa.pag != NULL);
+
+ while (nr_blocks > 0) {
+ struct xfs_alloc_arg args = {
+ .tp = sc->tp,
+ .mp = mp,
+ .oinfo = xnr->oinfo,
+ .minlen = 1,
+ .maxlen = nr_blocks,
+ .prod = 1,
+ .resv = xnr->resv,
+ };
+ xfs_agnumber_t agno;
+
+ xrep_newbt_validate_ag_alloc_hint(xnr);
+
+ error = xfs_alloc_vextent_near_bno(&args, xnr->alloc_hint);
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
+
+ trace_xrep_newbt_alloc_ag_blocks(mp, agno,
+ XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
+ xnr->oinfo.oi_owner);
+
+ if (agno != sc->sa.pag->pag_agno) {
+ ASSERT(agno == sc->sa.pag->pag_agno);
+ return -EFSCORRUPTED;
+ }
+
+ error = xrep_newbt_add_blocks(xnr, sc->sa.pag, &args);
+ if (error)
+ return error;
+
+ nr_blocks -= args.len;
+ xnr->alloc_hint = args.fsbno + args.len;
+
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Don't let our allocation hint take us beyond EOFS */
+static inline void
+xrep_newbt_validate_file_alloc_hint(
+ struct xrep_newbt *xnr)
+{
+ struct xfs_scrub *sc = xnr->sc;
+
+ if (xfs_verify_fsbno(sc->mp, xnr->alloc_hint))
+ return;
+
+ xnr->alloc_hint = XFS_AGB_TO_FSB(sc->mp, 0, XFS_AGFL_BLOCK(sc->mp) + 1);
+}
+
+/* Allocate disk space for our new file-based btree. */
+STATIC int
+xrep_newbt_alloc_file_blocks(
+ struct xrep_newbt *xnr,
+ uint64_t nr_blocks)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xfs_mount *mp = sc->mp;
+ int error = 0;
+
+ while (nr_blocks > 0) {
+ struct xfs_alloc_arg args = {
+ .tp = sc->tp,
+ .mp = mp,
+ .oinfo = xnr->oinfo,
+ .minlen = 1,
+ .maxlen = nr_blocks,
+ .prod = 1,
+ .resv = xnr->resv,
+ };
+ struct xfs_perag *pag;
+ xfs_agnumber_t agno;
+
+ xrep_newbt_validate_file_alloc_hint(xnr);
+
+ error = xfs_alloc_vextent_start_ag(&args, xnr->alloc_hint);
+ if (error)
+ return error;
+ if (args.fsbno == NULLFSBLOCK)
+ return -ENOSPC;
+
+ agno = XFS_FSB_TO_AGNO(mp, args.fsbno);
+
+ trace_xrep_newbt_alloc_file_blocks(mp, agno,
+ XFS_FSB_TO_AGBNO(mp, args.fsbno), args.len,
+ xnr->oinfo.oi_owner);
+
+ pag = xfs_perag_get(mp, agno);
+ if (!pag) {
+ ASSERT(0);
+ return -EFSCORRUPTED;
+ }
+
+ error = xrep_newbt_add_blocks(xnr, pag, &args);
+ xfs_perag_put(pag);
+ if (error)
+ return error;
+
+ nr_blocks -= args.len;
+ xnr->alloc_hint = args.fsbno + args.len;
+
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ }
+
+ return 0;
+}
+
+/* Allocate disk space for our new btree. */
+int
+xrep_newbt_alloc_blocks(
+ struct xrep_newbt *xnr,
+ uint64_t nr_blocks)
+{
+ if (xnr->sc->ip)
+ return xrep_newbt_alloc_file_blocks(xnr, nr_blocks);
+ return xrep_newbt_alloc_ag_blocks(xnr, nr_blocks);
+}
+
+/*
+ * Free the unused part of a space extent that was reserved for a new ondisk
+ * structure. Returns the number of EFIs logged or a negative errno.
+ */
+STATIC int
+xrep_newbt_free_extent(
+ struct xrep_newbt *xnr,
+ struct xrep_newbt_resv *resv,
+ bool btree_committed)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ xfs_agblock_t free_agbno = resv->agbno;
+ xfs_extlen_t free_aglen = resv->len;
+ xfs_fsblock_t fsbno;
+ int error;
+
+ if (!btree_committed || resv->used == 0) {
+ /*
+ * If we're not committing a new btree or we didn't use the
+ * space reservation, let the existing EFI free the entire
+ * space extent.
+ */
+ trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno,
+ free_agbno, free_aglen, xnr->oinfo.oi_owner);
+ xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
+ return 1;
+ }
+
+ /*
+ * We used space and committed the btree. Cancel the autoreap, remove
+ * the written blocks from the reservation, and possibly log a new EFI
+ * to free any unused reservation space.
+ */
+ xfs_alloc_cancel_autoreap(sc->tp, &resv->autoreap);
+ free_agbno += resv->used;
+ free_aglen -= resv->used;
+
+ if (free_aglen == 0)
+ return 0;
+
+ trace_xrep_newbt_free_blocks(sc->mp, resv->pag->pag_agno, free_agbno,
+ free_aglen, xnr->oinfo.oi_owner);
+
+ ASSERT(xnr->resv != XFS_AG_RESV_AGFL);
+ ASSERT(xnr->resv != XFS_AG_RESV_IGNORE);
+
+ /*
+ * Use EFIs to free the reservations. This reduces the chance
+ * that we leak blocks if the system goes down.
+ */
+ fsbno = XFS_AGB_TO_FSB(sc->mp, resv->pag->pag_agno, free_agbno);
+ error = xfs_free_extent_later(sc->tp, fsbno, free_aglen, &xnr->oinfo,
+ xnr->resv, true);
+ if (error)
+ return error;
+
+ return 1;
+}
+
+/* Free all the accounting info and disk space we reserved for a new btree. */
+STATIC int
+xrep_newbt_free(
+ struct xrep_newbt *xnr,
+ bool btree_committed)
+{
+ struct xfs_scrub *sc = xnr->sc;
+ struct xrep_newbt_resv *resv, *n;
+ unsigned int freed = 0;
+ int error = 0;
+
+ /*
+ * If the filesystem already went down, we can't free the blocks. Skip
+ * ahead to freeing the incore metadata because we can't fix anything.
+ */
+ if (xfs_is_shutdown(sc->mp))
+ goto junkit;
+
+ list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+ int ret;
+
+ ret = xrep_newbt_free_extent(xnr, resv, btree_committed);
+ list_del(&resv->list);
+ xfs_perag_put(resv->pag);
+ kfree(resv);
+ if (ret < 0) {
+ error = ret;
+ goto junkit;
+ }
+
+ freed += ret;
+ if (freed >= XREP_MAX_ITRUNCATE_EFIS) {
+ error = xrep_defer_finish(sc);
+ if (error)
+ goto junkit;
+ freed = 0;
+ }
+ }
+
+ if (freed)
+ error = xrep_defer_finish(sc);
+
+junkit:
+ /*
+ * If we still have reservations attached to @newbt, cleanup must have
+ * failed and the filesystem is about to go down. Clean up the incore
+ * reservations and try to commit to freeing the space we used.
+ */
+ list_for_each_entry_safe(resv, n, &xnr->resv_list, list) {
+ xfs_alloc_commit_autoreap(sc->tp, &resv->autoreap);
+ list_del(&resv->list);
+ xfs_perag_put(resv->pag);
+ kfree(resv);
+ }
+
+ if (sc->ip) {
+ kmem_cache_free(xfs_ifork_cache, xnr->ifake.if_fork);
+ xnr->ifake.if_fork = NULL;
+ }
+
+ return error;
+}
+
+/*
+ * Free all the accounting info and unused disk space allocations after
+ * committing a new btree.
+ */
+int
+xrep_newbt_commit(
+ struct xrep_newbt *xnr)
+{
+ return xrep_newbt_free(xnr, true);
+}
+
+/*
+ * Free all the accounting info and all of the disk space we reserved for a new
+ * btree that we're not going to commit. We want to try to roll things back
+ * cleanly for things like ENOSPC midway through allocation.
+ */
+void
+xrep_newbt_cancel(
+ struct xrep_newbt *xnr)
+{
+ xrep_newbt_free(xnr, false);
+}
+
+/* Feed one of the reserved btree blocks to the bulk loader. */
+int
+xrep_newbt_claim_block(
+ struct xfs_btree_cur *cur,
+ struct xrep_newbt *xnr,
+ union xfs_btree_ptr *ptr)
+{
+ struct xrep_newbt_resv *resv;
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_agblock_t agbno;
+
+ /*
+ * The first item in the list should always have a free block unless
+ * we're completely out.
+ */
+ resv = list_first_entry(&xnr->resv_list, struct xrep_newbt_resv, list);
+ if (resv->used == resv->len)
+ return -ENOSPC;
+
+ /*
+ * Peel off a block from the start of the reservation. We allocate
+ * blocks in order to place blocks on disk in increasing record or key
+ * order. The block reservations tend to end up on the list in
+ * decreasing order, which hopefully results in leaf blocks ending up
+ * together.
+ */
+ agbno = resv->agbno + resv->used;
+ resv->used++;
+
+ /* If we used all the blocks in this reservation, move it to the end. */
+ if (resv->used == resv->len)
+ list_move_tail(&resv->list, &xnr->resv_list);
+
+ trace_xrep_newbt_claim_block(mp, resv->pag->pag_agno, agbno, 1,
+ xnr->oinfo.oi_owner);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(XFS_AGB_TO_FSB(mp, resv->pag->pag_agno,
+ agbno));
+ else
+ ptr->s = cpu_to_be32(agbno);
+
+ /* Relog all the EFIs. */
+ return xrep_defer_finish(xnr->sc);
+}
+
+/* How many reserved blocks are unused? */
+unsigned int
+xrep_newbt_unused_blocks(
+ struct xrep_newbt *xnr)
+{
+ struct xrep_newbt_resv *resv;
+ unsigned int unused = 0;
+
+ list_for_each_entry(resv, &xnr->resv_list, list)
+ unused += resv->len - resv->used;
+ return unused;
+}
diff --git a/fs/xfs/scrub/newbt.h b/fs/xfs/scrub/newbt.h
new file mode 100644
index 000000000000..89f8e3970b1f
--- /dev/null
+++ b/fs/xfs/scrub/newbt.h
@@ -0,0 +1,68 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_NEWBT_H__
+#define __XFS_SCRUB_NEWBT_H__
+
+struct xrep_newbt_resv {
+ /* Link to list of extents that we've reserved. */
+ struct list_head list;
+
+ struct xfs_perag *pag;
+
+ /* Auto-freeing this reservation if we don't commit. */
+ struct xfs_alloc_autoreap autoreap;
+
+ /* AG block of the extent we reserved. */
+ xfs_agblock_t agbno;
+
+ /* Length of the reservation. */
+ xfs_extlen_t len;
+
+ /* How much of this reservation has been used. */
+ xfs_extlen_t used;
+};
+
+struct xrep_newbt {
+ struct xfs_scrub *sc;
+
+ /* List of extents that we've reserved. */
+ struct list_head resv_list;
+
+ /* Fake root for new btree. */
+ union {
+ struct xbtree_afakeroot afake;
+ struct xbtree_ifakeroot ifake;
+ };
+
+ /* rmap owner of these blocks */
+ struct xfs_owner_info oinfo;
+
+ /* btree geometry for the bulk loader */
+ struct xfs_btree_bload bload;
+
+ /* Allocation hint */
+ xfs_fsblock_t alloc_hint;
+
+ /* per-ag reservation type */
+ enum xfs_ag_resv_type resv;
+};
+
+void xrep_newbt_init_bare(struct xrep_newbt *xnr, struct xfs_scrub *sc);
+void xrep_newbt_init_ag(struct xrep_newbt *xnr, struct xfs_scrub *sc,
+ const struct xfs_owner_info *oinfo, xfs_fsblock_t alloc_hint,
+ enum xfs_ag_resv_type resv);
+int xrep_newbt_init_inode(struct xrep_newbt *xnr, struct xfs_scrub *sc,
+ int whichfork, const struct xfs_owner_info *oinfo);
+int xrep_newbt_alloc_blocks(struct xrep_newbt *xnr, uint64_t nr_blocks);
+int xrep_newbt_add_extent(struct xrep_newbt *xnr, struct xfs_perag *pag,
+ xfs_agblock_t agbno, xfs_extlen_t len);
+void xrep_newbt_cancel(struct xrep_newbt *xnr);
+int xrep_newbt_commit(struct xrep_newbt *xnr);
+int xrep_newbt_claim_block(struct xfs_btree_cur *cur, struct xrep_newbt *xnr,
+ union xfs_btree_ptr *ptr);
+unsigned int xrep_newbt_unused_blocks(struct xrep_newbt *xnr);
+
+#endif /* __XFS_SCRUB_NEWBT_H__ */
diff --git a/fs/xfs/scrub/off_bitmap.h b/fs/xfs/scrub/off_bitmap.h
new file mode 100644
index 000000000000..0d3f9e6c1aad
--- /dev/null
+++ b/fs/xfs/scrub/off_bitmap.h
@@ -0,0 +1,37 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_OFF_BITMAP_H__
+#define __XFS_SCRUB_OFF_BITMAP_H__
+
+/* Bitmaps, but for type-checked for xfs_fileoff_t */
+
+struct xoff_bitmap {
+ struct xbitmap64 offbitmap;
+};
+
+static inline void xoff_bitmap_init(struct xoff_bitmap *bitmap)
+{
+ xbitmap64_init(&bitmap->offbitmap);
+}
+
+static inline void xoff_bitmap_destroy(struct xoff_bitmap *bitmap)
+{
+ xbitmap64_destroy(&bitmap->offbitmap);
+}
+
+static inline int xoff_bitmap_set(struct xoff_bitmap *bitmap,
+ xfs_fileoff_t off, xfs_filblks_t len)
+{
+ return xbitmap64_set(&bitmap->offbitmap, off, len);
+}
+
+static inline int xoff_bitmap_walk(struct xoff_bitmap *bitmap,
+ xbitmap64_walk_fn fn, void *priv)
+{
+ return xbitmap64_walk(&bitmap->offbitmap, fn, priv);
+}
+
+#endif /* __XFS_SCRUB_OFF_BITMAP_H__ */
diff --git a/fs/xfs/scrub/parent.c b/fs/xfs/scrub/parent.c
index e6155d86f791..7db873672146 100644
--- a/fs/xfs/scrub/parent.c
+++ b/fs/xfs/scrub/parent.c
@@ -156,6 +156,16 @@ xchk_parent_validate(
goto out_rele;
}
+ /*
+ * We cannot yet validate this parent pointer if the directory looks as
+ * though it has been zapped by the inode record repair code.
+ */
+ if (xchk_dir_looks_zapped(dp)) {
+ error = -EBUSY;
+ xchk_set_incomplete(sc);
+ goto out_unlock;
+ }
+
/* Look for a directory entry in the parent pointing to the child. */
error = xchk_dir_walk(sc, dp, xchk_parent_actor, &spc);
if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
@@ -217,6 +227,13 @@ xchk_parent(
*/
error = xchk_parent_validate(sc, parent_ino);
} while (error == -EAGAIN);
+ if (error == -EBUSY) {
+ /*
+ * We could not scan a directory, so we marked the check
+ * incomplete. No further error return is necessary.
+ */
+ return 0;
+ }
return error;
}
diff --git a/fs/xfs/scrub/quota.c b/fs/xfs/scrub/quota.c
index 5671c8153433..183d531875ea 100644
--- a/fs/xfs/scrub/quota.c
+++ b/fs/xfs/scrub/quota.c
@@ -6,6 +6,7 @@
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
+#include "xfs_bit.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
@@ -17,9 +18,10 @@
#include "xfs_bmap.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
+#include "scrub/quota.h"
/* Convert a scrub type code to a DQ flag, or return 0 if error. */
-static inline xfs_dqtype_t
+xfs_dqtype_t
xchk_quota_to_dqtype(
struct xfs_scrub *sc)
{
@@ -75,14 +77,70 @@ struct xchk_quota_info {
xfs_dqid_t last_id;
};
+/* There's a written block backing this dquot, right? */
+STATIC int
+xchk_quota_item_bmap(
+ struct xfs_scrub *sc,
+ struct xfs_dquot *dq,
+ xfs_fileoff_t offset)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_mount *mp = sc->mp;
+ int nmaps = 1;
+ int error;
+
+ if (!xfs_verify_fileoff(mp, offset)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ return 0;
+ }
+
+ if (dq->q_fileoffset != offset) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ return 0;
+ }
+
+ error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0);
+ if (error)
+ return error;
+
+ if (nmaps != 1) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ return 0;
+ }
+
+ if (!xfs_verify_fsbno(mp, irec.br_startblock))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ if (XFS_FSB_TO_DADDR(mp, irec.br_startblock) != dq->q_blkno)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ if (!xfs_bmap_is_written_extent(&irec))
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+
+ return 0;
+}
+
+/* Complain if a quota timer is incorrectly set. */
+static inline void
+xchk_quota_item_timer(
+ struct xfs_scrub *sc,
+ xfs_fileoff_t offset,
+ const struct xfs_dquot_res *res)
+{
+ if ((res->softlimit && res->count > res->softlimit) ||
+ (res->hardlimit && res->count > res->hardlimit)) {
+ if (!res->timer)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ } else {
+ if (res->timer)
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
+ }
+}
+
/* Scrub the fields in an individual quota item. */
STATIC int
xchk_quota_item(
- struct xfs_dquot *dq,
- xfs_dqtype_t dqtype,
- void *priv)
+ struct xchk_quota_info *sqi,
+ struct xfs_dquot *dq)
{
- struct xchk_quota_info *sqi = priv;
struct xfs_scrub *sc = sqi->sc;
struct xfs_mount *mp = sc->mp;
struct xfs_quotainfo *qi = mp->m_quotainfo;
@@ -94,6 +152,17 @@ xchk_quota_item(
return error;
/*
+ * We want to validate the bmap record for the storage backing this
+ * dquot, so we need to lock the dquot and the quota file. For quota
+ * operations, the locking order is first the ILOCK and then the dquot.
+ * However, dqiterate gave us a locked dquot, so drop the dquot lock to
+ * get the ILOCK.
+ */
+ xfs_dqunlock(dq);
+ xchk_ilock(sc, XFS_ILOCK_SHARED);
+ xfs_dqlock(dq);
+
+ /*
* Except for the root dquot, the actual dquot we got must either have
* the same or higher id as we saw before.
*/
@@ -103,6 +172,11 @@ xchk_quota_item(
sqi->last_id = dq->q_id;
+ error = xchk_quota_item_bmap(sc, dq, offset);
+ xchk_iunlock(sc, XFS_ILOCK_SHARED);
+ if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error))
+ return error;
+
/*
* Warn if the hard limits are larger than the fs.
* Administrators can do this, though in production this seems
@@ -166,6 +240,10 @@ xchk_quota_item(
dq->q_rtb.count > dq->q_rtb.hardlimit)
xchk_fblock_set_warning(sc, XFS_DATA_FORK, offset);
+ xchk_quota_item_timer(sc, offset, &dq->q_blk);
+ xchk_quota_item_timer(sc, offset, &dq->q_ino);
+ xchk_quota_item_timer(sc, offset, &dq->q_rtb);
+
out:
if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
return -ECANCELED;
@@ -191,7 +269,7 @@ xchk_quota_data_fork(
return error;
/* Check for data fork problems that apply only to quota files. */
- max_dqid_off = ((xfs_dqid_t)-1) / qi->qi_dqperchunk;
+ max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk;
ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
for_each_xfs_iext(ifp, &icur, &irec) {
if (xchk_should_terminate(sc, &error))
@@ -218,9 +296,11 @@ int
xchk_quota(
struct xfs_scrub *sc)
{
- struct xchk_quota_info sqi;
+ struct xchk_dqiter cursor = { };
+ struct xchk_quota_info sqi = { .sc = sc };
struct xfs_mount *mp = sc->mp;
struct xfs_quotainfo *qi = mp->m_quotainfo;
+ struct xfs_dquot *dq;
xfs_dqtype_t dqtype;
int error = 0;
@@ -239,10 +319,15 @@ xchk_quota(
* functions.
*/
xchk_iunlock(sc, sc->ilock_flags);
- sqi.sc = sc;
- sqi.last_id = 0;
- error = xfs_qm_dqiterate(mp, dqtype, xchk_quota_item, &sqi);
- xchk_ilock(sc, XFS_ILOCK_EXCL);
+
+ /* Now look for things that the quota verifiers won't complain about. */
+ xchk_dqiter_init(&cursor, sc, dqtype);
+ while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ error = xchk_quota_item(&sqi, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ break;
+ }
if (error == -ECANCELED)
error = 0;
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK,
diff --git a/fs/xfs/scrub/quota.h b/fs/xfs/scrub/quota.h
new file mode 100644
index 000000000000..6c7134ce2385
--- /dev/null
+++ b/fs/xfs/scrub/quota.h
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_QUOTA_H__
+#define __XFS_SCRUB_QUOTA_H__
+
+xfs_dqtype_t xchk_quota_to_dqtype(struct xfs_scrub *sc);
+
+/* dquot iteration code */
+
+struct xchk_dqiter {
+ struct xfs_scrub *sc;
+
+ /* Quota file that we're walking. */
+ struct xfs_inode *quota_ip;
+
+ /* Cached data fork mapping for the dquot. */
+ struct xfs_bmbt_irec bmap;
+
+ /* The next dquot to scan. */
+ uint64_t id;
+
+ /* Quota type (user/group/project). */
+ xfs_dqtype_t dqtype;
+
+ /* Data fork sequence number to detect stale mappings. */
+ unsigned int if_seq;
+};
+
+void xchk_dqiter_init(struct xchk_dqiter *cursor, struct xfs_scrub *sc,
+ xfs_dqtype_t dqtype);
+int xchk_dquot_iter(struct xchk_dqiter *cursor, struct xfs_dquot **dqpp);
+
+#endif /* __XFS_SCRUB_QUOTA_H__ */
diff --git a/fs/xfs/scrub/quota_repair.c b/fs/xfs/scrub/quota_repair.c
new file mode 100644
index 000000000000..0bab4c30cb85
--- /dev/null
+++ b/fs/xfs/scrub/quota_repair.c
@@ -0,0 +1,575 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bit.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_inode.h"
+#include "xfs_inode_fork.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_quota.h"
+#include "xfs_qm.h"
+#include "xfs_dquot.h"
+#include "xfs_dquot_item.h"
+#include "xfs_reflink.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/quota.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+
+/*
+ * Quota Repair
+ * ============
+ *
+ * Quota repairs are fairly simplistic; we fix everything that the dquot
+ * verifiers complain about, cap any counters or limits that make no sense,
+ * and schedule a quotacheck if we had to fix anything. We also repair any
+ * data fork extent records that don't apply to metadata files.
+ */
+
+struct xrep_quota_info {
+ struct xfs_scrub *sc;
+ bool need_quotacheck;
+};
+
+/*
+ * Allocate a new block into a sparse hole in the quota file backing this
+ * dquot, initialize the block, and commit the whole mess.
+ */
+STATIC int
+xrep_quota_item_fill_bmap_hole(
+ struct xfs_scrub *sc,
+ struct xfs_dquot *dq,
+ struct xfs_bmbt_irec *irec)
+{
+ struct xfs_buf *bp;
+ struct xfs_mount *mp = sc->mp;
+ int nmaps = 1;
+ int error;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Map a block into the file. */
+ error = xfs_trans_reserve_more(sc->tp, XFS_QM_DQALLOC_SPACE_RES(mp),
+ 0);
+ if (error)
+ return error;
+
+ error = xfs_bmapi_write(sc->tp, sc->ip, dq->q_fileoffset,
+ XFS_DQUOT_CLUSTER_SIZE_FSB, XFS_BMAPI_METADATA, 0,
+ irec, &nmaps);
+ if (error)
+ return error;
+ if (nmaps != 1)
+ return -ENOSPC;
+
+ dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec->br_startblock);
+
+ trace_xrep_dquot_item_fill_bmap_hole(sc->mp, dq->q_type, dq->q_id);
+
+ /* Initialize the new block. */
+ error = xfs_trans_get_buf(sc->tp, mp->m_ddev_targp, dq->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ if (error)
+ return error;
+ bp->b_ops = &xfs_dquot_buf_ops;
+
+ xfs_qm_init_dquot_blk(sc->tp, dq->q_id, dq->q_type, bp);
+ xfs_buf_set_ref(bp, XFS_DQUOT_REF);
+
+ /*
+ * Finish the mapping transactions and roll one more time to
+ * disconnect sc->ip from sc->tp.
+ */
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ return xfs_trans_roll(&sc->tp);
+}
+
+/* Make sure there's a written block backing this dquot */
+STATIC int
+xrep_quota_item_bmap(
+ struct xfs_scrub *sc,
+ struct xfs_dquot *dq,
+ bool *dirty)
+{
+ struct xfs_bmbt_irec irec;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_quotainfo *qi = mp->m_quotainfo;
+ xfs_fileoff_t offset = dq->q_id / qi->qi_dqperchunk;
+ int nmaps = 1;
+ int error;
+
+ /* The computed file offset should always be valid. */
+ if (!xfs_verify_fileoff(mp, offset)) {
+ ASSERT(xfs_verify_fileoff(mp, offset));
+ return -EFSCORRUPTED;
+ }
+ dq->q_fileoffset = offset;
+
+ error = xfs_bmapi_read(sc->ip, offset, 1, &irec, &nmaps, 0);
+ if (error)
+ return error;
+
+ if (nmaps < 1 || !xfs_bmap_is_real_extent(&irec)) {
+ /* Hole/delalloc extent; allocate a real block. */
+ error = xrep_quota_item_fill_bmap_hole(sc, dq, &irec);
+ if (error)
+ return error;
+ } else if (irec.br_state != XFS_EXT_NORM) {
+ /* Unwritten extent, which we already took care of? */
+ ASSERT(irec.br_state == XFS_EXT_NORM);
+ return -EFSCORRUPTED;
+ } else if (dq->q_blkno != XFS_FSB_TO_DADDR(mp, irec.br_startblock)) {
+ /*
+ * If the cached daddr is incorrect, repair probably punched a
+ * hole out of the quota file and filled it back in with a new
+ * block. Update the block mapping in the dquot.
+ */
+ dq->q_blkno = XFS_FSB_TO_DADDR(mp, irec.br_startblock);
+ }
+
+ *dirty = true;
+ return 0;
+}
+
+/* Reset quota timers if incorrectly set. */
+static inline void
+xrep_quota_item_timer(
+ struct xfs_scrub *sc,
+ const struct xfs_dquot_res *res,
+ bool *dirty)
+{
+ if ((res->softlimit && res->count > res->softlimit) ||
+ (res->hardlimit && res->count > res->hardlimit)) {
+ if (!res->timer)
+ *dirty = true;
+ } else {
+ if (res->timer)
+ *dirty = true;
+ }
+}
+
+/* Scrub the fields in an individual quota item. */
+STATIC int
+xrep_quota_item(
+ struct xrep_quota_info *rqi,
+ struct xfs_dquot *dq)
+{
+ struct xfs_scrub *sc = rqi->sc;
+ struct xfs_mount *mp = sc->mp;
+ xfs_ino_t fs_icount;
+ bool dirty = false;
+ int error = 0;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ /*
+ * We might need to fix holes in the bmap record for the storage
+ * backing this dquot, so we need to lock the dquot and the quota file.
+ * dqiterate gave us a locked dquot, so drop the dquot lock to get the
+ * ILOCK_EXCL.
+ */
+ xfs_dqunlock(dq);
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ xfs_dqlock(dq);
+
+ error = xrep_quota_item_bmap(sc, dq, &dirty);
+ xchk_iunlock(sc, XFS_ILOCK_EXCL);
+ if (error)
+ return error;
+
+ /* Check the limits. */
+ if (dq->q_blk.softlimit > dq->q_blk.hardlimit) {
+ dq->q_blk.softlimit = dq->q_blk.hardlimit;
+ dirty = true;
+ }
+
+ if (dq->q_ino.softlimit > dq->q_ino.hardlimit) {
+ dq->q_ino.softlimit = dq->q_ino.hardlimit;
+ dirty = true;
+ }
+
+ if (dq->q_rtb.softlimit > dq->q_rtb.hardlimit) {
+ dq->q_rtb.softlimit = dq->q_rtb.hardlimit;
+ dirty = true;
+ }
+
+ /*
+ * Check that usage doesn't exceed physical limits. However, on
+ * a reflink filesystem we're allowed to exceed physical space
+ * if there are no quota limits. We don't know what the real number
+ * is, but we can make quotacheck find out for us.
+ */
+ if (!xfs_has_reflink(mp) && dq->q_blk.count > mp->m_sb.sb_dblocks) {
+ dq->q_blk.reserved -= dq->q_blk.count;
+ dq->q_blk.reserved += mp->m_sb.sb_dblocks;
+ dq->q_blk.count = mp->m_sb.sb_dblocks;
+ rqi->need_quotacheck = true;
+ dirty = true;
+ }
+ fs_icount = percpu_counter_sum(&mp->m_icount);
+ if (dq->q_ino.count > fs_icount) {
+ dq->q_ino.reserved -= dq->q_ino.count;
+ dq->q_ino.reserved += fs_icount;
+ dq->q_ino.count = fs_icount;
+ rqi->need_quotacheck = true;
+ dirty = true;
+ }
+ if (dq->q_rtb.count > mp->m_sb.sb_rblocks) {
+ dq->q_rtb.reserved -= dq->q_rtb.count;
+ dq->q_rtb.reserved += mp->m_sb.sb_rblocks;
+ dq->q_rtb.count = mp->m_sb.sb_rblocks;
+ rqi->need_quotacheck = true;
+ dirty = true;
+ }
+
+ xrep_quota_item_timer(sc, &dq->q_blk, &dirty);
+ xrep_quota_item_timer(sc, &dq->q_ino, &dirty);
+ xrep_quota_item_timer(sc, &dq->q_rtb, &dirty);
+
+ if (!dirty)
+ return 0;
+
+ trace_xrep_dquot_item(sc->mp, dq->q_type, dq->q_id);
+
+ dq->q_flags |= XFS_DQFLAG_DIRTY;
+ xfs_trans_dqjoin(sc->tp, dq);
+ if (dq->q_id) {
+ xfs_qm_adjust_dqlimits(dq);
+ xfs_qm_adjust_dqtimers(dq);
+ }
+ xfs_trans_log_dquot(sc->tp, dq);
+ error = xfs_trans_roll(&sc->tp);
+ xfs_dqlock(dq);
+ return error;
+}
+
+/* Fix a quota timer so that we can pass the verifier. */
+STATIC void
+xrep_quota_fix_timer(
+ struct xfs_mount *mp,
+ const struct xfs_disk_dquot *ddq,
+ __be64 softlimit,
+ __be64 countnow,
+ __be32 *timer,
+ time64_t timelimit)
+{
+ uint64_t soft = be64_to_cpu(softlimit);
+ uint64_t count = be64_to_cpu(countnow);
+ time64_t new_timer;
+ uint32_t t;
+
+ if (!soft || count <= soft || *timer != 0)
+ return;
+
+ new_timer = xfs_dquot_set_timeout(mp,
+ ktime_get_real_seconds() + timelimit);
+ if (ddq->d_type & XFS_DQTYPE_BIGTIME)
+ t = xfs_dq_unix_to_bigtime(new_timer);
+ else
+ t = new_timer;
+
+ *timer = cpu_to_be32(t);
+}
+
+/* Fix anything the verifiers complain about. */
+STATIC int
+xrep_quota_block(
+ struct xfs_scrub *sc,
+ xfs_daddr_t daddr,
+ xfs_dqtype_t dqtype,
+ xfs_dqid_t id)
+{
+ struct xfs_dqblk *dqblk;
+ struct xfs_disk_dquot *ddq;
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ struct xfs_def_quota *defq = xfs_get_defquota(qi, dqtype);
+ struct xfs_buf *bp = NULL;
+ enum xfs_blft buftype = 0;
+ int i;
+ int error;
+
+ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, daddr,
+ qi->qi_dqchunklen, 0, &bp, &xfs_dquot_buf_ops);
+ switch (error) {
+ case -EFSBADCRC:
+ case -EFSCORRUPTED:
+ /* Failed verifier, retry read with no ops. */
+ error = xfs_trans_read_buf(sc->mp, sc->tp,
+ sc->mp->m_ddev_targp, daddr, qi->qi_dqchunklen,
+ 0, &bp, NULL);
+ if (error)
+ return error;
+ break;
+ case 0:
+ dqblk = bp->b_addr;
+ ddq = &dqblk[0].dd_diskdq;
+
+ /*
+ * If there's nothing that would impede a dqiterate, we're
+ * done.
+ */
+ if ((ddq->d_type & XFS_DQTYPE_REC_MASK) != dqtype ||
+ id == be32_to_cpu(ddq->d_id)) {
+ xfs_trans_brelse(sc->tp, bp);
+ return 0;
+ }
+ break;
+ default:
+ return error;
+ }
+
+ /* Something's wrong with the block, fix the whole thing. */
+ dqblk = bp->b_addr;
+ bp->b_ops = &xfs_dquot_buf_ops;
+ for (i = 0; i < qi->qi_dqperchunk; i++, dqblk++) {
+ ddq = &dqblk->dd_diskdq;
+
+ trace_xrep_disk_dquot(sc->mp, dqtype, id + i);
+
+ ddq->d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
+ ddq->d_version = XFS_DQUOT_VERSION;
+ ddq->d_type = dqtype;
+ ddq->d_id = cpu_to_be32(id + i);
+
+ if (xfs_has_bigtime(sc->mp) && ddq->d_id)
+ ddq->d_type |= XFS_DQTYPE_BIGTIME;
+
+ xrep_quota_fix_timer(sc->mp, ddq, ddq->d_blk_softlimit,
+ ddq->d_bcount, &ddq->d_btimer,
+ defq->blk.time);
+
+ xrep_quota_fix_timer(sc->mp, ddq, ddq->d_ino_softlimit,
+ ddq->d_icount, &ddq->d_itimer,
+ defq->ino.time);
+
+ xrep_quota_fix_timer(sc->mp, ddq, ddq->d_rtb_softlimit,
+ ddq->d_rtbcount, &ddq->d_rtbtimer,
+ defq->rtb.time);
+
+ /* We only support v5 filesystems so always set these. */
+ uuid_copy(&dqblk->dd_uuid, &sc->mp->m_sb.sb_meta_uuid);
+ xfs_update_cksum((char *)dqblk, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ dqblk->dd_lsn = 0;
+ }
+ switch (dqtype) {
+ case XFS_DQTYPE_USER:
+ buftype = XFS_BLFT_UDQUOT_BUF;
+ break;
+ case XFS_DQTYPE_GROUP:
+ buftype = XFS_BLFT_GDQUOT_BUF;
+ break;
+ case XFS_DQTYPE_PROJ:
+ buftype = XFS_BLFT_PDQUOT_BUF;
+ break;
+ }
+ xfs_trans_buf_set_type(sc->tp, bp, buftype);
+ xfs_trans_log_buf(sc->tp, bp, 0, BBTOB(bp->b_length) - 1);
+ return xrep_roll_trans(sc);
+}
+
+/*
+ * Repair a quota file's data fork. The function returns with the inode
+ * joined.
+ */
+STATIC int
+xrep_quota_data_fork(
+ struct xfs_scrub *sc,
+ xfs_dqtype_t dqtype)
+{
+ struct xfs_bmbt_irec irec = { 0 };
+ struct xfs_iext_cursor icur;
+ struct xfs_quotainfo *qi = sc->mp->m_quotainfo;
+ struct xfs_ifork *ifp;
+ xfs_fileoff_t max_dqid_off;
+ xfs_fileoff_t off;
+ xfs_fsblock_t fsbno;
+ bool truncate = false;
+ bool joined = false;
+ int error = 0;
+
+ error = xrep_metadata_inode_forks(sc);
+ if (error)
+ goto out;
+
+ /* Check for data fork problems that apply only to quota files. */
+ max_dqid_off = XFS_DQ_ID_MAX / qi->qi_dqperchunk;
+ ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ if (isnullstartblock(irec.br_startblock)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+
+ if (irec.br_startoff > max_dqid_off ||
+ irec.br_startoff + irec.br_blockcount - 1 > max_dqid_off) {
+ truncate = true;
+ break;
+ }
+
+ /* Convert unwritten extents to real ones. */
+ if (irec.br_state == XFS_EXT_UNWRITTEN) {
+ struct xfs_bmbt_irec nrec;
+ int nmap = 1;
+
+ if (!joined) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+ }
+
+ error = xfs_bmapi_write(sc->tp, sc->ip,
+ irec.br_startoff, irec.br_blockcount,
+ XFS_BMAPI_CONVERT, 0, &nrec, &nmap);
+ if (error)
+ goto out;
+ if (nmap != 1) {
+ error = -ENOSPC;
+ goto out;
+ }
+ ASSERT(nrec.br_startoff == irec.br_startoff);
+ ASSERT(nrec.br_blockcount == irec.br_blockcount);
+
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ goto out;
+ }
+ }
+
+ if (!joined) {
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ joined = true;
+ }
+
+ if (truncate) {
+ /* Erase everything after the block containing the max dquot */
+ error = xfs_bunmapi_range(&sc->tp, sc->ip, 0,
+ max_dqid_off * sc->mp->m_sb.sb_blocksize,
+ XFS_MAX_FILEOFF);
+ if (error)
+ goto out;
+
+ /* Remove all CoW reservations. */
+ error = xfs_reflink_cancel_cow_blocks(sc->ip, &sc->tp, 0,
+ XFS_MAX_FILEOFF, true);
+ if (error)
+ goto out;
+ sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
+
+ /*
+ * Always re-log the inode so that our permanent transaction
+ * can keep on rolling it forward in the log.
+ */
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ }
+
+ /* Now go fix anything that fails the verifiers. */
+ for_each_xfs_iext(ifp, &icur, &irec) {
+ for (fsbno = irec.br_startblock, off = irec.br_startoff;
+ fsbno < irec.br_startblock + irec.br_blockcount;
+ fsbno += XFS_DQUOT_CLUSTER_SIZE_FSB,
+ off += XFS_DQUOT_CLUSTER_SIZE_FSB) {
+ error = xrep_quota_block(sc,
+ XFS_FSB_TO_DADDR(sc->mp, fsbno),
+ dqtype, off * qi->qi_dqperchunk);
+ if (error)
+ goto out;
+ }
+ }
+
+out:
+ return error;
+}
+
+/*
+ * Go fix anything in the quota items that we could have been mad about. Now
+ * that we've checked the quota inode data fork we have to drop ILOCK_EXCL to
+ * use the regular dquot functions.
+ */
+STATIC int
+xrep_quota_problems(
+ struct xfs_scrub *sc,
+ xfs_dqtype_t dqtype)
+{
+ struct xchk_dqiter cursor = { };
+ struct xrep_quota_info rqi = { .sc = sc };
+ struct xfs_dquot *dq;
+ int error;
+
+ xchk_dqiter_init(&cursor, sc, dqtype);
+ while ((error = xchk_dquot_iter(&cursor, &dq)) == 1) {
+ error = xrep_quota_item(&rqi, dq);
+ xfs_qm_dqput(dq);
+ if (error)
+ break;
+ }
+ if (error)
+ return error;
+
+ /* Make a quotacheck happen. */
+ if (rqi.need_quotacheck)
+ xrep_force_quotacheck(sc, dqtype);
+ return 0;
+}
+
+/* Repair all of a quota type's items. */
+int
+xrep_quota(
+ struct xfs_scrub *sc)
+{
+ xfs_dqtype_t dqtype;
+ int error;
+
+ dqtype = xchk_quota_to_dqtype(sc);
+
+ /*
+ * Re-take the ILOCK so that we can fix any problems that we found
+ * with the data fork mappings, or with the dquot bufs themselves.
+ */
+ if (!(sc->ilock_flags & XFS_ILOCK_EXCL))
+ xchk_ilock(sc, XFS_ILOCK_EXCL);
+ error = xrep_quota_data_fork(sc, dqtype);
+ if (error)
+ return error;
+
+ /*
+ * Finish deferred items and roll the transaction to unjoin the quota
+ * inode from transaction so that we can unlock the quota inode; we
+ * play only with dquots from now on.
+ */
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+ xchk_iunlock(sc, sc->ilock_flags);
+
+ /* Fix anything the dquot verifiers don't complain about. */
+ error = xrep_quota_problems(sc, dqtype);
+ if (error)
+ return error;
+
+ return xrep_trans_commit(sc);
+}
diff --git a/fs/xfs/scrub/readdir.c b/fs/xfs/scrub/readdir.c
index e51c1544be63..16462332c897 100644
--- a/fs/xfs/scrub/readdir.c
+++ b/fs/xfs/scrub/readdir.c
@@ -36,16 +36,14 @@ xchk_dir_walk_sf(
struct xfs_mount *mp = dp->i_mount;
struct xfs_da_geometry *geo = mp->m_dir_geo;
struct xfs_dir2_sf_entry *sfep;
- struct xfs_dir2_sf_hdr *sfp;
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
xfs_ino_t ino;
xfs_dir2_dataptr_t dapos;
unsigned int i;
int error;
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
-
- sfp = (struct xfs_dir2_sf_hdr *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
/* dot entry */
dapos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c
index 86a62420e02c..f99eca799809 100644
--- a/fs/xfs/scrub/reap.c
+++ b/fs/xfs/scrub/reap.c
@@ -20,6 +20,7 @@
#include "xfs_ialloc_btree.h"
#include "xfs_rmap.h"
#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
#include "xfs_refcount_btree.h"
#include "xfs_extent_busy.h"
#include "xfs_ag.h"
@@ -31,11 +32,14 @@
#include "xfs_da_btree.h"
#include "xfs_attr.h"
#include "xfs_attr_remote.h"
+#include "xfs_defer.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
#include "scrub/repair.h"
#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/fsb_bitmap.h"
#include "scrub/reap.h"
/*
@@ -73,10 +77,10 @@
* with only the same rmap owner but the block is not owned by something with
* the same rmap owner, the block will be freed.
*
- * The caller is responsible for locking the AG headers for the entire rebuild
- * operation so that nothing else can sneak in and change the AG state while
- * we're not looking. We must also invalidate any buffers associated with
- * @bitmap.
+ * The caller is responsible for locking the AG headers/inode for the entire
+ * rebuild operation so that nothing else can sneak in and change the incore
+ * state while we're not looking. We must also invalidate any buffers
+ * associated with @bitmap.
*/
/* Information about reaping extents after a repair. */
@@ -247,7 +251,7 @@ xreap_agextent_binval(
max_fsbs = min_t(xfs_agblock_t, agbno_next - bno,
xfs_attr3_rmt_blocks(mp, XFS_XATTR_SIZE_MAX));
- for (fsbcount = 1; fsbcount < max_fsbs; fsbcount++) {
+ for (fsbcount = 1; fsbcount <= max_fsbs; fsbcount++) {
struct xfs_buf *bp = NULL;
xfs_daddr_t daddr;
int error;
@@ -377,6 +381,17 @@ xreap_agextent_iter(
trace_xreap_dispose_unmap_extent(sc->sa.pag, agbno, *aglenp);
rs->force_roll = true;
+
+ if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
+ /*
+ * If we're unmapping CoW staging extents, remove the
+ * records from the refcountbt, which will remove the
+ * rmap record as well.
+ */
+ xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+ return 0;
+ }
+
return xfs_rmap_free(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno,
*aglenp, rs->oinfo);
}
@@ -395,6 +410,26 @@ xreap_agextent_iter(
return 0;
}
+ /*
+ * If we're getting rid of CoW staging extents, use deferred work items
+ * to remove the refcountbt records (which removes the rmap records)
+ * and free the extent. We're not worried about the system going down
+ * here because log recovery walks the refcount btree to clean out the
+ * CoW staging extents.
+ */
+ if (rs->oinfo == &XFS_RMAP_OINFO_COW) {
+ ASSERT(rs->resv == XFS_AG_RESV_NONE);
+
+ xfs_refcount_free_cow_extent(sc->tp, fsbno, *aglenp);
+ error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, NULL,
+ rs->resv, true);
+ if (error)
+ return error;
+
+ rs->force_roll = true;
+ return 0;
+ }
+
/* Put blocks back on the AGFL one at a time. */
if (rs->resv == XFS_AG_RESV_AGFL) {
ASSERT(*aglenp == 1);
@@ -409,13 +444,17 @@ xreap_agextent_iter(
/*
* Use deferred frees to get rid of the old btree blocks to try to
* minimize the window in which we could crash and lose the old blocks.
+ * Add a defer ops barrier every other extent to avoid stressing the
+ * system with large EFIs.
*/
- error = __xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
+ error = xfs_free_extent_later(sc->tp, fsbno, *aglenp, rs->oinfo,
rs->resv, true);
if (error)
return error;
rs->deferred++;
+ if (rs->deferred % 2 == 0)
+ xfs_defer_add_barrier(sc->tp);
return 0;
}
@@ -425,13 +464,12 @@ xreap_agextent_iter(
*/
STATIC int
xreap_agmeta_extent(
- uint64_t fsbno,
- uint64_t len,
+ uint32_t agbno,
+ uint32_t len,
void *priv)
{
struct xreap_state *rs = priv;
struct xfs_scrub *sc = rs->sc;
- xfs_agblock_t agbno = fsbno;
xfs_agblock_t agbno_next = agbno + len;
int error = 0;
@@ -496,3 +534,115 @@ xrep_reap_agblocks(
return 0;
}
+
+/*
+ * Break a file metadata extent into sub-extents by fate (crosslinked, not
+ * crosslinked), and dispose of each sub-extent separately. The extent must
+ * not cross an AG boundary.
+ */
+STATIC int
+xreap_fsmeta_extent(
+ uint64_t fsbno,
+ uint64_t len,
+ void *priv)
+{
+ struct xreap_state *rs = priv;
+ struct xfs_scrub *sc = rs->sc;
+ xfs_agnumber_t agno = XFS_FSB_TO_AGNO(sc->mp, fsbno);
+ xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
+ xfs_agblock_t agbno_next = agbno + len;
+ int error = 0;
+
+ ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
+ ASSERT(sc->ip != NULL);
+ ASSERT(!sc->sa.pag);
+
+ /*
+ * We're reaping blocks after repairing file metadata, which means that
+ * we have to init the xchk_ag structure ourselves.
+ */
+ sc->sa.pag = xfs_perag_get(sc->mp, agno);
+ if (!sc->sa.pag)
+ return -EFSCORRUPTED;
+
+ error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &sc->sa.agf_bp);
+ if (error)
+ goto out_pag;
+
+ while (agbno < agbno_next) {
+ xfs_extlen_t aglen;
+ bool crosslinked;
+
+ error = xreap_agextent_select(rs, agbno, agbno_next,
+ &crosslinked, &aglen);
+ if (error)
+ goto out_agf;
+
+ error = xreap_agextent_iter(rs, agbno, &aglen, crosslinked);
+ if (error)
+ goto out_agf;
+
+ if (xreap_want_defer_finish(rs)) {
+ /*
+ * Holds the AGF buffer across the deferred chain
+ * processing.
+ */
+ error = xrep_defer_finish(sc);
+ if (error)
+ goto out_agf;
+ xreap_defer_finish_reset(rs);
+ } else if (xreap_want_roll(rs)) {
+ /*
+ * Hold the AGF buffer across the transaction roll so
+ * that we don't have to reattach it to the scrub
+ * context.
+ */
+ xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
+ error = xfs_trans_roll_inode(&sc->tp, sc->ip);
+ xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
+ if (error)
+ goto out_agf;
+ xreap_reset(rs);
+ }
+
+ agbno += aglen;
+ }
+
+out_agf:
+ xfs_trans_brelse(sc->tp, sc->sa.agf_bp);
+ sc->sa.agf_bp = NULL;
+out_pag:
+ xfs_perag_put(sc->sa.pag);
+ sc->sa.pag = NULL;
+ return error;
+}
+
+/*
+ * Dispose of every block of every fs metadata extent in the bitmap.
+ * Do not use this to dispose of the mappings in an ondisk inode fork.
+ */
+int
+xrep_reap_fsblocks(
+ struct xfs_scrub *sc,
+ struct xfsb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo)
+{
+ struct xreap_state rs = {
+ .sc = sc,
+ .oinfo = oinfo,
+ .resv = XFS_AG_RESV_NONE,
+ };
+ int error;
+
+ ASSERT(xfs_has_rmapbt(sc->mp));
+ ASSERT(sc->ip != NULL);
+
+ error = xfsb_bitmap_walk(bitmap, xreap_fsmeta_extent, &rs);
+ if (error)
+ return error;
+
+ if (xreap_dirty(&rs))
+ return xrep_defer_finish(sc);
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/reap.h b/fs/xfs/scrub/reap.h
index fe24626af164..0b69f16dd98f 100644
--- a/fs/xfs/scrub/reap.h
+++ b/fs/xfs/scrub/reap.h
@@ -6,7 +6,12 @@
#ifndef __XFS_SCRUB_REAP_H__
#define __XFS_SCRUB_REAP_H__
+struct xagb_bitmap;
+struct xfsb_bitmap;
+
int xrep_reap_agblocks(struct xfs_scrub *sc, struct xagb_bitmap *bitmap,
const struct xfs_owner_info *oinfo, enum xfs_ag_resv_type type);
+int xrep_reap_fsblocks(struct xfs_scrub *sc, struct xfsb_bitmap *bitmap,
+ const struct xfs_owner_info *oinfo);
#endif /* __XFS_SCRUB_REAP_H__ */
diff --git a/fs/xfs/scrub/refcount.c b/fs/xfs/scrub/refcount.c
index 304ea1e1bfb0..bf22f245bbfa 100644
--- a/fs/xfs/scrub/refcount.c
+++ b/fs/xfs/scrub/refcount.c
@@ -441,7 +441,7 @@ xchk_refcountbt_rec(
struct xchk_refcbt_records *rrc = bs->private;
xfs_refcount_btrec_to_irec(rec, &irec);
- if (xfs_refcount_check_irec(bs->cur, &irec) != NULL) {
+ if (xfs_refcount_check_irec(bs->cur->bc_ag.pag, &irec) != NULL) {
xchk_btree_set_corrupt(bs->sc, bs->cur, 0);
return 0;
}
diff --git a/fs/xfs/scrub/refcount_repair.c b/fs/xfs/scrub/refcount_repair.c
new file mode 100644
index 000000000000..f38fccc42a20
--- /dev/null
+++ b/fs/xfs/scrub/refcount_repair.c
@@ -0,0 +1,794 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2018-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_btree_staging.h"
+#include "xfs_inode.h"
+#include "xfs_bit.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
+#include "xfs_rmap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_error.h"
+#include "xfs_ag.h"
+#include "scrub/xfs_scrub.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/btree.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
+#include "scrub/xfile.h"
+#include "scrub/xfarray.h"
+#include "scrub/newbt.h"
+#include "scrub/reap.h"
+
+/*
+ * Rebuilding the Reference Count Btree
+ * ====================================
+ *
+ * This algorithm is "borrowed" from xfs_repair. Imagine the rmap
+ * entries as rectangles representing extents of physical blocks, and
+ * that the rectangles can be laid down to allow them to overlap each
+ * other; then we know that we must emit a refcnt btree entry wherever
+ * the amount of overlap changes, i.e. the emission stimulus is
+ * level-triggered:
+ *
+ * - ---
+ * -- ----- ---- --- ------
+ * -- ---- ----------- ---- ---------
+ * -------------------------------- -----------
+ * ^ ^ ^^ ^^ ^ ^^ ^^^ ^^^^ ^ ^^ ^ ^ ^
+ * 2 1 23 21 3 43 234 2123 1 01 2 3 0
+ *
+ * For our purposes, a rmap is a tuple (startblock, len, fileoff, owner).
+ *
+ * Note that in the actual refcnt btree we don't store the refcount < 2
+ * cases because the bnobt tells us which blocks are free; single-use
+ * blocks aren't recorded in the bnobt or the refcntbt. If the rmapbt
+ * supports storing multiple entries covering a given block we could
+ * theoretically dispense with the refcntbt and simply count rmaps, but
+ * that's inefficient in the (hot) write path, so we'll take the cost of
+ * the extra tree to save time. Also there's no guarantee that rmap
+ * will be enabled.
+ *
+ * Given an array of rmaps sorted by physical block number, a starting
+ * physical block (sp), a bag to hold rmaps that cover sp, and the next
+ * physical block where the level changes (np), we can reconstruct the
+ * refcount btree as follows:
+ *
+ * While there are still unprocessed rmaps in the array,
+ * - Set sp to the physical block (pblk) of the next unprocessed rmap.
+ * - Add to the bag all rmaps in the array where startblock == sp.
+ * - Set np to the physical block where the bag size will change. This
+ * is the minimum of (the pblk of the next unprocessed rmap) and
+ * (startblock + len of each rmap in the bag).
+ * - Record the bag size as old_bag_size.
+ *
+ * - While the bag isn't empty,
+ * - Remove from the bag all rmaps where startblock + len == np.
+ * - Add to the bag all rmaps in the array where startblock == np.
+ * - If the bag size isn't old_bag_size, store the refcount entry
+ * (sp, np - sp, bag_size) in the refcnt btree.
+ * - If the bag is empty, break out of the inner loop.
+ * - Set old_bag_size to the bag size
+ * - Set sp = np.
+ * - Set np to the physical block where the bag size will change.
+ * This is the minimum of (the pblk of the next unprocessed rmap)
+ * and (startblock + len of each rmap in the bag).
+ *
+ * Like all the other repairers, we make a list of all the refcount
+ * records we need, then reinitialize the refcount btree root and
+ * insert all the records.
+ */
+
+/* The only parts of the rmap that we care about for computing refcounts. */
+struct xrep_refc_rmap {
+ xfs_agblock_t startblock;
+ xfs_extlen_t blockcount;
+} __packed;
+
+struct xrep_refc {
+ /* refcount extents */
+ struct xfarray *refcount_records;
+
+ /* new refcountbt information */
+ struct xrep_newbt new_btree;
+
+ /* old refcountbt blocks */
+ struct xagb_bitmap old_refcountbt_blocks;
+
+ struct xfs_scrub *sc;
+
+ /* get_records()'s position in the refcount record array. */
+ xfarray_idx_t array_cur;
+
+ /* # of refcountbt blocks */
+ xfs_extlen_t btblocks;
+};
+
+/* Check for any obvious conflicts with this shared/CoW staging extent. */
+STATIC int
+xrep_refc_check_ext(
+ struct xfs_scrub *sc,
+ const struct xfs_refcount_irec *rec)
+{
+ enum xbtree_recpacking outcome;
+ int error;
+
+ if (xfs_refcount_check_irec(sc->sa.pag, rec) != NULL)
+ return -EFSCORRUPTED;
+
+ /* Make sure this isn't free space. */
+ error = xfs_alloc_has_records(sc->sa.bno_cur, rec->rc_startblock,
+ rec->rc_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ /* Must not be an inode chunk. */
+ error = xfs_ialloc_has_inodes_at_extent(sc->sa.ino_cur,
+ rec->rc_startblock, rec->rc_blockcount, &outcome);
+ if (error)
+ return error;
+ if (outcome != XBTREE_RECPACKING_EMPTY)
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Record a reference count extent. */
+STATIC int
+xrep_refc_stash(
+ struct xrep_refc *rr,
+ enum xfs_refc_domain domain,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len,
+ uint64_t refcount)
+{
+ struct xfs_refcount_irec irec = {
+ .rc_startblock = agbno,
+ .rc_blockcount = len,
+ .rc_domain = domain,
+ };
+ struct xfs_scrub *sc = rr->sc;
+ int error = 0;
+
+ if (xchk_should_terminate(sc, &error))
+ return error;
+
+ irec.rc_refcount = min_t(uint64_t, MAXREFCOUNT, refcount);
+
+ error = xrep_refc_check_ext(rr->sc, &irec);
+ if (error)
+ return error;
+
+ trace_xrep_refc_found(sc->sa.pag, &irec);
+
+ return xfarray_append(rr->refcount_records, &irec);
+}
+
+/* Record a CoW staging extent. */
+STATIC int
+xrep_refc_stash_cow(
+ struct xrep_refc *rr,
+ xfs_agblock_t agbno,
+ xfs_extlen_t len)
+{
+ return xrep_refc_stash(rr, XFS_REFC_DOMAIN_COW, agbno, len, 1);
+}
+
+/* Decide if an rmap could describe a shared extent. */
+static inline bool
+xrep_refc_rmap_shareable(
+ struct xfs_mount *mp,
+ const struct xfs_rmap_irec *rmap)
+{
+ /* AG metadata are never sharable */
+ if (XFS_RMAP_NON_INODE_OWNER(rmap->rm_owner))
+ return false;
+
+ /* Metadata in files are never shareable */
+ if (xfs_internal_inum(mp, rmap->rm_owner))
+ return false;
+
+ /* Metadata and unwritten file blocks are not shareable. */
+ if (rmap->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK |
+ XFS_RMAP_UNWRITTEN))
+ return false;
+
+ return true;
+}
+
+/*
+ * Walk along the reverse mapping records until we find one that could describe
+ * a shared extent.
+ */
+STATIC int
+xrep_refc_walk_rmaps(
+ struct xrep_refc *rr,
+ struct xrep_refc_rmap *rrm,
+ bool *have_rec)
+{
+ struct xfs_rmap_irec rmap;
+ struct xfs_btree_cur *cur = rr->sc->sa.rmap_cur;
+ struct xfs_mount *mp = cur->bc_mp;
+ int have_gt;
+ int error = 0;
+
+ *have_rec = false;
+
+ /*
+ * Loop through the remaining rmaps. Remember CoW staging
+ * extents and the refcountbt blocks from the old tree for later
+ * disposal. We can only share written data fork extents, so
+ * keep looping until we find an rmap for one.
+ */
+ do {
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ error = xfs_btree_increment(cur, 0, &have_gt);
+ if (error)
+ return error;
+ if (!have_gt)
+ return 0;
+
+ error = xfs_rmap_get_rec(cur, &rmap, &have_gt);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(mp, !have_gt))
+ return -EFSCORRUPTED;
+
+ if (rmap.rm_owner == XFS_RMAP_OWN_COW) {
+ error = xrep_refc_stash_cow(rr, rmap.rm_startblock,
+ rmap.rm_blockcount);
+ if (error)
+ return error;
+ } else if (rmap.rm_owner == XFS_RMAP_OWN_REFC) {
+ /* refcountbt block, dump it when we're done. */
+ rr->btblocks += rmap.rm_blockcount;
+ error = xagb_bitmap_set(&rr->old_refcountbt_blocks,
+ rmap.rm_startblock, rmap.rm_blockcount);
+ if (error)
+ return error;
+ }
+ } while (!xrep_refc_rmap_shareable(mp, &rmap));
+
+ rrm->startblock = rmap.rm_startblock;
+ rrm->blockcount = rmap.rm_blockcount;
+ *have_rec = true;
+ return 0;
+}
+
+static inline uint32_t
+xrep_refc_encode_startblock(
+ const struct xfs_refcount_irec *irec)
+{
+ uint32_t start;
+
+ start = irec->rc_startblock & ~XFS_REFC_COWFLAG;
+ if (irec->rc_domain == XFS_REFC_DOMAIN_COW)
+ start |= XFS_REFC_COWFLAG;
+
+ return start;
+}
+
+/* Sort in the same order as the ondisk records. */
+static int
+xrep_refc_extent_cmp(
+ const void *a,
+ const void *b)
+{
+ const struct xfs_refcount_irec *ap = a;
+ const struct xfs_refcount_irec *bp = b;
+ uint32_t sa, sb;
+
+ sa = xrep_refc_encode_startblock(ap);
+ sb = xrep_refc_encode_startblock(bp);
+
+ if (sa > sb)
+ return 1;
+ if (sa < sb)
+ return -1;
+ return 0;
+}
+
+/*
+ * Sort the refcount extents by startblock or else the btree records will be in
+ * the wrong order. Make sure the records do not overlap in physical space.
+ */
+STATIC int
+xrep_refc_sort_records(
+ struct xrep_refc *rr)
+{
+ struct xfs_refcount_irec irec;
+ xfarray_idx_t cur;
+ enum xfs_refc_domain dom = XFS_REFC_DOMAIN_SHARED;
+ xfs_agblock_t next_agbno = 0;
+ int error;
+
+ error = xfarray_sort(rr->refcount_records, xrep_refc_extent_cmp,
+ XFARRAY_SORT_KILLABLE);
+ if (error)
+ return error;
+
+ foreach_xfarray_idx(rr->refcount_records, cur) {
+ if (xchk_should_terminate(rr->sc, &error))
+ return error;
+
+ error = xfarray_load(rr->refcount_records, cur, &irec);
+ if (error)
+ return error;
+
+ if (dom == XFS_REFC_DOMAIN_SHARED &&
+ irec.rc_domain == XFS_REFC_DOMAIN_COW) {
+ dom = irec.rc_domain;
+ next_agbno = 0;
+ }
+
+ if (dom != irec.rc_domain)
+ return -EFSCORRUPTED;
+ if (irec.rc_startblock < next_agbno)
+ return -EFSCORRUPTED;
+
+ next_agbno = irec.rc_startblock + irec.rc_blockcount;
+ }
+
+ return error;
+}
+
+#define RRM_NEXT(r) ((r).startblock + (r).blockcount)
+/*
+ * Find the next block where the refcount changes, given the next rmap we
+ * looked at and the ones we're already tracking.
+ */
+static inline int
+xrep_refc_next_edge(
+ struct xfarray *rmap_bag,
+ struct xrep_refc_rmap *next_rrm,
+ bool next_valid,
+ xfs_agblock_t *nbnop)
+{
+ struct xrep_refc_rmap rrm;
+ xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT;
+ xfs_agblock_t nbno = NULLAGBLOCK;
+ int error;
+
+ if (next_valid)
+ nbno = next_rrm->startblock;
+
+ while ((error = xfarray_iter(rmap_bag, &array_cur, &rrm)) == 1)
+ nbno = min_t(xfs_agblock_t, nbno, RRM_NEXT(rrm));
+
+ if (error)
+ return error;
+
+ /*
+ * We should have found /something/ because either next_rrm is the next
+ * interesting rmap to look at after emitting this refcount extent, or
+ * there are other rmaps in rmap_bag contributing to the current
+ * sharing count. But if something is seriously wrong, bail out.
+ */
+ if (nbno == NULLAGBLOCK)
+ return -EFSCORRUPTED;
+
+ *nbnop = nbno;
+ return 0;
+}
+
+/*
+ * Walk forward through the rmap btree to collect all rmaps starting at
+ * @bno in @rmap_bag. These represent the file(s) that share ownership of
+ * the current block. Upon return, the rmap cursor points to the last record
+ * satisfying the startblock constraint.
+ */
+static int
+xrep_refc_push_rmaps_at(
+ struct xrep_refc *rr,
+ struct xfarray *rmap_bag,
+ xfs_agblock_t bno,
+ struct xrep_refc_rmap *rrm,
+ bool *have,
+ uint64_t *stack_sz)
+{
+ struct xfs_scrub *sc = rr->sc;
+ int have_gt;
+ int error;
+
+ while (*have && rrm->startblock == bno) {
+ error = xfarray_store_anywhere(rmap_bag, rrm);
+ if (error)
+ return error;
+ (*stack_sz)++;
+ error = xrep_refc_walk_rmaps(rr, rrm, have);
+ if (error)
+ return error;
+ }
+
+ error = xfs_btree_decrement(sc->sa.rmap_cur, 0, &have_gt);
+ if (error)
+ return error;
+ if (XFS_IS_CORRUPT(sc->mp, !have_gt))
+ return -EFSCORRUPTED;
+
+ return 0;
+}
+
+/* Iterate all the rmap records to generate reference count data. */
+STATIC int
+xrep_refc_find_refcounts(
+ struct xrep_refc *rr)
+{
+ struct xrep_refc_rmap rrm;
+ struct xfs_scrub *sc = rr->sc;
+ struct xfarray *rmap_bag;
+ char *descr;
+ uint64_t old_stack_sz;
+ uint64_t stack_sz = 0;
+ xfs_agblock_t sbno;
+ xfs_agblock_t cbno;
+ xfs_agblock_t nbno;
+ bool have;
+ int error;
+
+ xrep_ag_btcur_init(sc, &sc->sa);
+
+ /*
+ * Set up a sparse array to store all the rmap records that we're
+ * tracking to generate a reference count record. If this exceeds
+ * MAXREFCOUNT, we clamp rc_refcount.
+ */
+ descr = xchk_xfile_ag_descr(sc, "rmap record bag");
+ error = xfarray_create(descr, 0, sizeof(struct xrep_refc_rmap),
+ &rmap_bag);
+ kfree(descr);
+ if (error)
+ goto out_cur;
+
+ /* Start the rmapbt cursor to the left of all records. */
+ error = xfs_btree_goto_left_edge(sc->sa.rmap_cur);
+ if (error)
+ goto out_bag;
+
+ /* Process reverse mappings into refcount data. */
+ while (xfs_btree_has_more_records(sc->sa.rmap_cur)) {
+ /* Push all rmaps with pblk == sbno onto the stack */
+ error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+ if (error)
+ goto out_bag;
+ if (!have)
+ break;
+ sbno = cbno = rrm.startblock;
+ error = xrep_refc_push_rmaps_at(rr, rmap_bag, sbno,
+ &rrm, &have, &stack_sz);
+ if (error)
+ goto out_bag;
+
+ /* Set nbno to the bno of the next refcount change */
+ error = xrep_refc_next_edge(rmap_bag, &rrm, have, &nbno);
+ if (error)
+ goto out_bag;
+
+ ASSERT(nbno > sbno);
+ old_stack_sz = stack_sz;
+
+ /* While stack isn't empty... */
+ while (stack_sz) {
+ xfarray_idx_t array_cur = XFARRAY_CURSOR_INIT;
+
+ /* Pop all rmaps that end at nbno */
+ while ((error = xfarray_iter(rmap_bag, &array_cur,
+ &rrm)) == 1) {
+ if (RRM_NEXT(rrm) != nbno)
+ continue;
+ error = xfarray_unset(rmap_bag, array_cur - 1);
+ if (error)
+ goto out_bag;
+ stack_sz--;
+ }
+ if (error)
+ goto out_bag;
+
+ /* Push array items that start at nbno */
+ error = xrep_refc_walk_rmaps(rr, &rrm, &have);
+ if (error)
+ goto out_bag;
+ if (have) {
+ error = xrep_refc_push_rmaps_at(rr, rmap_bag,
+ nbno, &rrm, &have, &stack_sz);
+ if (error)
+ goto out_bag;
+ }
+
+ /* Emit refcount if necessary */
+ ASSERT(nbno > cbno);
+ if (stack_sz != old_stack_sz) {
+ if (old_stack_sz > 1) {
+ error = xrep_refc_stash(rr,
+ XFS_REFC_DOMAIN_SHARED,
+ cbno, nbno - cbno,
+ old_stack_sz);
+ if (error)
+ goto out_bag;
+ }
+ cbno = nbno;
+ }
+
+ /* Stack empty, go find the next rmap */
+ if (stack_sz == 0)
+ break;
+ old_stack_sz = stack_sz;
+ sbno = nbno;
+
+ /* Set nbno to the bno of the next refcount change */
+ error = xrep_refc_next_edge(rmap_bag, &rrm, have,
+ &nbno);
+ if (error)
+ goto out_bag;
+
+ ASSERT(nbno > sbno);
+ }
+ }
+
+ ASSERT(stack_sz == 0);
+out_bag:
+ xfarray_destroy(rmap_bag);
+out_cur:
+ xchk_ag_btcur_free(&sc->sa);
+ return error;
+}
+#undef RRM_NEXT
+
+/* Retrieve refcountbt data for bulk load. */
+STATIC int
+xrep_refc_get_records(
+ struct xfs_btree_cur *cur,
+ unsigned int idx,
+ struct xfs_btree_block *block,
+ unsigned int nr_wanted,
+ void *priv)
+{
+ struct xfs_refcount_irec *irec = &cur->bc_rec.rc;
+ struct xrep_refc *rr = priv;
+ union xfs_btree_rec *block_rec;
+ unsigned int loaded;
+ int error;
+
+ for (loaded = 0; loaded < nr_wanted; loaded++, idx++) {
+ error = xfarray_load(rr->refcount_records, rr->array_cur++,
+ irec);
+ if (error)
+ return error;
+
+ block_rec = xfs_btree_rec_addr(cur, idx, block);
+ cur->bc_ops->init_rec_from_cur(cur, block_rec);
+ }
+
+ return loaded;
+}
+
+/* Feed one of the new btree blocks to the bulk loader. */
+STATIC int
+xrep_refc_claim_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ void *priv)
+{
+ struct xrep_refc *rr = priv;
+
+ return xrep_newbt_claim_block(cur, &rr->new_btree, ptr);
+}
+
+/* Update the AGF counters. */
+STATIC int
+xrep_refc_reset_counters(
+ struct xrep_refc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+
+ /*
+ * After we commit the new btree to disk, it is possible that the
+ * process to reap the old btree blocks will race with the AIL trying
+ * to checkpoint the old btree blocks into the filesystem. If the new
+ * tree is shorter than the old one, the refcountbt write verifier will
+ * fail and the AIL will shut down the filesystem.
+ *
+ * To avoid this, save the old incore btree height values as the alt
+ * height values before re-initializing the perag info from the updated
+ * AGF to capture all the new values.
+ */
+ pag->pagf_repair_refcount_level = pag->pagf_refcount_level;
+
+ /* Reinitialize with the values we just logged. */
+ return xrep_reinit_pagf(sc);
+}
+
+/*
+ * Use the collected refcount information to stage a new refcount btree. If
+ * this is successful we'll return with the new btree root information logged
+ * to the repair transaction but not yet committed.
+ */
+STATIC int
+xrep_refc_build_new_tree(
+ struct xrep_refc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_btree_cur *refc_cur;
+ struct xfs_perag *pag = sc->sa.pag;
+ xfs_fsblock_t fsbno;
+ int error;
+
+ error = xrep_refc_sort_records(rr);
+ if (error)
+ return error;
+
+ /*
+ * Prepare to construct the new btree by reserving disk space for the
+ * new btree and setting up all the accounting information we'll need
+ * to root the new btree while it's under construction and before we
+ * attach it to the AG header.
+ */
+ fsbno = XFS_AGB_TO_FSB(sc->mp, pag->pag_agno, xfs_refc_block(sc->mp));
+ xrep_newbt_init_ag(&rr->new_btree, sc, &XFS_RMAP_OINFO_REFC, fsbno,
+ XFS_AG_RESV_METADATA);
+ rr->new_btree.bload.get_records = xrep_refc_get_records;
+ rr->new_btree.bload.claim_block = xrep_refc_claim_block;
+
+ /* Compute how many blocks we'll need. */
+ refc_cur = xfs_refcountbt_stage_cursor(sc->mp, &rr->new_btree.afake,
+ pag);
+ error = xfs_btree_bload_compute_geometry(refc_cur,
+ &rr->new_btree.bload,
+ xfarray_length(rr->refcount_records));
+ if (error)
+ goto err_cur;
+
+ /* Last chance to abort before we start committing fixes. */
+ if (xchk_should_terminate(sc, &error))
+ goto err_cur;
+
+ /* Reserve the space we'll need for the new btree. */
+ error = xrep_newbt_alloc_blocks(&rr->new_btree,
+ rr->new_btree.bload.nr_blocks);
+ if (error)
+ goto err_cur;
+
+ /*
+ * Due to btree slack factors, it's possible for a new btree to be one
+ * level taller than the old btree. Update the incore btree height so
+ * that we don't trip the verifiers when writing the new btree blocks
+ * to disk.
+ */
+ pag->pagf_repair_refcount_level = rr->new_btree.bload.btree_height;
+
+ /* Add all observed refcount records. */
+ rr->array_cur = XFARRAY_CURSOR_INIT;
+ error = xfs_btree_bload(refc_cur, &rr->new_btree.bload, rr);
+ if (error)
+ goto err_level;
+
+ /*
+ * Install the new btree in the AG header. After this point the old
+ * btree is no longer accessible and the new tree is live.
+ */
+ xfs_refcountbt_commit_staged_btree(refc_cur, sc->tp, sc->sa.agf_bp);
+ xfs_btree_del_cursor(refc_cur, 0);
+
+ /* Reset the AGF counters now that we've changed the btree shape. */
+ error = xrep_refc_reset_counters(rr);
+ if (error)
+ goto err_newbt;
+
+ /* Dispose of any unused blocks and the accounting information. */
+ error = xrep_newbt_commit(&rr->new_btree);
+ if (error)
+ return error;
+
+ return xrep_roll_ag_trans(sc);
+
+err_level:
+ pag->pagf_repair_refcount_level = 0;
+err_cur:
+ xfs_btree_del_cursor(refc_cur, error);
+err_newbt:
+ xrep_newbt_cancel(&rr->new_btree);
+ return error;
+}
+
+/*
+ * Now that we've logged the roots of the new btrees, invalidate all of the
+ * old blocks and free them.
+ */
+STATIC int
+xrep_refc_remove_old_tree(
+ struct xrep_refc *rr)
+{
+ struct xfs_scrub *sc = rr->sc;
+ struct xfs_perag *pag = sc->sa.pag;
+ int error;
+
+ /* Free the old refcountbt blocks if they're not in use. */
+ error = xrep_reap_agblocks(sc, &rr->old_refcountbt_blocks,
+ &XFS_RMAP_OINFO_REFC, XFS_AG_RESV_METADATA);
+ if (error)
+ return error;
+
+ /*
+ * Now that we've zapped all the old refcountbt blocks we can turn off
+ * the alternate height mechanism and reset the per-AG space
+ * reservations.
+ */
+ pag->pagf_repair_refcount_level = 0;
+ sc->flags |= XREP_RESET_PERAG_RESV;
+ return 0;
+}
+
+/* Rebuild the refcount btree. */
+int
+xrep_refcountbt(
+ struct xfs_scrub *sc)
+{
+ struct xrep_refc *rr;
+ struct xfs_mount *mp = sc->mp;
+ char *descr;
+ int error;
+
+ /* We require the rmapbt to rebuild anything. */
+ if (!xfs_has_rmapbt(mp))
+ return -EOPNOTSUPP;
+
+ rr = kzalloc(sizeof(struct xrep_refc), XCHK_GFP_FLAGS);
+ if (!rr)
+ return -ENOMEM;
+ rr->sc = sc;
+
+ /* Set up enough storage to handle one refcount record per block. */
+ descr = xchk_xfile_ag_descr(sc, "reference count records");
+ error = xfarray_create(descr, mp->m_sb.sb_agblocks,
+ sizeof(struct xfs_refcount_irec),
+ &rr->refcount_records);
+ kfree(descr);
+ if (error)
+ goto out_rr;
+
+ /* Collect all reference counts. */
+ xagb_bitmap_init(&rr->old_refcountbt_blocks);
+ error = xrep_refc_find_refcounts(rr);
+ if (error)
+ goto out_bitmap;
+
+ /* Rebuild the refcount information. */
+ error = xrep_refc_build_new_tree(rr);
+ if (error)
+ goto out_bitmap;
+
+ /* Kill the old tree. */
+ error = xrep_refc_remove_old_tree(rr);
+ if (error)
+ goto out_bitmap;
+
+out_bitmap:
+ xagb_bitmap_destroy(&rr->old_refcountbt_blocks);
+ xfarray_destroy(rr->refcount_records);
+out_rr:
+ kfree(rr);
+ return error;
+}
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 1b8b5439f2d7..745d5b8f405a 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -27,6 +27,9 @@
#include "xfs_quota.h"
#include "xfs_qm.h"
#include "xfs_defer.h"
+#include "xfs_errortag.h"
+#include "xfs_error.h"
+#include "xfs_reflink.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -176,6 +179,16 @@ xrep_roll_ag_trans(
return 0;
}
+/* Roll the scrub transaction, holding the primary metadata locked. */
+int
+xrep_roll_trans(
+ struct xfs_scrub *sc)
+{
+ if (!sc->ip)
+ return xrep_roll_ag_trans(sc);
+ return xfs_trans_roll_inode(&sc->tp, sc->ip);
+}
+
/* Finish all deferred work attached to the repair transaction. */
int
xrep_defer_finish(
@@ -673,6 +686,7 @@ xrep_find_ag_btree_roots(
return error;
}
+#ifdef CONFIG_XFS_QUOTA
/* Force a quotacheck the next time we mount. */
void
xrep_force_quotacheck(
@@ -699,10 +713,10 @@ xrep_force_quotacheck(
*
* This function ensures that the appropriate dquots are attached to an inode.
* We cannot allow the dquot code to allocate an on-disk dquot block here
- * because we're already in transaction context with the inode locked. The
- * on-disk dquot should already exist anyway. If the quota code signals
- * corruption or missing quota information, schedule quotacheck, which will
- * repair corruptions in the quota metadata.
+ * because we're already in transaction context. The on-disk dquot should
+ * already exist anyway. If the quota code signals corruption or missing quota
+ * information, schedule quotacheck, which will repair corruptions in the quota
+ * metadata.
*/
int
xrep_ino_dqattach(
@@ -710,7 +724,10 @@ xrep_ino_dqattach(
{
int error;
- error = xfs_qm_dqattach_locked(sc->ip, false);
+ ASSERT(sc->tp != NULL);
+ ASSERT(sc->ip != NULL);
+
+ error = xfs_qm_dqattach(sc->ip);
switch (error) {
case -EFSBADCRC:
case -EFSCORRUPTED:
@@ -734,3 +751,367 @@ xrep_ino_dqattach(
return error;
}
+#endif /* CONFIG_XFS_QUOTA */
+
+/*
+ * Ensure that the inode being repaired is ready to handle a certain number of
+ * extents, or return EFSCORRUPTED. Caller must hold the ILOCK of the inode
+ * being repaired and have joined it to the scrub transaction.
+ */
+int
+xrep_ino_ensure_extent_count(
+ struct xfs_scrub *sc,
+ int whichfork,
+ xfs_extnum_t nextents)
+{
+ xfs_extnum_t max_extents;
+ bool inode_has_nrext64;
+
+ inode_has_nrext64 = xfs_inode_has_large_extent_counts(sc->ip);
+ max_extents = xfs_iext_max_nextents(inode_has_nrext64, whichfork);
+ if (nextents <= max_extents)
+ return 0;
+ if (inode_has_nrext64)
+ return -EFSCORRUPTED;
+ if (!xfs_has_large_extent_counts(sc->mp))
+ return -EFSCORRUPTED;
+
+ max_extents = xfs_iext_max_nextents(true, whichfork);
+ if (nextents > max_extents)
+ return -EFSCORRUPTED;
+
+ sc->ip->i_diflags2 |= XFS_DIFLAG2_NREXT64;
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ return 0;
+}
+
+/*
+ * Initialize all the btree cursors for an AG repair except for the btree that
+ * we're rebuilding.
+ */
+void
+xrep_ag_btcur_init(
+ struct xfs_scrub *sc,
+ struct xchk_ag *sa)
+{
+ struct xfs_mount *mp = sc->mp;
+
+ /* Set up a bnobt cursor for cross-referencing. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_BNOBT &&
+ sc->sm->sm_type != XFS_SCRUB_TYPE_CNTBT) {
+ sa->bno_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag, XFS_BTNUM_BNO);
+ sa->cnt_cur = xfs_allocbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag, XFS_BTNUM_CNT);
+ }
+
+ /* Set up a inobt cursor for cross-referencing. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_INOBT &&
+ sc->sm->sm_type != XFS_SCRUB_TYPE_FINOBT) {
+ sa->ino_cur = xfs_inobt_init_cursor(sc->sa.pag, sc->tp,
+ sa->agi_bp, XFS_BTNUM_INO);
+ if (xfs_has_finobt(mp))
+ sa->fino_cur = xfs_inobt_init_cursor(sc->sa.pag,
+ sc->tp, sa->agi_bp, XFS_BTNUM_FINO);
+ }
+
+ /* Set up a rmapbt cursor for cross-referencing. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_RMAPBT &&
+ xfs_has_rmapbt(mp))
+ sa->rmap_cur = xfs_rmapbt_init_cursor(mp, sc->tp, sa->agf_bp,
+ sc->sa.pag);
+
+ /* Set up a refcountbt cursor for cross-referencing. */
+ if (sc->sm->sm_type != XFS_SCRUB_TYPE_REFCNTBT &&
+ xfs_has_reflink(mp))
+ sa->refc_cur = xfs_refcountbt_init_cursor(mp, sc->tp,
+ sa->agf_bp, sc->sa.pag);
+}
+
+/*
+ * Reinitialize the in-core AG state after a repair by rereading the AGF
+ * buffer. We had better get the same AGF buffer as the one that's attached
+ * to the scrub context.
+ */
+int
+xrep_reinit_pagf(
+ struct xfs_scrub *sc)
+{
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(pag);
+ ASSERT(xfs_perag_initialised_agf(pag));
+
+ clear_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate);
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &bp);
+ if (error)
+ return error;
+
+ if (bp != sc->sa.agf_bp) {
+ ASSERT(bp == sc->sa.agf_bp);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/*
+ * Reinitialize the in-core AG state after a repair by rereading the AGI
+ * buffer. We had better get the same AGI buffer as the one that's attached
+ * to the scrub context.
+ */
+int
+xrep_reinit_pagi(
+ struct xfs_scrub *sc)
+{
+ struct xfs_perag *pag = sc->sa.pag;
+ struct xfs_buf *bp;
+ int error;
+
+ ASSERT(pag);
+ ASSERT(xfs_perag_initialised_agi(pag));
+
+ clear_bit(XFS_AGSTATE_AGI_INIT, &pag->pag_opstate);
+ error = xfs_ialloc_read_agi(pag, sc->tp, &bp);
+ if (error)
+ return error;
+
+ if (bp != sc->sa.agi_bp) {
+ ASSERT(bp == sc->sa.agi_bp);
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
+}
+
+/*
+ * Given an active reference to a perag structure, load AG headers and cursors.
+ * This should only be called to scan an AG while repairing file-based metadata.
+ */
+int
+xrep_ag_init(
+ struct xfs_scrub *sc,
+ struct xfs_perag *pag,
+ struct xchk_ag *sa)
+{
+ int error;
+
+ ASSERT(!sa->pag);
+
+ error = xfs_ialloc_read_agi(pag, sc->tp, &sa->agi_bp);
+ if (error)
+ return error;
+
+ error = xfs_alloc_read_agf(pag, sc->tp, 0, &sa->agf_bp);
+ if (error)
+ return error;
+
+ /* Grab our own passive reference from the caller's ref. */
+ sa->pag = xfs_perag_hold(pag);
+ xrep_ag_btcur_init(sc, sa);
+ return 0;
+}
+
+/* Reinitialize the per-AG block reservation for the AG we just fixed. */
+int
+xrep_reset_perag_resv(
+ struct xfs_scrub *sc)
+{
+ int error;
+
+ if (!(sc->flags & XREP_RESET_PERAG_RESV))
+ return 0;
+
+ ASSERT(sc->sa.pag != NULL);
+ ASSERT(sc->ops->type == ST_PERAG);
+ ASSERT(sc->tp);
+
+ sc->flags &= ~XREP_RESET_PERAG_RESV;
+ error = xfs_ag_resv_free(sc->sa.pag);
+ if (error)
+ goto out;
+ error = xfs_ag_resv_init(sc->sa.pag, sc->tp);
+ if (error == -ENOSPC) {
+ xfs_err(sc->mp,
+"Insufficient free space to reset per-AG reservation for AG %u after repair.",
+ sc->sa.pag->pag_agno);
+ error = 0;
+ }
+
+out:
+ return error;
+}
+
+/* Decide if we are going to call the repair function for a scrub type. */
+bool
+xrep_will_attempt(
+ struct xfs_scrub *sc)
+{
+ /* Userspace asked us to rebuild the structure regardless. */
+ if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
+ return true;
+
+ /* Let debug users force us into the repair routines. */
+ if (XFS_TEST_ERROR(false, sc->mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
+ return true;
+
+ /* Metadata is corrupt or failed cross-referencing. */
+ if (xchk_needs_repair(sc->sm))
+ return true;
+
+ return false;
+}
+
+/* Try to fix some part of a metadata inode by calling another scrubber. */
+STATIC int
+xrep_metadata_inode_subtype(
+ struct xfs_scrub *sc,
+ unsigned int scrub_type)
+{
+ __u32 smtype = sc->sm->sm_type;
+ __u32 smflags = sc->sm->sm_flags;
+ unsigned int sick_mask = sc->sick_mask;
+ int error;
+
+ /*
+ * Let's see if the inode needs repair. We're going to open-code calls
+ * to the scrub and repair functions so that we can hang on to the
+ * resources that we already acquired instead of using the standard
+ * setup/teardown routines.
+ */
+ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+ sc->sm->sm_type = scrub_type;
+
+ switch (scrub_type) {
+ case XFS_SCRUB_TYPE_INODE:
+ error = xchk_inode(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTD:
+ error = xchk_bmap_data(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTA:
+ error = xchk_bmap_attr(sc);
+ break;
+ default:
+ ASSERT(0);
+ error = -EFSCORRUPTED;
+ }
+ if (error)
+ goto out;
+
+ if (!xrep_will_attempt(sc))
+ goto out;
+
+ /*
+ * Repair some part of the inode. This will potentially join the inode
+ * to the transaction.
+ */
+ switch (scrub_type) {
+ case XFS_SCRUB_TYPE_INODE:
+ error = xrep_inode(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTD:
+ error = xrep_bmap(sc, XFS_DATA_FORK, false);
+ break;
+ case XFS_SCRUB_TYPE_BMBTA:
+ error = xrep_bmap(sc, XFS_ATTR_FORK, false);
+ break;
+ }
+ if (error)
+ goto out;
+
+ /*
+ * Finish all deferred intent items and then roll the transaction so
+ * that the inode will not be joined to the transaction when we exit
+ * the function.
+ */
+ error = xfs_defer_finish(&sc->tp);
+ if (error)
+ goto out;
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ goto out;
+
+ /*
+ * Clear the corruption flags and re-check the metadata that we just
+ * repaired.
+ */
+ sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
+
+ switch (scrub_type) {
+ case XFS_SCRUB_TYPE_INODE:
+ error = xchk_inode(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTD:
+ error = xchk_bmap_data(sc);
+ break;
+ case XFS_SCRUB_TYPE_BMBTA:
+ error = xchk_bmap_attr(sc);
+ break;
+ }
+ if (error)
+ goto out;
+
+ /* If corruption persists, the repair has failed. */
+ if (xchk_needs_repair(sc->sm)) {
+ error = -EFSCORRUPTED;
+ goto out;
+ }
+out:
+ sc->sick_mask = sick_mask;
+ sc->sm->sm_type = smtype;
+ sc->sm->sm_flags = smflags;
+ return error;
+}
+
+/*
+ * Repair the ondisk forks of a metadata inode. The caller must ensure that
+ * sc->ip points to the metadata inode and the ILOCK is held on that inode.
+ * The inode must not be joined to the transaction before the call, and will
+ * not be afterwards.
+ */
+int
+xrep_metadata_inode_forks(
+ struct xfs_scrub *sc)
+{
+ bool dirty = false;
+ int error;
+
+ /* Repair the inode record and the data fork. */
+ error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_INODE);
+ if (error)
+ return error;
+
+ error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTD);
+ if (error)
+ return error;
+
+ /* Make sure the attr fork looks ok before we delete it. */
+ error = xrep_metadata_inode_subtype(sc, XFS_SCRUB_TYPE_BMBTA);
+ if (error)
+ return error;
+
+ /* Clear the reflink flag since metadata never shares. */
+ if (xfs_is_reflink_inode(sc->ip)) {
+ dirty = true;
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+ error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
+ if (error)
+ return error;
+ }
+
+ /*
+ * If we modified the inode, roll the transaction but don't rejoin the
+ * inode to the new transaction because xrep_bmap_data can do that.
+ */
+ if (dirty) {
+ error = xfs_trans_roll(&sc->tp);
+ if (error)
+ return error;
+ dirty = false;
+ }
+
+ return 0;
+}
diff --git a/fs/xfs/scrub/repair.h b/fs/xfs/scrub/repair.h
index 60d2a9ae5f2e..17114327e6fa 100644
--- a/fs/xfs/scrub/repair.h
+++ b/fs/xfs/scrub/repair.h
@@ -28,15 +28,28 @@ static inline int xrep_notsupported(struct xfs_scrub *sc)
/* Repair helpers */
int xrep_attempt(struct xfs_scrub *sc, struct xchk_stats_run *run);
+bool xrep_will_attempt(struct xfs_scrub *sc);
void xrep_failure(struct xfs_mount *mp);
int xrep_roll_ag_trans(struct xfs_scrub *sc);
+int xrep_roll_trans(struct xfs_scrub *sc);
int xrep_defer_finish(struct xfs_scrub *sc);
bool xrep_ag_has_space(struct xfs_perag *pag, xfs_extlen_t nr_blocks,
enum xfs_ag_resv_type type);
xfs_extlen_t xrep_calc_ag_resblks(struct xfs_scrub *sc);
+static inline int
+xrep_trans_commit(
+ struct xfs_scrub *sc)
+{
+ int error = xfs_trans_commit(sc->tp);
+
+ sc->tp = NULL;
+ return error;
+}
+
struct xbitmap;
struct xagb_bitmap;
+struct xfsb_bitmap;
int xrep_fix_freelist(struct xfs_scrub *sc, bool can_shrink);
@@ -57,8 +70,35 @@ struct xrep_find_ag_btree {
int xrep_find_ag_btree_roots(struct xfs_scrub *sc, struct xfs_buf *agf_bp,
struct xrep_find_ag_btree *btree_info, struct xfs_buf *agfl_bp);
+
+#ifdef CONFIG_XFS_QUOTA
void xrep_force_quotacheck(struct xfs_scrub *sc, xfs_dqtype_t type);
int xrep_ino_dqattach(struct xfs_scrub *sc);
+#else
+# define xrep_force_quotacheck(sc, type) ((void)0)
+# define xrep_ino_dqattach(sc) (0)
+#endif /* CONFIG_XFS_QUOTA */
+
+int xrep_ino_ensure_extent_count(struct xfs_scrub *sc, int whichfork,
+ xfs_extnum_t nextents);
+int xrep_reset_perag_resv(struct xfs_scrub *sc);
+int xrep_bmap(struct xfs_scrub *sc, int whichfork, bool allow_unwritten);
+int xrep_metadata_inode_forks(struct xfs_scrub *sc);
+
+/* Repair setup functions */
+int xrep_setup_ag_allocbt(struct xfs_scrub *sc);
+
+struct xfs_imap;
+int xrep_setup_inode(struct xfs_scrub *sc, const struct xfs_imap *imap);
+
+void xrep_ag_btcur_init(struct xfs_scrub *sc, struct xchk_ag *sa);
+int xrep_ag_init(struct xfs_scrub *sc, struct xfs_perag *pag,
+ struct xchk_ag *sa);
+
+/* Metadata revalidators */
+
+int xrep_revalidate_allocbt(struct xfs_scrub *sc);
+int xrep_revalidate_iallocbt(struct xfs_scrub *sc);
/* Metadata repairers */
@@ -67,9 +107,34 @@ int xrep_superblock(struct xfs_scrub *sc);
int xrep_agf(struct xfs_scrub *sc);
int xrep_agfl(struct xfs_scrub *sc);
int xrep_agi(struct xfs_scrub *sc);
+int xrep_allocbt(struct xfs_scrub *sc);
+int xrep_iallocbt(struct xfs_scrub *sc);
+int xrep_refcountbt(struct xfs_scrub *sc);
+int xrep_inode(struct xfs_scrub *sc);
+int xrep_bmap_data(struct xfs_scrub *sc);
+int xrep_bmap_attr(struct xfs_scrub *sc);
+int xrep_bmap_cow(struct xfs_scrub *sc);
+
+#ifdef CONFIG_XFS_RT
+int xrep_rtbitmap(struct xfs_scrub *sc);
+#else
+# define xrep_rtbitmap xrep_notsupported
+#endif /* CONFIG_XFS_RT */
+
+#ifdef CONFIG_XFS_QUOTA
+int xrep_quota(struct xfs_scrub *sc);
+#else
+# define xrep_quota xrep_notsupported
+#endif /* CONFIG_XFS_QUOTA */
+
+int xrep_reinit_pagf(struct xfs_scrub *sc);
+int xrep_reinit_pagi(struct xfs_scrub *sc);
#else
+#define xrep_ino_dqattach(sc) (0)
+#define xrep_will_attempt(sc) (false)
+
static inline int
xrep_attempt(
struct xfs_scrub *sc,
@@ -87,11 +152,45 @@ xrep_calc_ag_resblks(
return 0;
}
+static inline int
+xrep_reset_perag_resv(
+ struct xfs_scrub *sc)
+{
+ if (!(sc->flags & XREP_RESET_PERAG_RESV))
+ return 0;
+
+ ASSERT(0);
+ return -EOPNOTSUPP;
+}
+
+/* repair setup functions for no-repair */
+static inline int
+xrep_setup_nothing(
+ struct xfs_scrub *sc)
+{
+ return 0;
+}
+#define xrep_setup_ag_allocbt xrep_setup_nothing
+
+#define xrep_setup_inode(sc, imap) ((void)0)
+
+#define xrep_revalidate_allocbt (NULL)
+#define xrep_revalidate_iallocbt (NULL)
+
#define xrep_probe xrep_notsupported
#define xrep_superblock xrep_notsupported
#define xrep_agf xrep_notsupported
#define xrep_agfl xrep_notsupported
#define xrep_agi xrep_notsupported
+#define xrep_allocbt xrep_notsupported
+#define xrep_iallocbt xrep_notsupported
+#define xrep_refcountbt xrep_notsupported
+#define xrep_inode xrep_notsupported
+#define xrep_bmap_data xrep_notsupported
+#define xrep_bmap_attr xrep_notsupported
+#define xrep_bmap_cow xrep_notsupported
+#define xrep_rtbitmap xrep_notsupported
+#define xrep_quota xrep_notsupported
#endif /* CONFIG_XFS_ONLINE_REPAIR */
diff --git a/fs/xfs/scrub/rmap.c b/fs/xfs/scrub/rmap.c
index d29a26ecddd6..c99d1714f283 100644
--- a/fs/xfs/scrub/rmap.c
+++ b/fs/xfs/scrub/rmap.c
@@ -24,6 +24,7 @@
#include "scrub/common.h"
#include "scrub/btree.h"
#include "scrub/bitmap.h"
+#include "scrub/agb_bitmap.h"
/*
* Set us up to scrub reverse mapping btrees.
diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c
index 41a1d89ae8e6..46583517377f 100644
--- a/fs/xfs/scrub/rtbitmap.c
+++ b/fs/xfs/scrub/rtbitmap.c
@@ -14,17 +14,34 @@
#include "xfs_rtbitmap.h"
#include "xfs_inode.h"
#include "xfs_bmap.h"
+#include "xfs_bit.h"
+#include "xfs_sb.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
+#include "scrub/repair.h"
+#include "scrub/rtbitmap.h"
/* Set us up with the realtime metadata locked. */
int
xchk_setup_rtbitmap(
struct xfs_scrub *sc)
{
+ struct xfs_mount *mp = sc->mp;
+ struct xchk_rtbitmap *rtb;
int error;
- error = xchk_trans_alloc(sc, 0);
+ rtb = kzalloc(sizeof(struct xchk_rtbitmap), XCHK_GFP_FLAGS);
+ if (!rtb)
+ return -ENOMEM;
+ sc->buf = rtb;
+
+ if (xchk_could_repair(sc)) {
+ error = xrep_setup_rtbitmap(sc, rtb);
+ if (error)
+ return error;
+ }
+
+ error = xchk_trans_alloc(sc, rtb->resblks);
if (error)
return error;
@@ -32,7 +49,22 @@ xchk_setup_rtbitmap(
if (error)
return error;
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ return error;
+
xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTBITMAP);
+
+ /*
+ * Now that we've locked the rtbitmap, we can't race with growfsrt
+ * trying to expand the bitmap or change the size of the rt volume.
+ * Hence it is safe to compute and check the geometry values.
+ */
+ if (mp->m_sb.sb_rblocks) {
+ rtb->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
+ rtb->rextslog = xfs_compute_rextslog(rtb->rextents);
+ rtb->rbmblocks = xfs_rtbitmap_blockcount(mp, rtb->rextents);
+ }
return 0;
}
@@ -63,21 +95,30 @@ STATIC int
xchk_rtbitmap_check_extents(
struct xfs_scrub *sc)
{
- struct xfs_mount *mp = sc->mp;
struct xfs_bmbt_irec map;
- xfs_rtblock_t off;
- int nmap;
+ struct xfs_iext_cursor icur;
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ xfs_fileoff_t off = 0;
+ xfs_fileoff_t endoff;
int error = 0;
- for (off = 0; off < mp->m_sb.sb_rbmblocks;) {
+ /* Mappings may not cross or lie beyond EOF. */
+ endoff = XFS_B_TO_FSB(mp, ip->i_disk_size);
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff);
+ return 0;
+ }
+
+ while (off < endoff) {
+ int nmap = 1;
+
if (xchk_should_terminate(sc, &error) ||
(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
break;
/* Make sure we have a written extent. */
- nmap = 1;
- error = xfs_bmapi_read(mp->m_rbmip, off,
- mp->m_sb.sb_rbmblocks - off, &map, &nmap,
+ error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap,
XFS_DATA_FORK);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
break;
@@ -98,12 +139,48 @@ int
xchk_rtbitmap(
struct xfs_scrub *sc)
{
+ struct xfs_mount *mp = sc->mp;
+ struct xchk_rtbitmap *rtb = sc->buf;
int error;
- /* Is the size of the rtbitmap correct? */
- if (sc->mp->m_rbmip->i_disk_size !=
- XFS_FSB_TO_B(sc->mp, sc->mp->m_sb.sb_rbmblocks)) {
- xchk_ino_set_corrupt(sc, sc->mp->m_rbmip->i_ino);
+ /* Is sb_rextents correct? */
+ if (mp->m_sb.sb_rextents != rtb->rextents) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+
+ /* Is sb_rextslog correct? */
+ if (mp->m_sb.sb_rextslog != rtb->rextslog) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+
+ /*
+ * Is sb_rbmblocks large enough to handle the current rt volume? In no
+ * case can we exceed 4bn bitmap blocks since the super field is a u32.
+ */
+ if (rtb->rbmblocks > U32_MAX) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+ if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+
+ /* The bitmap file length must be aligned to an fsblock. */
+ if (mp->m_rbmip->i_disk_size & mp->m_blockmask) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ return 0;
+ }
+
+ /*
+ * Is the bitmap file itself large enough to handle the rt volume?
+ * growfsrt expands the bitmap file before updating sb_rextents, so the
+ * file can be larger than sb_rbmblocks.
+ */
+ if (mp->m_rbmip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks)) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
return 0;
}
@@ -116,12 +193,11 @@ xchk_rtbitmap(
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
return error;
- error = xfs_rtalloc_query_all(sc->mp, sc->tp, xchk_rtbitmap_rec, sc);
+ error = xfs_rtalloc_query_all(mp, sc->tp, xchk_rtbitmap_rec, sc);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
- goto out;
+ return error;
-out:
- return error;
+ return 0;
}
/* xref check that the extent is not free in the rtbitmap */
diff --git a/fs/xfs/scrub/rtbitmap.h b/fs/xfs/scrub/rtbitmap.h
new file mode 100644
index 000000000000..85304ff019e1
--- /dev/null
+++ b/fs/xfs/scrub/rtbitmap.h
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#ifndef __XFS_SCRUB_RTBITMAP_H__
+#define __XFS_SCRUB_RTBITMAP_H__
+
+struct xchk_rtbitmap {
+ uint64_t rextents;
+ uint64_t rbmblocks;
+ unsigned int rextslog;
+ unsigned int resblks;
+};
+
+#ifdef CONFIG_XFS_ONLINE_REPAIR
+int xrep_setup_rtbitmap(struct xfs_scrub *sc, struct xchk_rtbitmap *rtb);
+#else
+# define xrep_setup_rtbitmap(sc, rtb) (0)
+#endif /* CONFIG_XFS_ONLINE_REPAIR */
+
+#endif /* __XFS_SCRUB_RTBITMAP_H__ */
diff --git a/fs/xfs/scrub/rtbitmap_repair.c b/fs/xfs/scrub/rtbitmap_repair.c
new file mode 100644
index 000000000000..46f5d5f605c9
--- /dev/null
+++ b/fs/xfs/scrub/rtbitmap_repair.c
@@ -0,0 +1,202 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2020-2023 Oracle. All Rights Reserved.
+ * Author: Darrick J. Wong <djwong@kernel.org>
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_log_format.h"
+#include "xfs_trans.h"
+#include "xfs_inode.h"
+#include "xfs_bit.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_btree.h"
+#include "scrub/scrub.h"
+#include "scrub/common.h"
+#include "scrub/trace.h"
+#include "scrub/repair.h"
+#include "scrub/xfile.h"
+#include "scrub/rtbitmap.h"
+
+/* Set up to repair the realtime bitmap file metadata. */
+int
+xrep_setup_rtbitmap(
+ struct xfs_scrub *sc,
+ struct xchk_rtbitmap *rtb)
+{
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long blocks = 0;
+
+ /*
+ * Reserve enough blocks to write out a completely new bmbt for a
+ * maximally fragmented bitmap file. We do not hold the rtbitmap
+ * ILOCK yet, so this is entirely speculative.
+ */
+ blocks = xfs_bmbt_calc_size(mp, mp->m_sb.sb_rbmblocks);
+ if (blocks > UINT_MAX)
+ return -EOPNOTSUPP;
+
+ rtb->resblks += blocks;
+ return 0;
+}
+
+/*
+ * Make sure that the given range of the data fork of the realtime file is
+ * mapped to written blocks. The caller must ensure that the inode is joined
+ * to the transaction.
+ */
+STATIC int
+xrep_rtbitmap_data_mappings(
+ struct xfs_scrub *sc,
+ xfs_filblks_t len)
+{
+ struct xfs_bmbt_irec map;
+ xfs_fileoff_t off = 0;
+ int error;
+
+ ASSERT(sc->ip != NULL);
+
+ while (off < len) {
+ int nmaps = 1;
+
+ /*
+ * If we have a real extent mapping this block then we're
+ * in ok shape.
+ */
+ error = xfs_bmapi_read(sc->ip, off, len - off, &map, &nmaps,
+ XFS_DATA_FORK);
+ if (error)
+ return error;
+ if (nmaps == 0) {
+ ASSERT(nmaps != 0);
+ return -EFSCORRUPTED;
+ }
+
+ /*
+ * Written extents are ok. Holes are not filled because we
+ * do not know the freespace information.
+ */
+ if (xfs_bmap_is_written_extent(&map) ||
+ map.br_startblock == HOLESTARTBLOCK) {
+ off = map.br_startoff + map.br_blockcount;
+ continue;
+ }
+
+ /*
+ * If we find a delalloc reservation then something is very
+ * very wrong. Bail out.
+ */
+ if (map.br_startblock == DELAYSTARTBLOCK)
+ return -EFSCORRUPTED;
+
+ /* Make sure we're really converting an unwritten extent. */
+ if (map.br_state != XFS_EXT_UNWRITTEN) {
+ ASSERT(map.br_state == XFS_EXT_UNWRITTEN);
+ return -EFSCORRUPTED;
+ }
+
+ /* Make sure this block has a real zeroed extent mapped. */
+ nmaps = 1;
+ error = xfs_bmapi_write(sc->tp, sc->ip, map.br_startoff,
+ map.br_blockcount,
+ XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO,
+ 0, &map, &nmaps);
+ if (error)
+ return error;
+ if (nmaps != 1)
+ return -EFSCORRUPTED;
+
+ /* Commit new extent and all deferred work. */
+ error = xrep_defer_finish(sc);
+ if (error)
+ return error;
+
+ off = map.br_startoff + map.br_blockcount;
+ }
+
+ return 0;
+}
+
+/* Fix broken rt volume geometry. */
+STATIC int
+xrep_rtbitmap_geometry(
+ struct xfs_scrub *sc,
+ struct xchk_rtbitmap *rtb)
+{
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_trans *tp = sc->tp;
+
+ /* Superblock fields */
+ if (mp->m_sb.sb_rextents != rtb->rextents)
+ xfs_trans_mod_sb(sc->tp, XFS_TRANS_SB_REXTENTS,
+ rtb->rextents - mp->m_sb.sb_rextents);
+
+ if (mp->m_sb.sb_rbmblocks != rtb->rbmblocks)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS,
+ rtb->rbmblocks - mp->m_sb.sb_rbmblocks);
+
+ if (mp->m_sb.sb_rextslog != rtb->rextslog)
+ xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG,
+ rtb->rextslog - mp->m_sb.sb_rextslog);
+
+ /* Fix broken isize */
+ sc->ip->i_disk_size = roundup_64(sc->ip->i_disk_size,
+ mp->m_sb.sb_blocksize);
+
+ if (sc->ip->i_disk_size < XFS_FSB_TO_B(mp, rtb->rbmblocks))
+ sc->ip->i_disk_size = XFS_FSB_TO_B(mp, rtb->rbmblocks);
+
+ xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
+ return xrep_roll_trans(sc);
+}
+
+/* Repair the realtime bitmap file metadata. */
+int
+xrep_rtbitmap(
+ struct xfs_scrub *sc)
+{
+ struct xchk_rtbitmap *rtb = sc->buf;
+ struct xfs_mount *mp = sc->mp;
+ unsigned long long blocks = 0;
+ int error;
+
+ /* Impossibly large rtbitmap means we can't touch the filesystem. */
+ if (rtb->rbmblocks > U32_MAX)
+ return 0;
+
+ /*
+ * If the size of the rt bitmap file is larger than what we reserved,
+ * figure out if we need to adjust the block reservation in the
+ * transaction.
+ */
+ blocks = xfs_bmbt_calc_size(mp, rtb->rbmblocks);
+ if (blocks > UINT_MAX)
+ return -EOPNOTSUPP;
+ if (blocks > rtb->resblks) {
+ error = xfs_trans_reserve_more(sc->tp, blocks, 0);
+ if (error)
+ return error;
+
+ rtb->resblks += blocks;
+ }
+
+ /* Fix inode core and forks. */
+ error = xrep_metadata_inode_forks(sc);
+ if (error)
+ return error;
+
+ xfs_trans_ijoin(sc->tp, sc->ip, 0);
+
+ /* Ensure no unwritten extents. */
+ error = xrep_rtbitmap_data_mappings(sc, rtb->rbmblocks);
+ if (error)
+ return error;
+
+ /* Fix inconsistent bitmap geometry */
+ return xrep_rtbitmap_geometry(sc, rtb);
+}
diff --git a/fs/xfs/scrub/rtsummary.c b/fs/xfs/scrub/rtsummary.c
index 8b15c47408d0..b1ff4f33324a 100644
--- a/fs/xfs/scrub/rtsummary.c
+++ b/fs/xfs/scrub/rtsummary.c
@@ -16,6 +16,7 @@
#include "xfs_rtbitmap.h"
#include "xfs_bit.h"
#include "xfs_bmap.h"
+#include "xfs_sb.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"
@@ -31,6 +32,18 @@
* (potentially large) amount of data in pageable memory.
*/
+struct xchk_rtsummary {
+ struct xfs_rtalloc_args args;
+
+ uint64_t rextents;
+ uint64_t rbmblocks;
+ uint64_t rsumsize;
+ unsigned int rsumlevels;
+
+ /* Memory buffer for the summary comparison. */
+ union xfs_suminfo_raw words[];
+};
+
/* Set us up to check the rtsummary file. */
int
xchk_setup_rtsummary(
@@ -38,8 +51,15 @@ xchk_setup_rtsummary(
{
struct xfs_mount *mp = sc->mp;
char *descr;
+ struct xchk_rtsummary *rts;
int error;
+ rts = kvzalloc(struct_size(rts, words, mp->m_blockwsize),
+ XCHK_GFP_FLAGS);
+ if (!rts)
+ return -ENOMEM;
+ sc->buf = rts;
+
/*
* Create an xfile to construct a new rtsummary file. The xfile allows
* us to avoid pinning kernel memory for this purpose.
@@ -54,15 +74,14 @@ xchk_setup_rtsummary(
if (error)
return error;
- /* Allocate a memory buffer for the summary comparison. */
- sc->buf = kvmalloc(mp->m_sb.sb_blocksize, XCHK_GFP_FLAGS);
- if (!sc->buf)
- return -ENOMEM;
-
error = xchk_install_live_inode(sc, mp->m_rsumip);
if (error)
return error;
+ error = xchk_ino_dqattach(sc);
+ if (error)
+ return error;
+
/*
* Locking order requires us to take the rtbitmap first. We must be
* careful to unlock it ourselves when we are done with the rtbitmap
@@ -71,13 +90,29 @@ xchk_setup_rtsummary(
*/
xfs_ilock(mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP);
xchk_ilock(sc, XFS_ILOCK_EXCL | XFS_ILOCK_RTSUM);
+
+ /*
+ * Now that we've locked the rtbitmap and rtsummary, we can't race with
+ * growfsrt trying to expand the summary or change the size of the rt
+ * volume. Hence it is safe to compute and check the geometry values.
+ */
+ if (mp->m_sb.sb_rblocks) {
+ xfs_filblks_t rsumblocks;
+ int rextslog;
+
+ rts->rextents = xfs_rtb_to_rtx(mp, mp->m_sb.sb_rblocks);
+ rextslog = xfs_compute_rextslog(rts->rextents);
+ rts->rsumlevels = rextslog + 1;
+ rts->rbmblocks = xfs_rtbitmap_blockcount(mp, rts->rextents);
+ rsumblocks = xfs_rtsummary_blockcount(mp, rts->rsumlevels,
+ rts->rbmblocks);
+ rts->rsumsize = XFS_FSB_TO_B(mp, rsumblocks);
+ }
return 0;
}
/* Helper functions to record suminfo words in an xfile. */
-typedef unsigned int xchk_rtsumoff_t;
-
static inline int
xfsum_load(
struct xfs_scrub *sc,
@@ -143,7 +178,7 @@ xchk_rtsum_record_free(
/* Compute the relevant location in the rtsum file. */
rbmoff = xfs_rtx_to_rbmblock(mp, rec->ar_startext);
- lenlog = XFS_RTBLOCKLOG(rec->ar_extcount);
+ lenlog = xfs_highbit64(rec->ar_extcount);
offs = xfs_rtsumoffs(mp, lenlog, rbmoff);
rtbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
@@ -188,19 +223,29 @@ STATIC int
xchk_rtsum_compare(
struct xfs_scrub *sc)
{
- struct xfs_rtalloc_args args = {
- .mp = sc->mp,
- .tp = sc->tp,
- };
- struct xfs_mount *mp = sc->mp;
struct xfs_bmbt_irec map;
- xfs_fileoff_t off;
- xchk_rtsumoff_t sumoff = 0;
- int nmap;
+ struct xfs_iext_cursor icur;
+
+ struct xfs_mount *mp = sc->mp;
+ struct xfs_inode *ip = sc->ip;
+ struct xchk_rtsummary *rts = sc->buf;
+ xfs_fileoff_t off = 0;
+ xfs_fileoff_t endoff;
+ xfs_rtsumoff_t sumoff = 0;
+ int error = 0;
+
+ rts->args.mp = sc->mp;
+ rts->args.tp = sc->tp;
+
+ /* Mappings may not cross or lie beyond EOF. */
+ endoff = XFS_B_TO_FSB(mp, ip->i_disk_size);
+ if (xfs_iext_lookup_extent(ip, &ip->i_df, endoff, &icur, &map)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, endoff);
+ return 0;
+ }
- for (off = 0; off < XFS_B_TO_FSB(mp, mp->m_rsumsize); off++) {
- union xfs_suminfo_raw *ondisk_info;
- int error = 0;
+ while (off < endoff) {
+ int nmap = 1;
if (xchk_should_terminate(sc, &error))
return error;
@@ -208,8 +253,7 @@ xchk_rtsum_compare(
return 0;
/* Make sure we have a written extent. */
- nmap = 1;
- error = xfs_bmapi_read(mp->m_rsumip, off, 1, &map, &nmap,
+ error = xfs_bmapi_read(ip, off, endoff - off, &map, &nmap,
XFS_DATA_FORK);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
return error;
@@ -219,24 +263,33 @@ xchk_rtsum_compare(
return 0;
}
+ off += map.br_blockcount;
+ }
+
+ for (off = 0; off < endoff; off++) {
+ union xfs_suminfo_raw *ondisk_info;
+
/* Read a block's worth of ondisk rtsummary file. */
- error = xfs_rtsummary_read_buf(&args, off);
+ error = xfs_rtsummary_read_buf(&rts->args, off);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, off, &error))
return error;
/* Read a block's worth of computed rtsummary file. */
- error = xfsum_copyout(sc, sumoff, sc->buf, mp->m_blockwsize);
+ error = xfsum_copyout(sc, sumoff, rts->words, mp->m_blockwsize);
if (error) {
- xfs_rtbuf_cache_relse(&args);
+ xfs_rtbuf_cache_relse(&rts->args);
return error;
}
- ondisk_info = xfs_rsumblock_infoptr(&args, 0);
- if (memcmp(ondisk_info, sc->buf,
- mp->m_blockwsize << XFS_WORDLOG) != 0)
+ ondisk_info = xfs_rsumblock_infoptr(&rts->args, 0);
+ if (memcmp(ondisk_info, rts->words,
+ mp->m_blockwsize << XFS_WORDLOG) != 0) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, off);
+ xfs_rtbuf_cache_relse(&rts->args);
+ return error;
+ }
- xfs_rtbuf_cache_relse(&args);
+ xfs_rtbuf_cache_relse(&rts->args);
sumoff += mp->m_blockwsize;
}
@@ -249,8 +302,43 @@ xchk_rtsummary(
struct xfs_scrub *sc)
{
struct xfs_mount *mp = sc->mp;
+ struct xchk_rtsummary *rts = sc->buf;
int error = 0;
+ /* Is sb_rextents correct? */
+ if (mp->m_sb.sb_rextents != rts->rextents) {
+ xchk_ino_set_corrupt(sc, mp->m_rbmip->i_ino);
+ goto out_rbm;
+ }
+
+ /* Is m_rsumlevels correct? */
+ if (mp->m_rsumlevels != rts->rsumlevels) {
+ xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
+ goto out_rbm;
+ }
+
+ /* Is m_rsumsize correct? */
+ if (mp->m_rsumsize != rts->rsumsize) {
+ xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
+ goto out_rbm;
+ }
+
+ /* The summary file length must be aligned to an fsblock. */
+ if (mp->m_rsumip->i_disk_size & mp->m_blockmask) {
+ xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
+ goto out_rbm;
+ }
+
+ /*
+ * Is the summary file itself large enough to handle the rt volume?
+ * growfsrt expands the summary file before updating sb_rextents, so
+ * the file can be larger than rsumsize.
+ */
+ if (mp->m_rsumip->i_disk_size < rts->rsumsize) {
+ xchk_ino_set_corrupt(sc, mp->m_rsumip->i_ino);
+ goto out_rbm;
+ }
+
/* Invoke the fork scrubber. */
error = xchk_metadata_inode_forks(sc);
if (error || (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 4849efcaa33a..caf324c2b991 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -14,8 +14,6 @@
#include "xfs_inode.h"
#include "xfs_quota.h"
#include "xfs_qm.h"
-#include "xfs_errortag.h"
-#include "xfs_error.h"
#include "xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -238,27 +236,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
[XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */
.type = ST_PERAG,
.setup = xchk_setup_ag_allocbt,
- .scrub = xchk_bnobt,
- .repair = xrep_notsupported,
+ .scrub = xchk_allocbt,
+ .repair = xrep_allocbt,
+ .repair_eval = xrep_revalidate_allocbt,
},
[XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */
.type = ST_PERAG,
.setup = xchk_setup_ag_allocbt,
- .scrub = xchk_cntbt,
- .repair = xrep_notsupported,
+ .scrub = xchk_allocbt,
+ .repair = xrep_allocbt,
+ .repair_eval = xrep_revalidate_allocbt,
},
[XFS_SCRUB_TYPE_INOBT] = { /* inobt */
.type = ST_PERAG,
.setup = xchk_setup_ag_iallocbt,
- .scrub = xchk_inobt,
- .repair = xrep_notsupported,
+ .scrub = xchk_iallocbt,
+ .repair = xrep_iallocbt,
+ .repair_eval = xrep_revalidate_iallocbt,
},
[XFS_SCRUB_TYPE_FINOBT] = { /* finobt */
.type = ST_PERAG,
.setup = xchk_setup_ag_iallocbt,
- .scrub = xchk_finobt,
+ .scrub = xchk_iallocbt,
.has = xfs_has_finobt,
- .repair = xrep_notsupported,
+ .repair = xrep_iallocbt,
+ .repair_eval = xrep_revalidate_iallocbt,
},
[XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */
.type = ST_PERAG,
@@ -272,31 +274,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.setup = xchk_setup_ag_refcountbt,
.scrub = xchk_refcountbt,
.has = xfs_has_reflink,
- .repair = xrep_notsupported,
+ .repair = xrep_refcountbt,
},
[XFS_SCRUB_TYPE_INODE] = { /* inode record */
.type = ST_INODE,
.setup = xchk_setup_inode,
.scrub = xchk_inode,
- .repair = xrep_notsupported,
+ .repair = xrep_inode,
},
[XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */
.type = ST_INODE,
.setup = xchk_setup_inode_bmap,
.scrub = xchk_bmap_data,
- .repair = xrep_notsupported,
+ .repair = xrep_bmap_data,
},
[XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */
.type = ST_INODE,
.setup = xchk_setup_inode_bmap,
.scrub = xchk_bmap_attr,
- .repair = xrep_notsupported,
+ .repair = xrep_bmap_attr,
},
[XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */
.type = ST_INODE,
.setup = xchk_setup_inode_bmap,
.scrub = xchk_bmap_cow,
- .repair = xrep_notsupported,
+ .repair = xrep_bmap_cow,
},
[XFS_SCRUB_TYPE_DIR] = { /* directory */
.type = ST_INODE,
@@ -326,33 +328,31 @@ static const struct xchk_meta_ops meta_scrub_ops[] = {
.type = ST_FS,
.setup = xchk_setup_rtbitmap,
.scrub = xchk_rtbitmap,
- .has = xfs_has_realtime,
- .repair = xrep_notsupported,
+ .repair = xrep_rtbitmap,
},
[XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */
.type = ST_FS,
.setup = xchk_setup_rtsummary,
.scrub = xchk_rtsummary,
- .has = xfs_has_realtime,
.repair = xrep_notsupported,
},
[XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */
.type = ST_FS,
.setup = xchk_setup_quota,
.scrub = xchk_quota,
- .repair = xrep_notsupported,
+ .repair = xrep_quota,
},
[XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */
.type = ST_FS,
.setup = xchk_setup_quota,
.scrub = xchk_quota,
- .repair = xrep_notsupported,
+ .repair = xrep_quota,
},
[XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */
.type = ST_FS,
.setup = xchk_setup_quota,
.scrub = xchk_quota,
- .repair = xrep_notsupported,
+ .repair = xrep_quota,
},
[XFS_SCRUB_TYPE_FSCOUNTERS] = { /* fs summary counters */
.type = ST_FS,
@@ -531,7 +531,10 @@ retry_op:
/* Scrub for errors. */
check_start = xchk_stats_now();
- error = sc->ops->scrub(sc);
+ if ((sc->flags & XREP_ALREADY_FIXED) && sc->ops->repair_eval != NULL)
+ error = sc->ops->repair_eval(sc);
+ else
+ error = sc->ops->scrub(sc);
run.scrub_ns += xchk_stats_elapsed_ns(check_start);
if (error == -EDEADLOCK && !(sc->flags & XCHK_TRY_HARDER))
goto try_harder;
@@ -542,23 +545,12 @@ retry_op:
xchk_update_health(sc);
- if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) &&
- !(sc->flags & XREP_ALREADY_FIXED)) {
- bool needs_fix = xchk_needs_repair(sc->sm);
-
- /* Userspace asked us to rebuild the structure regardless. */
- if (sc->sm->sm_flags & XFS_SCRUB_IFLAG_FORCE_REBUILD)
- needs_fix = true;
-
- /* Let debug users force us into the repair routines. */
- if (XFS_TEST_ERROR(needs_fix, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR))
- needs_fix = true;
-
+ if (xchk_could_repair(sc)) {
/*
* If userspace asked for a repair but it wasn't necessary,
* report that back to userspace.
*/
- if (!needs_fix) {
+ if (!xrep_will_attempt(sc)) {
sc->sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED;
goto out_nofix;
}
diff --git a/fs/xfs/scrub/scrub.h b/fs/xfs/scrub/scrub.h
index 1ef9c6b4842a..7fc50654c4fe 100644
--- a/fs/xfs/scrub/scrub.h
+++ b/fs/xfs/scrub/scrub.h
@@ -35,6 +35,14 @@ struct xchk_meta_ops {
/* Repair or optimize the metadata. */
int (*repair)(struct xfs_scrub *);
+ /*
+ * Re-scrub the metadata we repaired, in case there's extra work that
+ * we need to do to check our repair work. If this is NULL, we'll use
+ * the ->scrub function pointer, assuming that the regular scrub is
+ * sufficient.
+ */
+ int (*repair_eval)(struct xfs_scrub *sc);
+
/* Decide if we even have this piece of metadata. */
bool (*has)(struct xfs_mount *);
@@ -113,6 +121,7 @@ struct xfs_scrub {
#define XCHK_HAVE_FREEZE_PROT (1U << 1) /* do we have freeze protection? */
#define XCHK_FSGATES_DRAIN (1U << 2) /* defer ops draining enabled */
#define XCHK_NEED_DRAIN (1U << 3) /* scrub needs to drain defer ops */
+#define XREP_RESET_PERAG_RESV (1U << 30) /* must reset AG space reservation */
#define XREP_ALREADY_FIXED (1U << 31) /* checking our repair work */
/*
@@ -129,10 +138,8 @@ int xchk_superblock(struct xfs_scrub *sc);
int xchk_agf(struct xfs_scrub *sc);
int xchk_agfl(struct xfs_scrub *sc);
int xchk_agi(struct xfs_scrub *sc);
-int xchk_bnobt(struct xfs_scrub *sc);
-int xchk_cntbt(struct xfs_scrub *sc);
-int xchk_inobt(struct xfs_scrub *sc);
-int xchk_finobt(struct xfs_scrub *sc);
+int xchk_allocbt(struct xfs_scrub *sc);
+int xchk_iallocbt(struct xfs_scrub *sc);
int xchk_rmapbt(struct xfs_scrub *sc);
int xchk_refcountbt(struct xfs_scrub *sc);
int xchk_inode(struct xfs_scrub *sc);
diff --git a/fs/xfs/scrub/symlink.c b/fs/xfs/scrub/symlink.c
index 38708fb9a5d7..ddff86713df3 100644
--- a/fs/xfs/scrub/symlink.c
+++ b/fs/xfs/scrub/symlink.c
@@ -12,8 +12,10 @@
#include "xfs_log_format.h"
#include "xfs_inode.h"
#include "xfs_symlink.h"
+#include "xfs_health.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
+#include "scrub/health.h"
/* Set us up to scrub a symbolic link. */
int
@@ -41,29 +43,37 @@ xchk_symlink(
if (!S_ISLNK(VFS_I(ip)->i_mode))
return -ENOENT;
+
+ if (xchk_file_looks_zapped(sc, XFS_SICK_INO_SYMLINK_ZAPPED)) {
+ xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
+ return 0;
+ }
+
ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
len = ip->i_disk_size;
/* Plausible size? */
if (len > XFS_SYMLINK_MAXLEN || len <= 0) {
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
- goto out;
+ return 0;
}
/* Inline symlink? */
if (ifp->if_format == XFS_DINODE_FMT_LOCAL) {
if (len > xfs_inode_data_fork_size(ip) ||
- len > strnlen(ifp->if_u1.if_data, xfs_inode_data_fork_size(ip)))
+ len > strnlen(ifp->if_data, xfs_inode_data_fork_size(ip)))
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
- goto out;
+ return 0;
}
/* Remote symlink; must read the contents. */
error = xfs_readlink_bmap_ilocked(sc->ip, sc->buf);
if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error))
- goto out;
+ return error;
if (strnlen(sc->buf, XFS_SYMLINK_MAXLEN) < len)
xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
-out:
- return error;
+
+ /* If a remote symlink is clean, it is clearly not zapped. */
+ xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_SYMLINK_ZAPPED);
+ return 0;
}
diff --git a/fs/xfs/scrub/trace.c b/fs/xfs/scrub/trace.c
index 29afa4851235..d0e24ffaf754 100644
--- a/fs/xfs/scrub/trace.c
+++ b/fs/xfs/scrub/trace.c
@@ -14,9 +14,12 @@
#include "xfs_btree.h"
#include "xfs_ag.h"
#include "xfs_rtbitmap.h"
+#include "xfs_quota.h"
+#include "xfs_quota_defs.h"
#include "scrub/scrub.h"
#include "scrub/xfile.h"
#include "scrub/xfarray.h"
+#include "scrub/quota.h"
/* Figure out which block the btree cursor was pointing to. */
static inline xfs_fsblock_t
diff --git a/fs/xfs/scrub/trace.h b/fs/xfs/scrub/trace.h
index 4a8bc6f3c8f2..6bbb4e8639dc 100644
--- a/fs/xfs/scrub/trace.h
+++ b/fs/xfs/scrub/trace.h
@@ -19,6 +19,7 @@
struct xfile;
struct xfarray;
struct xfarray_sortinfo;
+struct xchk_dqiter;
/*
* ftrace's __print_symbolic requires that all enum values be wrapped in the
@@ -106,6 +107,7 @@ TRACE_DEFINE_ENUM(XFS_SCRUB_TYPE_FSCOUNTERS);
{ XCHK_HAVE_FREEZE_PROT, "nofreeze" }, \
{ XCHK_FSGATES_DRAIN, "fsgates_drain" }, \
{ XCHK_NEED_DRAIN, "need_drain" }, \
+ { XREP_RESET_PERAG_RESV, "reset_perag_resv" }, \
{ XREP_ALREADY_FIXED, "already_fixed" }
DECLARE_EVENT_CLASS(xchk_class,
@@ -347,6 +349,54 @@ DEFINE_EVENT(xchk_fblock_error_class, name, \
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_error);
DEFINE_SCRUB_FBLOCK_ERROR_EVENT(xchk_fblock_warning);
+#ifdef CONFIG_XFS_QUOTA
+DECLARE_EVENT_CLASS(xchk_dqiter_class,
+ TP_PROTO(struct xchk_dqiter *cursor, uint64_t id),
+ TP_ARGS(cursor, id),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_dqtype_t, dqtype)
+ __field(xfs_ino_t, ino)
+ __field(unsigned long long, cur_id)
+ __field(unsigned long long, id)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_filblks_t, blockcount)
+ __field(xfs_exntst_t, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = cursor->sc->ip->i_mount->m_super->s_dev;
+ __entry->dqtype = cursor->dqtype;
+ __entry->ino = cursor->quota_ip->i_ino;
+ __entry->cur_id = cursor->id;
+ __entry->startoff = cursor->bmap.br_startoff;
+ __entry->startblock = cursor->bmap.br_startblock;
+ __entry->blockcount = cursor->bmap.br_blockcount;
+ __entry->state = cursor->bmap.br_state;
+ __entry->id = id;
+ ),
+ TP_printk("dev %d:%d dquot type %s ino 0x%llx cursor_id 0x%llx startoff 0x%llx startblock 0x%llx blockcount 0x%llx state %u id 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_symbolic(__entry->dqtype, XFS_DQTYPE_STRINGS),
+ __entry->ino,
+ __entry->cur_id,
+ __entry->startoff,
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->state,
+ __entry->id)
+);
+
+#define DEFINE_SCRUB_DQITER_EVENT(name) \
+DEFINE_EVENT(xchk_dqiter_class, name, \
+ TP_PROTO(struct xchk_dqiter *cursor, uint64_t id), \
+ TP_ARGS(cursor, id))
+DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_revalidate_bmap);
+DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_bmap);
+DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter_advance_incore);
+DEFINE_SCRUB_DQITER_EVENT(xchk_dquot_iter);
+#endif /* CONFIG_XFS_QUOTA */
+
TRACE_EVENT(xchk_incomplete,
TP_PROTO(struct xfs_scrub *sc, void *ret_ip),
TP_ARGS(sc, ret_ip),
@@ -1172,37 +1222,125 @@ DEFINE_EVENT(xrep_rmap_class, name, \
xfs_agblock_t agbno, xfs_extlen_t len, \
uint64_t owner, uint64_t offset, unsigned int flags), \
TP_ARGS(mp, agno, agbno, len, owner, offset, flags))
-DEFINE_REPAIR_RMAP_EVENT(xrep_alloc_extent_fn);
-DEFINE_REPAIR_RMAP_EVENT(xrep_ialloc_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xrep_ibt_walk_rmap);
DEFINE_REPAIR_RMAP_EVENT(xrep_rmap_extent_fn);
-DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_extent_fn);
+DEFINE_REPAIR_RMAP_EVENT(xrep_bmap_walk_rmap);
-TRACE_EVENT(xrep_refcount_extent_fn,
+TRACE_EVENT(xrep_abt_found,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- struct xfs_refcount_irec *irec),
- TP_ARGS(mp, agno, irec),
+ const struct xfs_alloc_rec_incore *rec),
+ TP_ARGS(mp, agno, rec),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
__field(xfs_agblock_t, startblock)
__field(xfs_extlen_t, blockcount)
- __field(xfs_nlink_t, refcount)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
- __entry->startblock = irec->rc_startblock;
- __entry->blockcount = irec->rc_blockcount;
- __entry->refcount = irec->rc_refcount;
+ __entry->startblock = rec->ar_startblock;
+ __entry->blockcount = rec->ar_blockcount;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startblock,
+ __entry->blockcount)
+)
+
+TRACE_EVENT(xrep_ibt_found,
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+ const struct xfs_inobt_rec_incore *rec),
+ TP_ARGS(mp, agno, rec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agino_t, startino)
+ __field(uint16_t, holemask)
+ __field(uint8_t, count)
+ __field(uint8_t, freecount)
+ __field(uint64_t, freemask)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->agno = agno;
+ __entry->startino = rec->ir_startino;
+ __entry->holemask = rec->ir_holemask;
+ __entry->count = rec->ir_count;
+ __entry->freecount = rec->ir_freecount;
+ __entry->freemask = rec->ir_free;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agino 0x%x holemask 0x%x count 0x%x freecount 0x%x freemask 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->startino,
+ __entry->holemask,
+ __entry->count,
+ __entry->freecount,
+ __entry->freemask)
+)
+
+TRACE_EVENT(xrep_refc_found,
+ TP_PROTO(struct xfs_perag *pag, const struct xfs_refcount_irec *rec),
+ TP_ARGS(pag, rec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(enum xfs_refc_domain, domain)
+ __field(xfs_agblock_t, startblock)
+ __field(xfs_extlen_t, blockcount)
+ __field(xfs_nlink_t, refcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->domain = rec->rc_domain;
+ __entry->startblock = rec->rc_startblock;
+ __entry->blockcount = rec->rc_blockcount;
+ __entry->refcount = rec->rc_refcount;
),
- TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x refcount %u",
+ TP_printk("dev %d:%d agno 0x%x dom %s agbno 0x%x fsbcount 0x%x refcount %u",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
+ __print_symbolic(__entry->domain, XFS_REFC_DOMAIN_STRINGS),
__entry->startblock,
__entry->blockcount,
__entry->refcount)
)
+TRACE_EVENT(xrep_bmap_found,
+ TP_PROTO(struct xfs_inode *ip, int whichfork,
+ struct xfs_bmbt_irec *irec),
+ TP_ARGS(ip, whichfork, irec),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(int, whichfork)
+ __field(xfs_fileoff_t, lblk)
+ __field(xfs_filblks_t, len)
+ __field(xfs_fsblock_t, pblk)
+ __field(int, state)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(ip)->i_sb->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->whichfork = whichfork;
+ __entry->lblk = irec->br_startoff;
+ __entry->len = irec->br_blockcount;
+ __entry->pblk = irec->br_startblock;
+ __entry->state = irec->br_state;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx whichfork %s fileoff 0x%llx fsbcount 0x%llx startblock 0x%llx state %d",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __print_symbolic(__entry->whichfork, XFS_WHICHFORK_STRINGS),
+ __entry->lblk,
+ __entry->len,
+ __entry->pblk,
+ __entry->state)
+);
+
TRACE_EVENT(xrep_findroot_block,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
uint32_t magic, uint16_t level),
@@ -1299,39 +1437,327 @@ TRACE_EVENT(xrep_reset_counters,
MAJOR(__entry->dev), MINOR(__entry->dev))
)
-TRACE_EVENT(xrep_ialloc_insert,
+DECLARE_EVENT_CLASS(xrep_newbt_extent_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
- xfs_agino_t startino, uint16_t holemask, uint8_t count,
- uint8_t freecount, uint64_t freemask),
- TP_ARGS(mp, agno, startino, holemask, count, freecount, freemask),
+ xfs_agblock_t agbno, xfs_extlen_t len,
+ int64_t owner),
+ TP_ARGS(mp, agno, agbno, len, owner),
TP_STRUCT__entry(
__field(dev_t, dev)
__field(xfs_agnumber_t, agno)
- __field(xfs_agino_t, startino)
- __field(uint16_t, holemask)
- __field(uint8_t, count)
- __field(uint8_t, freecount)
- __field(uint64_t, freemask)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, len)
+ __field(int64_t, owner)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
__entry->agno = agno;
- __entry->startino = startino;
- __entry->holemask = holemask;
- __entry->count = count;
- __entry->freecount = freecount;
- __entry->freemask = freemask;
+ __entry->agbno = agbno;
+ __entry->len = len;
+ __entry->owner = owner;
),
- TP_printk("dev %d:%d agno 0x%x startino 0x%x holemask 0x%x count %u freecount %u freemask 0x%llx",
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x owner 0x%llx",
MAJOR(__entry->dev), MINOR(__entry->dev),
__entry->agno,
- __entry->startino,
- __entry->holemask,
- __entry->count,
- __entry->freecount,
- __entry->freemask)
+ __entry->agbno,
+ __entry->len,
+ __entry->owner)
+);
+#define DEFINE_NEWBT_EXTENT_EVENT(name) \
+DEFINE_EVENT(xrep_newbt_extent_class, name, \
+ TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+ xfs_agblock_t agbno, xfs_extlen_t len, \
+ int64_t owner), \
+ TP_ARGS(mp, agno, agbno, len, owner))
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_ag_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_alloc_file_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_free_blocks);
+DEFINE_NEWBT_EXTENT_EVENT(xrep_newbt_claim_block);
+
+DECLARE_EVENT_CLASS(xrep_dinode_class,
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip),
+ TP_ARGS(sc, dip),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(uint16_t, mode)
+ __field(uint8_t, version)
+ __field(uint8_t, format)
+ __field(uint32_t, uid)
+ __field(uint32_t, gid)
+ __field(uint64_t, size)
+ __field(uint64_t, nblocks)
+ __field(uint32_t, extsize)
+ __field(uint32_t, nextents)
+ __field(uint16_t, anextents)
+ __field(uint8_t, forkoff)
+ __field(uint8_t, aformat)
+ __field(uint16_t, flags)
+ __field(uint32_t, gen)
+ __field(uint64_t, flags2)
+ __field(uint32_t, cowextsize)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->mode = be16_to_cpu(dip->di_mode);
+ __entry->version = dip->di_version;
+ __entry->format = dip->di_format;
+ __entry->uid = be32_to_cpu(dip->di_uid);
+ __entry->gid = be32_to_cpu(dip->di_gid);
+ __entry->size = be64_to_cpu(dip->di_size);
+ __entry->nblocks = be64_to_cpu(dip->di_nblocks);
+ __entry->extsize = be32_to_cpu(dip->di_extsize);
+ __entry->nextents = be32_to_cpu(dip->di_nextents);
+ __entry->anextents = be16_to_cpu(dip->di_anextents);
+ __entry->forkoff = dip->di_forkoff;
+ __entry->aformat = dip->di_aformat;
+ __entry->flags = be16_to_cpu(dip->di_flags);
+ __entry->gen = be32_to_cpu(dip->di_gen);
+ __entry->flags2 = be64_to_cpu(dip->di_flags2);
+ __entry->cowextsize = be32_to_cpu(dip->di_cowextsize);
+ ),
+ TP_printk("dev %d:%d ino 0x%llx mode 0x%x version %u format %u uid %u gid %u disize 0x%llx nblocks 0x%llx extsize %u nextents %u anextents %u forkoff 0x%x aformat %u flags 0x%x gen 0x%x flags2 0x%llx cowextsize %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->mode,
+ __entry->version,
+ __entry->format,
+ __entry->uid,
+ __entry->gid,
+ __entry->size,
+ __entry->nblocks,
+ __entry->extsize,
+ __entry->nextents,
+ __entry->anextents,
+ __entry->forkoff,
+ __entry->aformat,
+ __entry->flags,
+ __entry->gen,
+ __entry->flags2,
+ __entry->cowextsize)
)
+#define DEFINE_REPAIR_DINODE_EVENT(name) \
+DEFINE_EVENT(xrep_dinode_class, name, \
+ TP_PROTO(struct xfs_scrub *sc, struct xfs_dinode *dip), \
+ TP_ARGS(sc, dip))
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_header);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_mode);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_flags);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_size);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_extsize_hints);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_symlink);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dir);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_fixed);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_forks);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_dfork);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_zap_afork);
+DEFINE_REPAIR_DINODE_EVENT(xrep_dinode_ensure_forkoff);
+
+DECLARE_EVENT_CLASS(xrep_inode_class,
+ TP_PROTO(struct xfs_scrub *sc),
+ TP_ARGS(sc),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsize_t, size)
+ __field(xfs_rfsblock_t, nblocks)
+ __field(uint16_t, flags)
+ __field(uint64_t, flags2)
+ __field(uint32_t, nextents)
+ __field(uint8_t, format)
+ __field(uint32_t, anextents)
+ __field(uint8_t, aformat)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->size = sc->ip->i_disk_size;
+ __entry->nblocks = sc->ip->i_nblocks;
+ __entry->flags = sc->ip->i_diflags;
+ __entry->flags2 = sc->ip->i_diflags2;
+ __entry->nextents = sc->ip->i_df.if_nextents;
+ __entry->format = sc->ip->i_df.if_format;
+ __entry->anextents = sc->ip->i_af.if_nextents;
+ __entry->aformat = sc->ip->i_af.if_format;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx disize 0x%llx nblocks 0x%llx flags 0x%x flags2 0x%llx nextents %u format %u anextents %u aformat %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->size,
+ __entry->nblocks,
+ __entry->flags,
+ __entry->flags2,
+ __entry->nextents,
+ __entry->format,
+ __entry->anextents,
+ __entry->aformat)
+)
+
+#define DEFINE_REPAIR_INODE_EVENT(name) \
+DEFINE_EVENT(xrep_inode_class, name, \
+ TP_PROTO(struct xfs_scrub *sc), \
+ TP_ARGS(sc))
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockcounts);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_ids);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_flags);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_blockdir_size);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_sfdir_size);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_dir_size);
+DEFINE_REPAIR_INODE_EVENT(xrep_inode_fixed);
+
+TRACE_EVENT(xrep_dinode_count_rmaps,
+ TP_PROTO(struct xfs_scrub *sc, xfs_rfsblock_t data_blocks,
+ xfs_rfsblock_t rt_blocks, xfs_rfsblock_t attr_blocks,
+ xfs_extnum_t data_extents, xfs_extnum_t rt_extents,
+ xfs_aextnum_t attr_extents),
+ TP_ARGS(sc, data_blocks, rt_blocks, attr_blocks, data_extents,
+ rt_extents, attr_extents),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_rfsblock_t, data_blocks)
+ __field(xfs_rfsblock_t, rt_blocks)
+ __field(xfs_rfsblock_t, attr_blocks)
+ __field(xfs_extnum_t, data_extents)
+ __field(xfs_extnum_t, rt_extents)
+ __field(xfs_aextnum_t, attr_extents)
+ ),
+ TP_fast_assign(
+ __entry->dev = sc->mp->m_super->s_dev;
+ __entry->ino = sc->sm->sm_ino;
+ __entry->data_blocks = data_blocks;
+ __entry->rt_blocks = rt_blocks;
+ __entry->attr_blocks = attr_blocks;
+ __entry->data_extents = data_extents;
+ __entry->rt_extents = rt_extents;
+ __entry->attr_extents = attr_extents;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx dblocks 0x%llx rtblocks 0x%llx ablocks 0x%llx dextents %llu rtextents %llu aextents %u",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->data_blocks,
+ __entry->rt_blocks,
+ __entry->attr_blocks,
+ __entry->data_extents,
+ __entry->rt_extents,
+ __entry->attr_extents)
+);
+
+TRACE_EVENT(xrep_cow_mark_file_range,
+ TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t startblock,
+ xfs_fileoff_t startoff, xfs_filblks_t blockcount),
+ TP_ARGS(ip, startblock, startoff, blockcount),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_filblks_t, blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->startoff = startoff;
+ __entry->startblock = startblock;
+ __entry->blockcount = blockcount;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx fileoff 0x%llx startblock 0x%llx fsbcount 0x%llx",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->startoff,
+ __entry->startblock,
+ __entry->blockcount)
+);
+
+TRACE_EVENT(xrep_cow_replace_mapping,
+ TP_PROTO(struct xfs_inode *ip, const struct xfs_bmbt_irec *irec,
+ xfs_fsblock_t new_startblock, xfs_extlen_t new_blockcount),
+ TP_ARGS(ip, irec, new_startblock, new_blockcount),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __field(xfs_fsblock_t, startblock)
+ __field(xfs_fileoff_t, startoff)
+ __field(xfs_filblks_t, blockcount)
+ __field(xfs_exntst_t, state)
+ __field(xfs_fsblock_t, new_startblock)
+ __field(xfs_extlen_t, new_blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = ip->i_mount->m_super->s_dev;
+ __entry->ino = ip->i_ino;
+ __entry->startoff = irec->br_startoff;
+ __entry->startblock = irec->br_startblock;
+ __entry->blockcount = irec->br_blockcount;
+ __entry->state = irec->br_state;
+ __entry->new_startblock = new_startblock;
+ __entry->new_blockcount = new_blockcount;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx startoff 0x%llx startblock 0x%llx fsbcount 0x%llx state 0x%x new_startblock 0x%llx new_fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->startoff,
+ __entry->startblock,
+ __entry->blockcount,
+ __entry->state,
+ __entry->new_startblock,
+ __entry->new_blockcount)
+);
+
+TRACE_EVENT(xrep_cow_free_staging,
+ TP_PROTO(struct xfs_perag *pag, xfs_agblock_t agbno,
+ xfs_extlen_t blockcount),
+ TP_ARGS(pag, agbno, blockcount),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_agnumber_t, agno)
+ __field(xfs_agblock_t, agbno)
+ __field(xfs_extlen_t, blockcount)
+ ),
+ TP_fast_assign(
+ __entry->dev = pag->pag_mount->m_super->s_dev;
+ __entry->agno = pag->pag_agno;
+ __entry->agbno = agbno;
+ __entry->blockcount = blockcount;
+ ),
+ TP_printk("dev %d:%d agno 0x%x agbno 0x%x fsbcount 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->agno,
+ __entry->agbno,
+ __entry->blockcount)
+);
+
+#ifdef CONFIG_XFS_QUOTA
+DECLARE_EVENT_CLASS(xrep_dquot_class,
+ TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id),
+ TP_ARGS(mp, type, id),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(uint8_t, type)
+ __field(uint32_t, id)
+ ),
+ TP_fast_assign(
+ __entry->dev = mp->m_super->s_dev;
+ __entry->id = id;
+ __entry->type = type;
+ ),
+ TP_printk("dev %d:%d type %s id 0x%x",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __print_flags(__entry->type, "|", XFS_DQTYPE_STRINGS),
+ __entry->id)
+);
+
+#define DEFINE_XREP_DQUOT_EVENT(name) \
+DEFINE_EVENT(xrep_dquot_class, name, \
+ TP_PROTO(struct xfs_mount *mp, uint8_t type, uint32_t id), \
+ TP_ARGS(mp, type, id))
+DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item);
+DEFINE_XREP_DQUOT_EVENT(xrep_disk_dquot);
+DEFINE_XREP_DQUOT_EVENT(xrep_dquot_item_fill_bmap_hole);
+#endif /* CONFIG_XFS_QUOTA */
+
#endif /* IS_ENABLED(CONFIG_XFS_ONLINE_REPAIR) */
#endif /* _TRACE_XFS_SCRUB_TRACE_H */
diff --git a/fs/xfs/scrub/xfarray.h b/fs/xfs/scrub/xfarray.h
index 4ecac01363d9..62b9c506fdd1 100644
--- a/fs/xfs/scrub/xfarray.h
+++ b/fs/xfs/scrub/xfarray.h
@@ -54,6 +54,28 @@ static inline int xfarray_append(struct xfarray *array, const void *ptr)
uint64_t xfarray_length(struct xfarray *array);
int xfarray_load_next(struct xfarray *array, xfarray_idx_t *idx, void *rec);
+/*
+ * Iterate the non-null elements in a sparse xfarray. Callers should
+ * initialize *idx to XFARRAY_CURSOR_INIT before the first call; on return, it
+ * will be set to one more than the index of the record that was retrieved.
+ * Returns 1 if a record was retrieved, 0 if there weren't any more records, or
+ * a negative errno.
+ */
+static inline int
+xfarray_iter(
+ struct xfarray *array,
+ xfarray_idx_t *idx,
+ void *rec)
+{
+ int ret = xfarray_load_next(array, idx, rec);
+
+ if (ret == -ENODATA)
+ return 0;
+ if (ret == 0)
+ return 1;
+ return ret;
+}
+
/* Declarations for xfile array sort functionality. */
typedef cmp_func_t xfarray_cmp_fn;
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 465d7630bb21..813f85156b0c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -584,7 +584,7 @@ const struct address_space_operations xfs_address_space_operations = {
.bmap = xfs_vm_bmap,
.migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = xfs_iomap_swapfile_activate,
};
diff --git a/fs/xfs/xfs_attr_item.c b/fs/xfs/xfs_attr_item.c
index 36fe2abb16e6..9e02111bd890 100644
--- a/fs/xfs/xfs_attr_item.c
+++ b/fs/xfs/xfs_attr_item.c
@@ -33,8 +33,6 @@ struct kmem_cache *xfs_attrd_cache;
static const struct xfs_item_ops xfs_attri_item_ops;
static const struct xfs_item_ops xfs_attrd_item_ops;
-static struct xfs_attrd_log_item *xfs_trans_get_attrd(struct xfs_trans *tp,
- struct xfs_attri_log_item *attrip);
static inline struct xfs_attri_log_item *ATTRI_ITEM(struct xfs_log_item *lip)
{
@@ -310,47 +308,6 @@ xfs_attrd_item_intent(
return &ATTRD_ITEM(lip)->attrd_attrip->attri_item;
}
-/*
- * Performs one step of an attribute update intent and marks the attrd item
- * dirty.. An attr operation may be a set or a remove. Note that the
- * transaction is marked dirty regardless of whether the operation succeeds or
- * fails to support the ATTRI/ATTRD lifecycle rules.
- */
-STATIC int
-xfs_xattri_finish_update(
- struct xfs_attr_intent *attr,
- struct xfs_attrd_log_item *attrdp)
-{
- struct xfs_da_args *args = attr->xattri_da_args;
- int error;
-
- if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
- error = -EIO;
- goto out;
- }
-
- error = xfs_attr_set_iter(attr);
- if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
- error = -EAGAIN;
-out:
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the ATTRI and frees the ATTRD
- * 2.) shuts down the filesystem
- */
- args->trans->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
-
- /*
- * attr intent/done items are null when logged attributes are disabled
- */
- if (attrdp)
- set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
-
- return error;
-}
-
/* Log an attr to the intent item. */
STATIC void
xfs_attr_log_item(
@@ -360,9 +317,6 @@ xfs_attr_log_item(
{
struct xfs_attri_log_format *attrp;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &attrip->attri_item.li_flags);
-
/*
* At this point the xfs_attr_intent has been constructed, and we've
* created the log intent. Fill in the attri log item and log format
@@ -419,7 +373,6 @@ xfs_attr_create_intent(
}
attrip = xfs_attri_init(mp, attr->xattri_nameval);
- xfs_trans_add_item(tp, &attrip->attri_item);
xfs_attr_log_item(tp, attrip, attr);
return &attrip->attri_item;
@@ -447,23 +400,33 @@ xfs_attr_finish_item(
struct xfs_btree_cur **state)
{
struct xfs_attr_intent *attr;
- struct xfs_attrd_log_item *done_item = NULL;
+ struct xfs_da_args *args;
int error;
attr = container_of(item, struct xfs_attr_intent, xattri_list);
- if (done)
- done_item = ATTRD_ITEM(done);
+ args = attr->xattri_da_args;
- /*
- * Always reset trans after EAGAIN cycle
- * since the transaction is new
- */
- attr->xattri_da_args->trans = tp;
+ /* Reset trans after EAGAIN cycle since the transaction is new */
+ args->trans = tp;
- error = xfs_xattri_finish_update(attr, done_item);
- if (error != -EAGAIN)
- xfs_attr_free_item(attr);
+ if (XFS_TEST_ERROR(false, args->dp->i_mount, XFS_ERRTAG_LARP)) {
+ error = -EIO;
+ goto out;
+ }
+ /* If an attr removal is trivially complete, we're done. */
+ if (attr->xattri_op_flags == XFS_ATTRI_OP_FLAGS_REMOVE &&
+ !xfs_inode_hasattr(args->dp)) {
+ error = 0;
+ goto out;
+ }
+
+ error = xfs_attr_set_iter(attr);
+ if (!error && attr->xattri_dela_state != XFS_DAS_DONE)
+ return -EAGAIN;
+
+out:
+ xfs_attr_free_item(attr);
return error;
}
@@ -532,41 +495,22 @@ xfs_attri_validate(
return xfs_verify_ino(mp, attrp->alfi_ino);
}
-/*
- * Process an attr intent item that was recovered from the log. We need to
- * delete the attr that it describes.
- */
-STATIC int
-xfs_attri_item_recover(
- struct xfs_log_item *lip,
- struct list_head *capture_list)
+static inline struct xfs_attr_intent *
+xfs_attri_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct xfs_attri_log_format *attrp,
+ struct xfs_inode **ipp,
+ struct xfs_attri_log_nameval *nv)
{
- struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
struct xfs_attr_intent *attr;
- struct xfs_mount *mp = lip->li_log->l_mp;
- struct xfs_inode *ip;
struct xfs_da_args *args;
- struct xfs_trans *tp;
- struct xfs_trans_res resv;
- struct xfs_attri_log_format *attrp;
- struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
- int error;
- int total;
int local;
- struct xfs_attrd_log_item *done_item = NULL;
-
- /*
- * First check the validity of the attr described by the ATTRI. If any
- * are bad, then assume that all are bad and just toss the ATTRI.
- */
- attrp = &attrip->attri_format;
- if (!xfs_attri_validate(mp, attrp) ||
- !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
- return -EFSCORRUPTED;
+ int error;
- error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
+ error = xlog_recover_iget(mp, attrp->alfi_ino, ipp);
if (error)
- return error;
+ return ERR_PTR(error);
attr = kmem_zalloc(sizeof(struct xfs_attr_intent) +
sizeof(struct xfs_da_args), KM_NOFS);
@@ -584,7 +528,7 @@ xfs_attri_item_recover(
attr->xattri_nameval = xfs_attri_log_nameval_get(nv);
ASSERT(attr->xattri_nameval);
- args->dp = ip;
+ args->dp = *ipp;
args->geo = mp->m_attr_geo;
args->whichfork = XFS_ATTR_FORK;
args->name = nv->name.i_addr;
@@ -608,43 +552,65 @@ xfs_attri_item_recover(
attr->xattri_dela_state = xfs_attr_init_add_state(args);
break;
case XFS_ATTRI_OP_FLAGS_REMOVE:
- if (!xfs_inode_hasattr(args->dp))
- goto out;
attr->xattri_dela_state = xfs_attr_init_remove_state(args);
break;
- default:
- ASSERT(0);
- error = -EFSCORRUPTED;
- goto out;
}
+ xfs_defer_add_item(dfp, &attr->xattri_list);
+ return attr;
+}
+
+/*
+ * Process an attr intent item that was recovered from the log. We need to
+ * delete the attr that it describes.
+ */
+STATIC int
+xfs_attr_recover_work(
+ struct xfs_defer_pending *dfp,
+ struct list_head *capture_list)
+{
+ struct xfs_log_item *lip = dfp->dfp_intent;
+ struct xfs_attri_log_item *attrip = ATTRI_ITEM(lip);
+ struct xfs_attr_intent *attr;
+ struct xfs_mount *mp = lip->li_log->l_mp;
+ struct xfs_inode *ip;
+ struct xfs_da_args *args;
+ struct xfs_trans *tp;
+ struct xfs_trans_res resv;
+ struct xfs_attri_log_format *attrp;
+ struct xfs_attri_log_nameval *nv = attrip->attri_nameval;
+ int error;
+ int total;
+
+ /*
+ * First check the validity of the attr described by the ATTRI. If any
+ * are bad, then assume that all are bad and just toss the ATTRI.
+ */
+ attrp = &attrip->attri_format;
+ if (!xfs_attri_validate(mp, attrp) ||
+ !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
+ return -EFSCORRUPTED;
+
+ attr = xfs_attri_recover_work(mp, dfp, attrp, &ip, nv);
+ if (IS_ERR(attr))
+ return PTR_ERR(attr);
+ args = attr->xattri_da_args;
+
xfs_init_attr_trans(args, &resv, &total);
resv = xlog_recover_resv(&resv);
error = xfs_trans_alloc(mp, &resv, total, 0, XFS_TRANS_RESERVE, &tp);
if (error)
- goto out;
-
+ return error;
args->trans = tp;
- done_item = xfs_trans_get_attrd(tp, attrip);
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- error = xfs_xattri_finish_update(attr, done_item);
- if (error == -EAGAIN) {
- /*
- * There's more work to do, so add the intent item to this
- * transaction so that we can continue it later.
- */
- xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_ATTR, &attr->xattri_list);
- error = xfs_defer_ops_capture_and_commit(tp, capture_list);
- if (error)
- goto out_unlock;
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_irele(ip);
- return 0;
- }
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &attrip->attri_format,
+ sizeof(attrip->attri_format));
if (error) {
xfs_trans_cancel(tp);
goto out_unlock;
@@ -654,18 +620,16 @@ xfs_attri_item_recover(
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
xfs_irele(ip);
-out:
- xfs_attr_free_item(attr);
return error;
}
/* Re-log an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_attri_item_relog(
+xfs_attr_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_attrd_log_item *attrdp;
struct xfs_attri_log_item *old_attrip;
struct xfs_attri_log_item *new_attrip;
struct xfs_attri_log_format *new_attrp;
@@ -674,10 +638,6 @@ xfs_attri_item_relog(
old_attrip = ATTRI_ITEM(intent);
old_attrp = &old_attrip->attri_format;
- tp->t_flags |= XFS_TRANS_DIRTY;
- attrdp = xfs_trans_get_attrd(tp, old_attrip);
- set_bit(XFS_LI_DIRTY, &attrdp->attrd_item.li_flags);
-
/*
* Create a new log item that shares the same name/value buffer as the
* old log item.
@@ -691,12 +651,43 @@ xfs_attri_item_relog(
new_attrp->alfi_name_len = old_attrp->alfi_name_len;
new_attrp->alfi_attr_filter = old_attrp->alfi_attr_filter;
- xfs_trans_add_item(tp, &new_attrip->attri_item);
- set_bit(XFS_LI_DIRTY, &new_attrip->attri_item.li_flags);
-
return &new_attrip->attri_item;
}
+/* Get an ATTRD so we can process all the attrs. */
+static struct xfs_log_item *
+xfs_attr_create_done(
+ struct xfs_trans *tp,
+ struct xfs_log_item *intent,
+ unsigned int count)
+{
+ struct xfs_attri_log_item *attrip;
+ struct xfs_attrd_log_item *attrdp;
+
+ attrip = ATTRI_ITEM(intent);
+
+ attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
+ &xfs_attrd_item_ops);
+ attrdp->attrd_attrip = attrip;
+ attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id;
+
+ return &attrdp->attrd_item;
+}
+
+const struct xfs_defer_op_type xfs_attr_defer_type = {
+ .name = "attr",
+ .max_items = 1,
+ .create_intent = xfs_attr_create_intent,
+ .abort_intent = xfs_attr_abort_intent,
+ .create_done = xfs_attr_create_done,
+ .finish_item = xfs_attr_finish_item,
+ .cancel_item = xfs_attr_cancel_item,
+ .recover_work = xfs_attr_recover_work,
+ .relog_intent = xfs_attr_relog_intent,
+};
+
STATIC int
xlog_recover_attri_commit_pass2(
struct xlog *log,
@@ -767,63 +758,13 @@ xlog_recover_attri_commit_pass2(
attrip = xfs_attri_init(mp, nv);
memcpy(&attrip->attri_format, attri_formatp, len);
- /*
- * The ATTRI has two references. One for the ATTRD and one for ATTRI to
- * ensure it makes it into the AIL. Insert the ATTRI into the AIL
- * directly and drop the ATTRI reference. Note that
- * xfs_trans_ail_update() drops the AIL lock.
- */
- xfs_trans_ail_insert(log->l_ailp, &attrip->attri_item, lsn);
- xfs_attri_release(attrip);
+ xlog_recover_intent_item(log, &attrip->attri_item, lsn,
+ &xfs_attr_defer_type);
xfs_attri_log_nameval_put(nv);
return 0;
}
/*
- * This routine is called to allocate an "attr free done" log item.
- */
-static struct xfs_attrd_log_item *
-xfs_trans_get_attrd(struct xfs_trans *tp,
- struct xfs_attri_log_item *attrip)
-{
- struct xfs_attrd_log_item *attrdp;
-
- ASSERT(tp != NULL);
-
- attrdp = kmem_cache_zalloc(xfs_attrd_cache, GFP_NOFS | __GFP_NOFAIL);
-
- xfs_log_item_init(tp->t_mountp, &attrdp->attrd_item, XFS_LI_ATTRD,
- &xfs_attrd_item_ops);
- attrdp->attrd_attrip = attrip;
- attrdp->attrd_format.alfd_alf_id = attrip->attri_format.alfi_id;
-
- xfs_trans_add_item(tp, &attrdp->attrd_item);
- return attrdp;
-}
-
-/* Get an ATTRD so we can process all the attrs. */
-static struct xfs_log_item *
-xfs_attr_create_done(
- struct xfs_trans *tp,
- struct xfs_log_item *intent,
- unsigned int count)
-{
- if (!intent)
- return NULL;
-
- return &xfs_trans_get_attrd(tp, ATTRI_ITEM(intent))->attrd_item;
-}
-
-const struct xfs_defer_op_type xfs_attr_defer_type = {
- .max_items = 1,
- .create_intent = xfs_attr_create_intent,
- .abort_intent = xfs_attr_abort_intent,
- .create_done = xfs_attr_create_done,
- .finish_item = xfs_attr_finish_item,
- .cancel_item = xfs_attr_cancel_item,
-};
-
-/*
* This routine is called when an ATTRD format structure is found in a committed
* transaction in the log. Its purpose is to cancel the corresponding ATTRI if
* it was still in the log. To do this it searches the AIL for the ATTRI with
@@ -857,9 +798,7 @@ static const struct xfs_item_ops xfs_attri_item_ops = {
.iop_format = xfs_attri_item_format,
.iop_unpin = xfs_attri_item_unpin,
.iop_release = xfs_attri_item_release,
- .iop_recover = xfs_attri_item_recover,
.iop_match = xfs_attri_item_match,
- .iop_relog = xfs_attri_item_relog,
};
const struct xlog_recover_item_ops xlog_attri_item_ops = {
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 99bbbe1a0e44..e368ad671e26 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -56,14 +56,13 @@ xfs_attr_shortform_list(
struct xfs_attrlist_cursor_kern *cursor = &context->cursor;
struct xfs_inode *dp = context->dp;
struct xfs_attr_sf_sort *sbuf, *sbp;
- struct xfs_attr_shortform *sf;
+ struct xfs_attr_sf_hdr *sf = dp->i_af.if_data;
struct xfs_attr_sf_entry *sfe;
int sbsize, nsbuf, count, i;
int error = 0;
- sf = (struct xfs_attr_shortform *)dp->i_af.if_u1.if_data;
ASSERT(sf != NULL);
- if (!sf->hdr.count)
+ if (!sf->count)
return 0;
trace_xfs_attr_list_sf(context);
@@ -79,8 +78,8 @@ xfs_attr_shortform_list(
*/
if (context->bufsize == 0 ||
(XFS_ISRESET_CURSOR(cursor) &&
- (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) {
- for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ (dp->i_af.if_bytes + sf->count * 16) < context->bufsize)) {
+ for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
if (XFS_IS_CORRUPT(context->dp->i_mount,
!xfs_attr_namecheck(sfe->nameval,
sfe->namelen)))
@@ -109,7 +108,7 @@ xfs_attr_shortform_list(
/*
* It didn't all fit, so we have to sort everything on hashval.
*/
- sbsize = sf->hdr.count * sizeof(*sbuf);
+ sbsize = sf->count * sizeof(*sbuf);
sbp = sbuf = kmem_alloc(sbsize, KM_NOFS);
/*
@@ -117,7 +116,7 @@ xfs_attr_shortform_list(
* the relevant info from only those that match into a buffer.
*/
nsbuf = 0;
- for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
+ for (i = 0, sfe = xfs_attr_sf_firstentry(sf); i < sf->count; i++) {
if (unlikely(
((char *)sfe < (char *)sf) ||
((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) {
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
index e736a0844c89..52fb8a148b7d 100644
--- a/fs/xfs/xfs_bmap_item.c
+++ b/fs/xfs/xfs_bmap_item.c
@@ -221,51 +221,6 @@ static const struct xfs_item_ops xfs_bud_item_ops = {
.iop_intent = xfs_bud_item_intent,
};
-static struct xfs_bud_log_item *
-xfs_trans_get_bud(
- struct xfs_trans *tp,
- struct xfs_bui_log_item *buip)
-{
- struct xfs_bud_log_item *budp;
-
- budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
- &xfs_bud_item_ops);
- budp->bud_buip = buip;
- budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
-
- xfs_trans_add_item(tp, &budp->bud_item);
- return budp;
-}
-
-/*
- * Finish an bmap update and log it to the BUD. Note that the
- * transaction is marked dirty regardless of whether the bmap update
- * succeeds or fails to support the BUI/BUD lifecycle rules.
- */
-static int
-xfs_trans_log_finish_bmap_update(
- struct xfs_trans *tp,
- struct xfs_bud_log_item *budp,
- struct xfs_bmap_intent *bi)
-{
- int error;
-
- error = xfs_bmap_finish_one(tp, bi);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the BUI and frees the BUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
- set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
-
- return error;
-}
-
/* Sort bmap intents by inode. */
static int
xfs_bmap_update_diff_items(
@@ -314,9 +269,6 @@ xfs_bmap_update_log_item(
uint next_extent;
struct xfs_map_extent *map;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
-
/*
* atomic_inc_return gives us the value after the increment;
* we want to use it as an array index so we need to subtract 1 from
@@ -346,7 +298,6 @@ xfs_bmap_update_create_intent(
ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
- xfs_trans_add_item(tp, &buip->bui_item);
if (sort)
list_sort(mp, items, xfs_bmap_update_diff_items);
list_for_each_entry(bi, items, bi_list)
@@ -354,14 +305,23 @@ xfs_bmap_update_create_intent(
return &buip->bui_item;
}
-/* Get an BUD so we can process all the deferred rmap updates. */
+/* Get an BUD so we can process all the deferred bmap updates. */
static struct xfs_log_item *
xfs_bmap_update_create_done(
struct xfs_trans *tp,
struct xfs_log_item *intent,
unsigned int count)
{
- return &xfs_trans_get_bud(tp, BUI_ITEM(intent))->bud_item;
+ struct xfs_bui_log_item *buip = BUI_ITEM(intent);
+ struct xfs_bud_log_item *budp;
+
+ budp = kmem_cache_zalloc(xfs_bud_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &budp->bud_item, XFS_LI_BUD,
+ &xfs_bud_item_ops);
+ budp->bud_buip = buip;
+ budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
+
+ return &budp->bud_item;
}
/* Take a passive ref to the AG containing the space we're mapping. */
@@ -392,7 +352,7 @@ xfs_bmap_update_put_group(
xfs_perag_intent_put(bi->bi_pag);
}
-/* Process a deferred rmap update. */
+/* Process a deferred bmap update. */
STATIC int
xfs_bmap_update_finish_item(
struct xfs_trans *tp,
@@ -405,7 +365,7 @@ xfs_bmap_update_finish_item(
bi = container_of(item, struct xfs_bmap_intent, bi_list);
- error = xfs_trans_log_finish_bmap_update(tp, BUD_ITEM(done), bi);
+ error = xfs_bmap_finish_one(tp, bi);
if (!error && bi->bi_bmap.br_blockcount > 0) {
ASSERT(bi->bi_type == XFS_BMAP_UNMAP);
return -EAGAIN;
@@ -437,15 +397,6 @@ xfs_bmap_update_cancel_item(
kmem_cache_free(xfs_bmap_intent_cache, bi);
}
-const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
- .max_items = XFS_BUI_MAX_FAST_EXTENTS,
- .create_intent = xfs_bmap_update_create_intent,
- .abort_intent = xfs_bmap_update_abort_intent,
- .create_done = xfs_bmap_update_create_done,
- .finish_item = xfs_bmap_update_finish_item,
- .cancel_item = xfs_bmap_update_cancel_item,
-};
-
/* Is this recovered BUI ok? */
static inline bool
xfs_bui_validate(
@@ -480,23 +431,53 @@ xfs_bui_validate(
return xfs_verify_fsbext(mp, map->me_startblock, map->me_len);
}
+static inline struct xfs_bmap_intent *
+xfs_bui_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct xfs_inode **ipp,
+ struct xfs_map_extent *map)
+{
+ struct xfs_bmap_intent *bi;
+ int error;
+
+ error = xlog_recover_iget(mp, map->me_owner, ipp);
+ if (error)
+ return ERR_PTR(error);
+
+ bi = kmem_cache_zalloc(xfs_bmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+ bi->bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ bi->bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
+ bi->bi_bmap.br_startblock = map->me_startblock;
+ bi->bi_bmap.br_startoff = map->me_startoff;
+ bi->bi_bmap.br_blockcount = map->me_len;
+ bi->bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ bi->bi_owner = *ipp;
+ xfs_bmap_update_get_group(mp, bi);
+
+ xfs_defer_add_item(dfp, &bi->bi_list);
+ return bi;
+}
+
/*
* Process a bmap update intent item that was recovered from the log.
* We need to update some inode's bmbt.
*/
STATIC int
-xfs_bui_item_recover(
- struct xfs_log_item *lip,
+xfs_bmap_recover_work(
+ struct xfs_defer_pending *dfp,
struct list_head *capture_list)
{
- struct xfs_bmap_intent fake = { };
struct xfs_trans_res resv;
+ struct xfs_log_item *lip = dfp->dfp_intent;
struct xfs_bui_log_item *buip = BUI_ITEM(lip);
struct xfs_trans *tp;
struct xfs_inode *ip = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
struct xfs_map_extent *map;
- struct xfs_bud_log_item *budp;
+ struct xfs_bmap_intent *work;
int iext_delta;
int error = 0;
@@ -507,13 +488,9 @@ xfs_bui_item_recover(
}
map = &buip->bui_format.bui_extents[0];
- fake.bi_whichfork = (map->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
- fake.bi_type = map->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
-
- error = xlog_recover_iget(mp, map->me_owner, &ip);
- if (error)
- return error;
+ work = xfs_bui_recover_work(mp, dfp, &ip, map);
+ if (IS_ERR(work))
+ return PTR_ERR(work);
/* Allocate transaction and do the work. */
resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
@@ -522,42 +499,27 @@ xfs_bui_item_recover(
if (error)
goto err_rele;
- budp = xfs_trans_get_bud(tp, buip);
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_trans_ijoin(tp, ip, 0);
- if (fake.bi_type == XFS_BMAP_MAP)
+ if (work->bi_type == XFS_BMAP_MAP)
iext_delta = XFS_IEXT_ADD_NOSPLIT_CNT;
else
iext_delta = XFS_IEXT_PUNCH_HOLE_CNT;
- error = xfs_iext_count_may_overflow(ip, fake.bi_whichfork, iext_delta);
+ error = xfs_iext_count_may_overflow(ip, work->bi_whichfork, iext_delta);
if (error == -EFBIG)
error = xfs_iext_count_upgrade(tp, ip, iext_delta);
if (error)
goto err_cancel;
- fake.bi_owner = ip;
- fake.bi_bmap.br_startblock = map->me_startblock;
- fake.bi_bmap.br_startoff = map->me_startoff;
- fake.bi_bmap.br_blockcount = map->me_len;
- fake.bi_bmap.br_state = (map->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
- XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
-
- xfs_bmap_update_get_group(mp, &fake);
- error = xfs_trans_log_finish_bmap_update(tp, budp, &fake);
+ error = xlog_recover_finish_intent(tp, dfp);
if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, map,
- sizeof(*map));
- xfs_bmap_update_put_group(&fake);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &buip->bui_format, sizeof(buip->bui_format));
if (error)
goto err_cancel;
- if (fake.bi_bmap.br_blockcount > 0) {
- ASSERT(fake.bi_type == XFS_BMAP_UNMAP);
- xfs_bmap_unmap_extent(tp, ip, &fake.bi_bmap);
- }
-
/*
* Commit transaction, which frees the transaction and saves the inode
* for later replay activities.
@@ -579,21 +541,13 @@ err_rele:
return error;
}
-STATIC bool
-xfs_bui_item_match(
- struct xfs_log_item *lip,
- uint64_t intent_id)
-{
- return BUI_ITEM(lip)->bui_format.bui_id == intent_id;
-}
-
/* Relog an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_bui_item_relog(
+xfs_bmap_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_bud_log_item *budp;
struct xfs_bui_log_item *buip;
struct xfs_map_extent *map;
unsigned int count;
@@ -601,27 +555,40 @@ xfs_bui_item_relog(
count = BUI_ITEM(intent)->bui_format.bui_nextents;
map = BUI_ITEM(intent)->bui_format.bui_extents;
- tp->t_flags |= XFS_TRANS_DIRTY;
- budp = xfs_trans_get_bud(tp, BUI_ITEM(intent));
- set_bit(XFS_LI_DIRTY, &budp->bud_item.li_flags);
-
buip = xfs_bui_init(tp->t_mountp);
memcpy(buip->bui_format.bui_extents, map, count * sizeof(*map));
atomic_set(&buip->bui_next_extent, count);
- xfs_trans_add_item(tp, &buip->bui_item);
- set_bit(XFS_LI_DIRTY, &buip->bui_item.li_flags);
+
return &buip->bui_item;
}
+const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
+ .name = "bmap",
+ .max_items = XFS_BUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_bmap_update_create_intent,
+ .abort_intent = xfs_bmap_update_abort_intent,
+ .create_done = xfs_bmap_update_create_done,
+ .finish_item = xfs_bmap_update_finish_item,
+ .cancel_item = xfs_bmap_update_cancel_item,
+ .recover_work = xfs_bmap_recover_work,
+ .relog_intent = xfs_bmap_relog_intent,
+};
+
+STATIC bool
+xfs_bui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return BUI_ITEM(lip)->bui_format.bui_id == intent_id;
+}
+
static const struct xfs_item_ops xfs_bui_item_ops = {
.flags = XFS_ITEM_INTENT,
.iop_size = xfs_bui_item_size,
.iop_format = xfs_bui_item_format,
.iop_unpin = xfs_bui_item_unpin,
.iop_release = xfs_bui_item_release,
- .iop_recover = xfs_bui_item_recover,
.iop_match = xfs_bui_item_match,
- .iop_relog = xfs_bui_item_relog,
};
static inline void
@@ -681,12 +648,9 @@ xlog_recover_bui_commit_pass2(
buip = xfs_bui_init(mp);
xfs_bui_copy_format(&buip->bui_format, bui_formatp);
atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
- /*
- * Insert the intent into the AIL directly and drop one reference so
- * that finishing or canceling the work will drop the other.
- */
- xfs_trans_ail_insert(log->l_ailp, &buip->bui_item, lsn);
- xfs_bui_release(buip);
+
+ xlog_recover_intent_item(log, &buip->bui_item, lsn,
+ &xfs_bmap_update_defer_type);
return 0;
}
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 731260a5af6d..c2531c28905c 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -69,147 +69,6 @@ xfs_zero_extent(
GFP_NOFS, 0);
}
-#ifdef CONFIG_XFS_RT
-int
-xfs_bmap_rtalloc(
- struct xfs_bmalloca *ap)
-{
- struct xfs_mount *mp = ap->ip->i_mount;
- xfs_fileoff_t orig_offset = ap->offset;
- xfs_rtxnum_t rtx;
- xfs_rtxlen_t prod = 0; /* product factor for allocators */
- xfs_extlen_t mod = 0; /* product factor for allocators */
- xfs_rtxlen_t ralen = 0; /* realtime allocation length */
- xfs_extlen_t align; /* minimum allocation alignment */
- xfs_extlen_t orig_length = ap->length;
- xfs_extlen_t minlen = mp->m_sb.sb_rextsize;
- xfs_rtxlen_t raminlen;
- bool rtlocked = false;
- bool ignore_locality = false;
- int error;
-
- align = xfs_get_extsz_hint(ap->ip);
-retry:
- prod = xfs_extlen_to_rtxlen(mp, align);
- error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
- align, 1, ap->eof, 0,
- ap->conv, &ap->offset, &ap->length);
- if (error)
- return error;
- ASSERT(ap->length);
- ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0);
-
- /*
- * If we shifted the file offset downward to satisfy an extent size
- * hint, increase minlen by that amount so that the allocator won't
- * give us an allocation that's too short to cover at least one of the
- * blocks that the caller asked for.
- */
- if (ap->offset != orig_offset)
- minlen += orig_offset - ap->offset;
-
- /*
- * If the offset & length are not perfectly aligned
- * then kill prod, it will just get us in trouble.
- */
- div_u64_rem(ap->offset, align, &mod);
- if (mod || ap->length % align)
- prod = 1;
- /*
- * Set ralen to be the actual requested length in rtextents.
- *
- * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that
- * we rounded up to it, cut it back so it's valid again.
- * Note that if it's a really large request (bigger than
- * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't
- * adjust the starting point to match it.
- */
- ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN));
-
- /*
- * Lock out modifications to both the RT bitmap and summary inodes
- */
- if (!rtlocked) {
- xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
- xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
- xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
- xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
- rtlocked = true;
- }
-
- /*
- * If it's an allocation to an empty file at offset 0,
- * pick an extent that will space things out in the rt area.
- */
- if (ap->eof && ap->offset == 0) {
- error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
- if (error)
- return error;
- ap->blkno = xfs_rtx_to_rtb(mp, rtx);
- } else {
- ap->blkno = 0;
- }
-
- xfs_bmap_adjacent(ap);
-
- /*
- * Realtime allocation, done through xfs_rtallocate_extent.
- */
- if (ignore_locality)
- rtx = 0;
- else
- rtx = xfs_rtb_to_rtx(mp, ap->blkno);
- raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen));
- error = xfs_rtallocate_extent(ap->tp, rtx, raminlen, ralen, &ralen,
- ap->wasdel, prod, &rtx);
- if (error)
- return error;
-
- if (rtx != NULLRTEXTNO) {
- ap->blkno = xfs_rtx_to_rtb(mp, rtx);
- ap->length = xfs_rtxlen_to_extlen(mp, ralen);
- ap->ip->i_nblocks += ap->length;
- xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
- if (ap->wasdel)
- ap->ip->i_delayed_blks -= ap->length;
- /*
- * Adjust the disk quota also. This was reserved
- * earlier.
- */
- xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
- ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
- XFS_TRANS_DQ_RTBCOUNT, ap->length);
- return 0;
- }
-
- if (align > mp->m_sb.sb_rextsize) {
- /*
- * We previously enlarged the request length to try to satisfy
- * an extent size hint. The allocator didn't return anything,
- * so reset the parameters to the original values and try again
- * without alignment criteria.
- */
- ap->offset = orig_offset;
- ap->length = orig_length;
- minlen = align = mp->m_sb.sb_rextsize;
- goto retry;
- }
-
- if (!ignore_locality && ap->blkno != 0) {
- /*
- * If we can't allocate near a specific rt extent, try again
- * without locality criteria.
- */
- ignore_locality = true;
- goto retry;
- }
-
- ap->blkno = NULLFSBLOCK;
- ap->length = 0;
- return 0;
-}
-#endif /* CONFIG_XFS_RT */
-
/*
* Extent tree block counting routines.
*/
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 6888078f5c31..77ecbb753ef2 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -47,7 +47,7 @@ int xfs_bmap_extsize_align(struct xfs_mount *mp, struct xfs_bmbt_irec *gotp,
struct xfs_bmbt_irec *prevp, xfs_extlen_t extsz,
int rt, int eof, int delay, int convert,
xfs_fileoff_t *offp, xfs_extlen_t *lenp);
-void xfs_bmap_adjacent(struct xfs_bmalloca *ap);
+bool xfs_bmap_adjacent(struct xfs_bmalloca *ap);
int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip,
int whichfork, struct xfs_bmbt_irec *rec,
int *is_empty);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 545c7991b9b5..8e5bd50d29fe 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -169,7 +169,7 @@ xfs_buf_stale(
atomic_set(&bp->b_lru_ref, 0);
if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
- (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
+ (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru)))
atomic_dec(&bp->b_hold);
ASSERT(atomic_read(&bp->b_hold) >= 1);
@@ -1047,7 +1047,7 @@ xfs_buf_rele(
* buffer for the LRU and clear the (now stale) dispose list
* state flag
*/
- if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+ if (list_lru_add_obj(&bp->b_target->bt_lru, &bp->b_lru)) {
bp->b_state &= ~XFS_BSTATE_DISPOSE;
atomic_inc(&bp->b_hold);
}
@@ -1060,7 +1060,7 @@ xfs_buf_rele(
* was on was the disposal list
*/
if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
- list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+ list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru);
} else {
ASSERT(list_empty(&bp->b_lru));
}
@@ -2049,6 +2049,14 @@ error_free:
return NULL;
}
+static inline void
+xfs_buf_list_del(
+ struct xfs_buf *bp)
+{
+ list_del_init(&bp->b_list);
+ wake_up_var(&bp->b_list);
+}
+
/*
* Cancel a delayed write list.
*
@@ -2066,7 +2074,7 @@ xfs_buf_delwri_cancel(
xfs_buf_lock(bp);
bp->b_flags &= ~_XBF_DELWRI_Q;
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
xfs_buf_relse(bp);
}
}
@@ -2120,6 +2128,34 @@ xfs_buf_delwri_queue(
}
/*
+ * Queue a buffer to this delwri list as part of a data integrity operation.
+ * If the buffer is on any other delwri list, we'll wait for that to clear
+ * so that the caller can submit the buffer for IO and wait for the result.
+ * Callers must ensure the buffer is not already on the list.
+ */
+void
+xfs_buf_delwri_queue_here(
+ struct xfs_buf *bp,
+ struct list_head *buffer_list)
+{
+ /*
+ * We need this buffer to end up on the /caller's/ delwri list, not any
+ * old list. This can happen if the buffer is marked stale (which
+ * clears DELWRI_Q) after the AIL queues the buffer to its list but
+ * before the AIL has a chance to submit the list.
+ */
+ while (!list_empty(&bp->b_list)) {
+ xfs_buf_unlock(bp);
+ wait_var_event(&bp->b_list, list_empty(&bp->b_list));
+ xfs_buf_lock(bp);
+ }
+
+ ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
+
+ xfs_buf_delwri_queue(bp, buffer_list);
+}
+
+/*
* Compare function is more complex than it needs to be because
* the return value is only 32 bits and we are doing comparisons
* on 64 bit values
@@ -2181,7 +2217,7 @@ xfs_buf_delwri_submit_buffers(
* reference and remove it from the list here.
*/
if (!(bp->b_flags & _XBF_DELWRI_Q)) {
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
xfs_buf_relse(bp);
continue;
}
@@ -2201,7 +2237,7 @@ xfs_buf_delwri_submit_buffers(
list_move_tail(&bp->b_list, wait_list);
} else {
bp->b_flags |= XBF_ASYNC;
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
}
__xfs_buf_submit(bp, false);
}
@@ -2255,7 +2291,7 @@ xfs_buf_delwri_submit(
while (!list_empty(&wait_list)) {
bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
- list_del_init(&bp->b_list);
+ xfs_buf_list_del(bp);
/*
* Wait on the locked buffer, check for errors and unlock and
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c86e16419656..b470de08a46c 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -319,6 +319,7 @@ extern void xfs_buf_stale(struct xfs_buf *bp);
/* Delayed Write Buffer Routines */
extern void xfs_buf_delwri_cancel(struct list_head *);
extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+void xfs_buf_delwri_queue_here(struct xfs_buf *bp, struct list_head *bl);
extern int xfs_buf_delwri_submit(struct list_head *);
extern int xfs_buf_delwri_submit_nowait(struct list_head *);
extern int xfs_buf_delwri_pushbuf(struct xfs_buf *, struct list_head *);
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index 9f3ceb461515..cc6dc56f455d 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -18,6 +18,7 @@
#include "xfs_bmap.h"
#include "xfs_trans.h"
#include "xfs_error.h"
+#include "xfs_health.h"
/*
* Directory file type support functions
@@ -51,7 +52,7 @@ xfs_dir2_sf_getdents(
struct xfs_mount *mp = dp->i_mount;
xfs_dir2_dataptr_t off; /* current entry's offset */
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
- xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
+ struct xfs_dir2_sf_hdr *sfp = dp->i_df.if_data;
xfs_dir2_dataptr_t dot_offset;
xfs_dir2_dataptr_t dotdot_offset;
xfs_ino_t ino;
@@ -59,9 +60,7 @@ xfs_dir2_sf_getdents(
ASSERT(dp->i_df.if_format == XFS_DINODE_FMT_LOCAL);
ASSERT(dp->i_df.if_bytes == dp->i_disk_size);
- ASSERT(dp->i_df.if_u1.if_data != NULL);
-
- sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
+ ASSERT(sfp != NULL);
/*
* If the block number in the offset is out of range, we're done.
@@ -519,6 +518,8 @@ xfs_readdir(
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return -EIO;
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
ASSERT(xfs_isilocked(dp, XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index a013b87ab8d5..b4f20d9c8f98 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -172,14 +172,14 @@ xfs_qm_adjust_dqtimers(
/*
* initialize a buffer full of dquots and log the whole thing
*/
-STATIC void
+void
xfs_qm_init_dquot_blk(
struct xfs_trans *tp,
- struct xfs_mount *mp,
xfs_dqid_t id,
xfs_dqtype_t type,
struct xfs_buf *bp)
{
+ struct xfs_mount *mp = tp->t_mountp;
struct xfs_quotainfo *q = mp->m_quotainfo;
struct xfs_dqblk *d;
xfs_dqid_t curid;
@@ -353,7 +353,7 @@ xfs_dquot_disk_alloc(
* Make a chunk of dquots out of this buffer and log
* the entire thing.
*/
- xfs_qm_init_dquot_blk(tp, mp, dqp->q_id, qtype, bp);
+ xfs_qm_init_dquot_blk(tp, dqp->q_id, qtype, bp);
xfs_buf_set_ref(bp, XFS_DQUOT_REF);
/*
@@ -1065,7 +1065,7 @@ xfs_qm_dqput(
struct xfs_quotainfo *qi = dqp->q_mount->m_quotainfo;
trace_xfs_dqput_free(dqp);
- if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
+ if (list_lru_add_obj(&qi->qi_lru, &dqp->q_lru))
XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
}
xfs_dqunlock(dqp);
@@ -1362,34 +1362,3 @@ xfs_qm_exit(void)
kmem_cache_destroy(xfs_dqtrx_cache);
kmem_cache_destroy(xfs_dquot_cache);
}
-
-/*
- * Iterate every dquot of a particular type. The caller must ensure that the
- * particular quota type is active. iter_fn can return negative error codes,
- * or -ECANCELED to indicate that it wants to stop iterating.
- */
-int
-xfs_qm_dqiterate(
- struct xfs_mount *mp,
- xfs_dqtype_t type,
- xfs_qm_dqiterate_fn iter_fn,
- void *priv)
-{
- struct xfs_dquot *dq;
- xfs_dqid_t id = 0;
- int error;
-
- do {
- error = xfs_qm_dqget_next(mp, id, type, &dq);
- if (error == -ENOENT)
- return 0;
- if (error)
- return error;
-
- error = iter_fn(dq, type, priv);
- id = dq->q_id + 1;
- xfs_qm_dqput(dq);
- } while (error == 0 && id != 0);
-
- return error;
-}
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 80c8f851a2f3..956272d9b302 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -234,12 +234,10 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
return dqp;
}
-typedef int (*xfs_qm_dqiterate_fn)(struct xfs_dquot *dq,
- xfs_dqtype_t type, void *priv);
-int xfs_qm_dqiterate(struct xfs_mount *mp, xfs_dqtype_t type,
- xfs_qm_dqiterate_fn iter_fn, void *priv);
-
time64_t xfs_dquot_set_timeout(struct xfs_mount *mp, time64_t timeout);
time64_t xfs_dquot_set_grace_period(time64_t grace);
+void xfs_qm_init_dquot_blk(struct xfs_trans *tp, xfs_dqid_t id, xfs_dqtype_t
+ type, struct xfs_buf *bp);
+
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 9ecfdcdc752f..2ccde32c9a9e 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -678,3 +678,16 @@ xfs_extent_busy_ag_cmp(
diff = b1->bno - b2->bno;
return diff;
}
+
+/* Are there any busy extents in this AG? */
+bool
+xfs_extent_busy_list_empty(
+ struct xfs_perag *pag)
+{
+ bool res;
+
+ spin_lock(&pag->pagb_lock);
+ res = RB_EMPTY_ROOT(&pag->pagb_tree);
+ spin_unlock(&pag->pagb_lock);
+ return res;
+}
diff --git a/fs/xfs/xfs_extent_busy.h b/fs/xfs/xfs_extent_busy.h
index 0639aab336f3..470032de3139 100644
--- a/fs/xfs/xfs_extent_busy.h
+++ b/fs/xfs/xfs_extent_busy.h
@@ -85,4 +85,6 @@ static inline void xfs_extent_busy_sort(struct list_head *list)
list_sort(NULL, list, xfs_extent_busy_ag_cmp);
}
+bool xfs_extent_busy_list_empty(struct xfs_perag *pag);
+
#endif /* __XFS_EXTENT_BUSY_H__ */
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 3fa8789820ad..1d1185fca6a5 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -304,39 +304,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = {
};
/*
- * Allocate an "extent free done" log item that will hold nextents worth of
- * extents. The caller must use all nextents extents, because we are not
- * flexible about this at all.
- */
-static struct xfs_efd_log_item *
-xfs_trans_get_efd(
- struct xfs_trans *tp,
- struct xfs_efi_log_item *efip,
- unsigned int nextents)
-{
- struct xfs_efd_log_item *efdp;
-
- ASSERT(nextents > 0);
-
- if (nextents > XFS_EFD_MAX_FAST_EXTENTS) {
- efdp = kzalloc(xfs_efd_log_item_sizeof(nextents),
- GFP_KERNEL | __GFP_NOFAIL);
- } else {
- efdp = kmem_cache_zalloc(xfs_efd_cache,
- GFP_KERNEL | __GFP_NOFAIL);
- }
-
- xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
- &xfs_efd_item_ops);
- efdp->efd_efip = efip;
- efdp->efd_format.efd_nextents = nextents;
- efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
-
- xfs_trans_add_item(tp, &efdp->efd_item);
- return efdp;
-}
-
-/*
* Fill the EFD with all extents from the EFI when we need to roll the
* transaction and continue with a new EFI.
*
@@ -364,69 +331,6 @@ xfs_efd_from_efi(
efdp->efd_next_extent = efip->efi_format.efi_nextents;
}
-/*
- * Free an extent and log it to the EFD. Note that the transaction is marked
- * dirty regardless of whether the extent free succeeds or fails to support the
- * EFI/EFD lifecycle rules.
- */
-static int
-xfs_trans_free_extent(
- struct xfs_trans *tp,
- struct xfs_efd_log_item *efdp,
- struct xfs_extent_free_item *xefi)
-{
- struct xfs_owner_info oinfo = { };
- struct xfs_mount *mp = tp->t_mountp;
- struct xfs_extent *extp;
- uint next_extent;
- xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp,
- xefi->xefi_startblock);
- int error;
-
- oinfo.oi_owner = xefi->xefi_owner;
- if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
- oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
- if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK)
- oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
-
- trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0,
- agbno, xefi->xefi_blockcount);
-
- error = __xfs_free_extent(tp, xefi->xefi_pag, agbno,
- xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv,
- xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the EFI and frees the EFD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
- set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
-
- /*
- * If we need a new transaction to make progress, the caller will log a
- * new EFI with the current contents. It will also log an EFD to cancel
- * the existing EFI, and so we need to copy all the unprocessed extents
- * in this EFI to the EFD so this works correctly.
- */
- if (error == -EAGAIN) {
- xfs_efd_from_efi(efdp);
- return error;
- }
-
- next_extent = efdp->efd_next_extent;
- ASSERT(next_extent < efdp->efd_format.efd_nextents);
- extp = &(efdp->efd_format.efd_extents[next_extent]);
- extp->ext_start = xefi->xefi_startblock;
- extp->ext_len = xefi->xefi_blockcount;
- efdp->efd_next_extent++;
-
- return error;
-}
-
/* Sort bmap items by AG. */
static int
xfs_extent_free_diff_items(
@@ -453,9 +357,6 @@ xfs_extent_free_log_item(
uint next_extent;
struct xfs_extent *extp;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
-
/*
* atomic_inc_return gives us the value after the increment;
* we want to use it as an array index so we need to subtract 1 from
@@ -481,7 +382,6 @@ xfs_extent_free_create_intent(
ASSERT(count > 0);
- xfs_trans_add_item(tp, &efip->efi_item);
if (sort)
list_sort(mp, items, xfs_extent_free_diff_items);
list_for_each_entry(xefi, items, xefi_list)
@@ -496,7 +396,26 @@ xfs_extent_free_create_done(
struct xfs_log_item *intent,
unsigned int count)
{
- return &xfs_trans_get_efd(tp, EFI_ITEM(intent), count)->efd_item;
+ struct xfs_efi_log_item *efip = EFI_ITEM(intent);
+ struct xfs_efd_log_item *efdp;
+
+ ASSERT(count > 0);
+
+ if (count > XFS_EFD_MAX_FAST_EXTENTS) {
+ efdp = kzalloc(xfs_efd_log_item_sizeof(count),
+ GFP_KERNEL | __GFP_NOFAIL);
+ } else {
+ efdp = kmem_cache_zalloc(xfs_efd_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ }
+
+ xfs_log_item_init(tp->t_mountp, &efdp->efd_item, XFS_LI_EFD,
+ &xfs_efd_item_ops);
+ efdp->efd_efip = efip;
+ efdp->efd_format.efd_nextents = count;
+ efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
+
+ return &efdp->efd_item;
}
/* Take a passive ref to the AG containing the space we're freeing. */
@@ -527,19 +446,49 @@ xfs_extent_free_finish_item(
struct list_head *item,
struct xfs_btree_cur **state)
{
+ struct xfs_owner_info oinfo = { };
struct xfs_extent_free_item *xefi;
- int error;
+ struct xfs_efd_log_item *efdp = EFD_ITEM(done);
+ struct xfs_mount *mp = tp->t_mountp;
+ struct xfs_extent *extp;
+ uint next_extent;
+ xfs_agblock_t agbno;
+ int error = 0;
xefi = container_of(item, struct xfs_extent_free_item, xefi_list);
+ agbno = XFS_FSB_TO_AGBNO(mp, xefi->xefi_startblock);
+
+ oinfo.oi_owner = xefi->xefi_owner;
+ if (xefi->xefi_flags & XFS_EFI_ATTR_FORK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_ATTR_FORK;
+ if (xefi->xefi_flags & XFS_EFI_BMBT_BLOCK)
+ oinfo.oi_flags |= XFS_OWNER_INFO_BMBT_BLOCK;
- error = xfs_trans_free_extent(tp, EFD_ITEM(done), xefi);
+ trace_xfs_bmap_free_deferred(tp->t_mountp, xefi->xefi_pag->pag_agno, 0,
+ agbno, xefi->xefi_blockcount);
/*
- * Don't free the XEFI if we need a new transaction to complete
- * processing of it.
+ * If we need a new transaction to make progress, the caller will log a
+ * new EFI with the current contents. It will also log an EFD to cancel
+ * the existing EFI, and so we need to copy all the unprocessed extents
+ * in this EFI to the EFD so this works correctly.
*/
- if (error == -EAGAIN)
+ if (!(xefi->xefi_flags & XFS_EFI_CANCELLED))
+ error = __xfs_free_extent(tp, xefi->xefi_pag, agbno,
+ xefi->xefi_blockcount, &oinfo, xefi->xefi_agresv,
+ xefi->xefi_flags & XFS_EFI_SKIP_DISCARD);
+ if (error == -EAGAIN) {
+ xfs_efd_from_efi(efdp);
return error;
+ }
+
+ /* Add the work we finished to the EFD, even though nobody uses that */
+ next_extent = efdp->efd_next_extent;
+ ASSERT(next_extent < efdp->efd_format.efd_nextents);
+ extp = &(efdp->efd_format.efd_extents[next_extent]);
+ extp->ext_start = xefi->xefi_startblock;
+ extp->ext_len = xefi->xefi_blockcount;
+ efdp->efd_next_extent++;
xfs_extent_free_put_group(xefi);
kmem_cache_free(xfs_extfree_item_cache, xefi);
@@ -567,15 +516,6 @@ xfs_extent_free_cancel_item(
kmem_cache_free(xfs_extfree_item_cache, xefi);
}
-const struct xfs_defer_op_type xfs_extent_free_defer_type = {
- .max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .create_intent = xfs_extent_free_create_intent,
- .abort_intent = xfs_extent_free_abort_intent,
- .create_done = xfs_extent_free_create_done,
- .finish_item = xfs_extent_free_finish_item,
- .cancel_item = xfs_extent_free_cancel_item,
-};
-
/*
* AGFL blocks are accounted differently in the reserve pools and are not
* inserted into the busy extent list.
@@ -610,16 +550,6 @@ xfs_agfl_free_finish_item(
error = xfs_free_agfl_block(tp, xefi->xefi_pag->pag_agno,
agbno, agbp, &oinfo);
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the EFI and frees the EFD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
-
next_extent = efdp->efd_next_extent;
ASSERT(next_extent < efdp->efd_format.efd_nextents);
extp = &(efdp->efd_format.efd_extents[next_extent]);
@@ -632,16 +562,6 @@ xfs_agfl_free_finish_item(
return error;
}
-/* sub-type with special handling for AGFL deferred frees */
-const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
- .max_items = XFS_EFI_MAX_FAST_EXTENTS,
- .create_intent = xfs_extent_free_create_intent,
- .abort_intent = xfs_extent_free_abort_intent,
- .create_done = xfs_extent_free_create_done,
- .finish_item = xfs_agfl_free_finish_item,
- .cancel_item = xfs_extent_free_cancel_item,
-};
-
/* Is this recovered EFI ok? */
static inline bool
xfs_efi_validate_ext(
@@ -651,23 +571,41 @@ xfs_efi_validate_ext(
return xfs_verify_fsbext(mp, extp->ext_start, extp->ext_len);
}
+static inline void
+xfs_efi_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct xfs_extent *extp)
+{
+ struct xfs_extent_free_item *xefi;
+
+ xefi = kmem_cache_zalloc(xfs_extfree_item_cache,
+ GFP_KERNEL | __GFP_NOFAIL);
+ xefi->xefi_startblock = extp->ext_start;
+ xefi->xefi_blockcount = extp->ext_len;
+ xefi->xefi_agresv = XFS_AG_RESV_NONE;
+ xefi->xefi_owner = XFS_RMAP_OWN_UNKNOWN;
+ xfs_extent_free_get_group(mp, xefi);
+
+ xfs_defer_add_item(dfp, &xefi->xefi_list);
+}
+
/*
* Process an extent free intent item that was recovered from
* the log. We need to free the extents that it describes.
*/
STATIC int
-xfs_efi_item_recover(
- struct xfs_log_item *lip,
+xfs_extent_free_recover_work(
+ struct xfs_defer_pending *dfp,
struct list_head *capture_list)
{
struct xfs_trans_res resv;
+ struct xfs_log_item *lip = dfp->dfp_intent;
struct xfs_efi_log_item *efip = EFI_ITEM(lip);
struct xfs_mount *mp = lip->li_log->l_mp;
- struct xfs_efd_log_item *efdp;
struct xfs_trans *tp;
int i;
int error = 0;
- bool requeue_only = false;
/*
* First check the validity of the extents described by the
@@ -682,55 +620,22 @@ xfs_efi_item_recover(
sizeof(efip->efi_format));
return -EFSCORRUPTED;
}
+
+ xfs_efi_recover_work(mp, dfp, &efip->efi_format.efi_extents[i]);
}
resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
error = xfs_trans_alloc(mp, &resv, 0, 0, 0, &tp);
if (error)
return error;
- efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
-
- for (i = 0; i < efip->efi_format.efi_nextents; i++) {
- struct xfs_extent_free_item fake = {
- .xefi_owner = XFS_RMAP_OWN_UNKNOWN,
- .xefi_agresv = XFS_AG_RESV_NONE,
- };
- struct xfs_extent *extp;
- extp = &efip->efi_format.efi_extents[i];
-
- fake.xefi_startblock = extp->ext_start;
- fake.xefi_blockcount = extp->ext_len;
-
- if (!requeue_only) {
- xfs_extent_free_get_group(mp, &fake);
- error = xfs_trans_free_extent(tp, efdp, &fake);
- xfs_extent_free_put_group(&fake);
- }
-
- /*
- * If we can't free the extent without potentially deadlocking,
- * requeue the rest of the extents to a new so that they get
- * run again later with a new transaction context.
- */
- if (error == -EAGAIN || requeue_only) {
- error = xfs_free_extent_later(tp, fake.xefi_startblock,
- fake.xefi_blockcount,
- &XFS_RMAP_OINFO_ANY_OWNER,
- fake.xefi_agresv);
- if (!error) {
- requeue_only = true;
- continue;
- }
- }
-
- if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- extp, sizeof(*extp));
- if (error)
- goto abort_error;
-
- }
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &efip->efi_format,
+ sizeof(efip->efi_format));
+ if (error)
+ goto abort_error;
return xfs_defer_ops_capture_and_commit(tp, capture_list);
@@ -739,21 +644,14 @@ abort_error:
return error;
}
-STATIC bool
-xfs_efi_item_match(
- struct xfs_log_item *lip,
- uint64_t intent_id)
-{
- return EFI_ITEM(lip)->efi_format.efi_id == intent_id;
-}
-
/* Relog an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_efi_item_relog(
+xfs_extent_free_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_efd_log_item *efdp;
+ struct xfs_efd_log_item *efdp = EFD_ITEM(done_item);
struct xfs_efi_log_item *efip;
struct xfs_extent *extp;
unsigned int count;
@@ -761,29 +659,56 @@ xfs_efi_item_relog(
count = EFI_ITEM(intent)->efi_format.efi_nextents;
extp = EFI_ITEM(intent)->efi_format.efi_extents;
- tp->t_flags |= XFS_TRANS_DIRTY;
- efdp = xfs_trans_get_efd(tp, EFI_ITEM(intent), count);
efdp->efd_next_extent = count;
memcpy(efdp->efd_format.efd_extents, extp, count * sizeof(*extp));
- set_bit(XFS_LI_DIRTY, &efdp->efd_item.li_flags);
efip = xfs_efi_init(tp->t_mountp, count);
memcpy(efip->efi_format.efi_extents, extp, count * sizeof(*extp));
atomic_set(&efip->efi_next_extent, count);
- xfs_trans_add_item(tp, &efip->efi_item);
- set_bit(XFS_LI_DIRTY, &efip->efi_item.li_flags);
+
return &efip->efi_item;
}
+const struct xfs_defer_op_type xfs_extent_free_defer_type = {
+ .name = "extent_free",
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_extent_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+ .recover_work = xfs_extent_free_recover_work,
+ .relog_intent = xfs_extent_free_relog_intent,
+};
+
+/* sub-type with special handling for AGFL deferred frees */
+const struct xfs_defer_op_type xfs_agfl_free_defer_type = {
+ .name = "agfl_free",
+ .max_items = XFS_EFI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_extent_free_create_intent,
+ .abort_intent = xfs_extent_free_abort_intent,
+ .create_done = xfs_extent_free_create_done,
+ .finish_item = xfs_agfl_free_finish_item,
+ .cancel_item = xfs_extent_free_cancel_item,
+ .recover_work = xfs_extent_free_recover_work,
+ .relog_intent = xfs_extent_free_relog_intent,
+};
+
+STATIC bool
+xfs_efi_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return EFI_ITEM(lip)->efi_format.efi_id == intent_id;
+}
+
static const struct xfs_item_ops xfs_efi_item_ops = {
.flags = XFS_ITEM_INTENT,
.iop_size = xfs_efi_item_size,
.iop_format = xfs_efi_item_format,
.iop_unpin = xfs_efi_item_unpin,
.iop_release = xfs_efi_item_release,
- .iop_recover = xfs_efi_item_recover,
.iop_match = xfs_efi_item_match,
- .iop_relog = xfs_efi_item_relog,
};
/*
@@ -820,12 +745,9 @@ xlog_recover_efi_commit_pass2(
return error;
}
atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
- /*
- * Insert the intent into the AIL directly and drop one reference so
- * that finishing or canceling the work will drop the other.
- */
- xfs_trans_ail_insert(log->l_ailp, &efip->efi_item, lsn);
- xfs_efi_release(efip);
+
+ xlog_recover_intent_item(log, &efip->efi_item, lsn,
+ &xfs_extent_free_defer_type);
return 0;
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 7cb75cb6b8e9..83f708f62ed9 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -134,6 +134,10 @@ xfs_growfs_data_private(
if (delta < 0 && nagcount < 2)
return -EINVAL;
+ /* No work to do */
+ if (delta == 0)
+ return 0;
+
oagcount = mp->m_sb.sb_agcount;
/* allocate the new per-ag structures */
if (nagcount > oagcount) {
@@ -153,7 +157,7 @@ xfs_growfs_data_private(
error = xfs_trans_alloc(mp, &M_RES(mp)->tr_growdata, -delta, 0,
0, &tp);
if (error)
- return error;
+ goto out_free_unused_perag;
last_pag = xfs_perag_get(mp, oagcount - 1);
if (delta > 0) {
@@ -227,6 +231,9 @@ xfs_growfs_data_private(
out_trans_cancel:
xfs_trans_cancel(tp);
+out_free_unused_perag:
+ if (nagcount > oagcount)
+ xfs_free_unused_perag_range(mp, oagcount, nagcount);
return error;
}
@@ -344,59 +351,20 @@ xfs_growfs_log(
}
/*
- * exported through ioctl XFS_IOC_FSCOUNTS
- */
-
-void
-xfs_fs_counts(
- xfs_mount_t *mp,
- xfs_fsop_counts_t *cnt)
-{
- cnt->allocino = percpu_counter_read_positive(&mp->m_icount);
- cnt->freeino = percpu_counter_read_positive(&mp->m_ifree);
- cnt->freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
- xfs_fdblocks_unavailable(mp);
- cnt->freertx = percpu_counter_read_positive(&mp->m_frextents);
-}
-
-/*
- * exported through ioctl XFS_IOC_SET_RESBLKS & XFS_IOC_GET_RESBLKS
- *
- * xfs_reserve_blocks is called to set m_resblks
- * in the in-core mount table. The number of unused reserved blocks
- * is kept in m_resblks_avail.
- *
* Reserve the requested number of blocks if available. Otherwise return
* as many as possible to satisfy the request. The actual number
- * reserved are returned in outval
- *
- * A null inval pointer indicates that only the current reserved blocks
- * available should be returned no settings are changed.
+ * reserved are returned in outval.
*/
-
int
xfs_reserve_blocks(
- xfs_mount_t *mp,
- uint64_t *inval,
- xfs_fsop_resblks_t *outval)
+ struct xfs_mount *mp,
+ uint64_t request)
{
int64_t lcounter, delta;
int64_t fdblks_delta = 0;
- uint64_t request;
int64_t free;
int error = 0;
- /* If inval is null, report current values and return */
- if (inval == (uint64_t *)NULL) {
- if (!outval)
- return -EINVAL;
- outval->resblks = mp->m_resblks;
- outval->resblks_avail = mp->m_resblks_avail;
- return 0;
- }
-
- request = *inval;
-
/*
* With per-cpu counters, this becomes an interesting problem. we need
* to work out if we are freeing or allocation blocks first, then we can
@@ -466,11 +434,6 @@ xfs_reserve_blocks(
spin_lock(&mp->m_sb_lock);
}
out:
- if (outval) {
- outval->resblks = mp->m_resblks;
- outval->resblks_avail = mp->m_resblks_avail;
- }
-
spin_unlock(&mp->m_sb_lock);
return error;
}
@@ -482,9 +445,9 @@ xfs_fs_goingdown(
{
switch (inflags) {
case XFS_FSOP_GOING_FLAGS_DEFAULT: {
- if (!freeze_bdev(mp->m_super->s_bdev)) {
+ if (!bdev_freeze(mp->m_super->s_bdev)) {
xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
- thaw_bdev(mp->m_super->s_bdev);
+ bdev_thaw(mp->m_super->s_bdev);
}
break;
}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index 2cffe51a31e8..44457b0a0593 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -6,14 +6,12 @@
#ifndef __XFS_FSOPS_H__
#define __XFS_FSOPS_H__
-extern int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
-extern int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
-extern void xfs_fs_counts(xfs_mount_t *mp, xfs_fsop_counts_t *cnt);
-extern int xfs_reserve_blocks(xfs_mount_t *mp, uint64_t *inval,
- xfs_fsop_resblks_t *outval);
-extern int xfs_fs_goingdown(xfs_mount_t *mp, uint32_t inflags);
+int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in);
+int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in);
+int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request);
+int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags);
-extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
-extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
+int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
+int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
#endif /* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 9edc1f2bc939..f18fec0adf66 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -44,4 +44,16 @@ struct xfs_globals xfs_globals = {
.pwork_threads = -1, /* automatic thread detection */
.larp = false, /* log attribute replay */
#endif
+
+ /*
+ * Leave this many record slots empty when bulk loading btrees. By
+ * default we load new btree leaf blocks 75% full.
+ */
+ .bload_leaf_slack = -1,
+
+ /*
+ * Leave this many key/ptr slots empty when bulk loading btrees. By
+ * default we load new btree node blocks 75% full.
+ */
+ .bload_node_slack = -1,
};
diff --git a/fs/xfs/xfs_health.c b/fs/xfs/xfs_health.c
index 72a075bb2c10..9a57afee9338 100644
--- a/fs/xfs/xfs_health.c
+++ b/fs/xfs/xfs_health.c
@@ -222,7 +222,7 @@ xfs_inode_mark_sick(
struct xfs_inode *ip,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+ ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED)));
trace_xfs_inode_mark_sick(ip, mask);
spin_lock(&ip->i_flags_lock);
@@ -246,7 +246,7 @@ xfs_inode_mark_healthy(
struct xfs_inode *ip,
unsigned int mask)
{
- ASSERT(!(mask & ~XFS_SICK_INO_PRIMARY));
+ ASSERT(!(mask & ~(XFS_SICK_INO_PRIMARY | XFS_SICK_INO_ZAPPED)));
trace_xfs_inode_mark_healthy(ip, mask);
spin_lock(&ip->i_flags_lock);
@@ -369,6 +369,10 @@ static const struct ioctl_sick_map ino_map[] = {
{ XFS_SICK_INO_XATTR, XFS_BS_SICK_XATTR },
{ XFS_SICK_INO_SYMLINK, XFS_BS_SICK_SYMLINK },
{ XFS_SICK_INO_PARENT, XFS_BS_SICK_PARENT },
+ { XFS_SICK_INO_BMBTD_ZAPPED, XFS_BS_SICK_BMBTD },
+ { XFS_SICK_INO_BMBTA_ZAPPED, XFS_BS_SICK_BMBTA },
+ { XFS_SICK_INO_DIR_ZAPPED, XFS_BS_SICK_DIR },
+ { XFS_SICK_INO_SYMLINK_ZAPPED, XFS_BS_SICK_SYMLINK },
{ 0, 0 },
};
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index c0f1c89786c2..1fd94958aa97 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -37,15 +37,10 @@
#include "xfs_reflink.h"
#include "xfs_ag.h"
#include "xfs_log_priv.h"
+#include "xfs_health.h"
struct kmem_cache *xfs_inode_cache;
-/*
- * Used in xfs_itruncate_extents(). This is the maximum number of extents
- * freed from a file in a single transaction.
- */
-#define XFS_ITRUNC_MAX_EXTENTS 2
-
STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
STATIC int xfs_iunlink_remove(struct xfs_trans *tp, struct xfs_perag *pag,
struct xfs_inode *);
@@ -661,6 +656,8 @@ xfs_lookup(
if (xfs_is_shutdown(dp->i_mount))
return -EIO;
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return -EIO;
error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
if (error)
@@ -875,7 +872,7 @@ xfs_init_new_inode(
case S_IFLNK:
ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
ip->i_df.if_bytes = 0;
- ip->i_df.if_u1.if_root = NULL;
+ ip->i_df.if_data = NULL;
break;
default:
ASSERT(0);
@@ -978,6 +975,8 @@ xfs_create(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return -EIO;
prid = xfs_get_initial_prid(dp);
@@ -1217,6 +1216,8 @@ xfs_link(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_ifork_zapped(tdp, XFS_DATA_FORK))
+ return -EIO;
error = xfs_qm_dqattach(sip);
if (error)
@@ -1339,7 +1340,6 @@ xfs_itruncate_extents_flags(
struct xfs_mount *mp = ip->i_mount;
struct xfs_trans *tp = *tpp;
xfs_fileoff_t first_unmap_block;
- xfs_filblks_t unmap_len;
int error = 0;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -1371,19 +1371,10 @@ xfs_itruncate_extents_flags(
return 0;
}
- unmap_len = XFS_MAX_FILEOFF - first_unmap_block + 1;
- while (unmap_len > 0) {
- ASSERT(tp->t_highest_agno == NULLAGNUMBER);
- error = __xfs_bunmapi(tp, ip, first_unmap_block, &unmap_len,
- flags, XFS_ITRUNC_MAX_EXTENTS);
- if (error)
- goto out;
-
- /* free the just unmapped extents */
- error = xfs_defer_finish(&tp);
- if (error)
- goto out;
- }
+ error = xfs_bunmapi_range(&tp, ip, flags, first_unmap_block,
+ XFS_MAX_FILEOFF);
+ if (error)
+ goto out;
if (whichfork == XFS_DATA_FORK) {
/* Remove all pending CoW reservations. */
@@ -2387,8 +2378,8 @@ xfs_ifree(
* already been freed by xfs_attr_inactive.
*/
if (ip->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
- kmem_free(ip->i_df.if_u1.if_data);
- ip->i_df.if_u1.if_data = NULL;
+ kmem_free(ip->i_df.if_data);
+ ip->i_df.if_data = NULL;
ip->i_df.if_bytes = 0;
}
@@ -2506,6 +2497,8 @@ xfs_remove(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
+ return -EIO;
error = xfs_qm_dqattach(dp);
if (error)
@@ -3758,3 +3751,29 @@ xfs_inode_reload_unlinked(
return error;
}
+
+/* Has this inode fork been zapped by repair? */
+bool
+xfs_ifork_zapped(
+ const struct xfs_inode *ip,
+ int whichfork)
+{
+ unsigned int datamask = 0;
+
+ switch (whichfork) {
+ case XFS_DATA_FORK:
+ switch (ip->i_vnode.i_mode & S_IFMT) {
+ case S_IFDIR:
+ datamask = XFS_SICK_INO_DIR_ZAPPED;
+ break;
+ case S_IFLNK:
+ datamask = XFS_SICK_INO_SYMLINK_ZAPPED;
+ break;
+ }
+ return ip->i_sick & (XFS_SICK_INO_BMBTD_ZAPPED | datamask);
+ case XFS_ATTR_FORK:
+ return ip->i_sick & XFS_SICK_INO_BMBTA_ZAPPED;
+ default:
+ return false;
+ }
+}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 3beb470f1892..97f63bacd4c2 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -622,4 +622,6 @@ xfs_inode_unlinked_incomplete(
int xfs_inode_reload_unlinked_bucket(struct xfs_trans *tp, struct xfs_inode *ip);
int xfs_inode_reload_unlinked(struct xfs_inode *ip);
+bool xfs_ifork_zapped(const struct xfs_inode *ip, int whichfork);
+
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index cd7803fda8b1..0aee97ba0be8 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -352,11 +352,10 @@ xfs_inode_item_format_data_fork(
~(XFS_ILOG_DEXT | XFS_ILOG_DBROOT | XFS_ILOG_DEV);
if ((iip->ili_fields & XFS_ILOG_DDATA) &&
ip->i_df.if_bytes > 0) {
- ASSERT(ip->i_df.if_u1.if_data != NULL);
+ ASSERT(ip->i_df.if_data != NULL);
ASSERT(ip->i_disk_size > 0);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_ILOCAL,
- ip->i_df.if_u1.if_data,
- ip->i_df.if_bytes);
+ ip->i_df.if_data, ip->i_df.if_bytes);
ilf->ilf_dsize = (unsigned)ip->i_df.if_bytes;
ilf->ilf_size++;
} else {
@@ -431,10 +430,9 @@ xfs_inode_item_format_attr_fork(
if ((iip->ili_fields & XFS_ILOG_ADATA) &&
ip->i_af.if_bytes > 0) {
- ASSERT(ip->i_af.if_u1.if_data != NULL);
+ ASSERT(ip->i_af.if_data != NULL);
xlog_copy_iovec(lv, vecp, XLOG_REG_TYPE_IATTR_LOCAL,
- ip->i_af.if_u1.if_data,
- ip->i_af.if_bytes);
+ ip->i_af.if_data, ip->i_af.if_bytes);
ilf->ilf_asize = (unsigned)ip->i_af.if_bytes;
ilf->ilf_size++;
} else {
@@ -557,6 +555,9 @@ xfs_inode_to_log_dinode(
memset(to->di_pad2, 0, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &ip->i_mount->m_sb.sb_meta_uuid);
to->di_v3_pad = 0;
+
+ /* dummy value for initialisation */
+ to->di_crc = 0;
} else {
to->di_version = 2;
to->di_flushiter = ip->i_flushiter;
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 6c3919687ea6..f02b6e558af5 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -1872,6 +1872,63 @@ xfs_fs_eofblocks_from_user(
return 0;
}
+static int
+xfs_ioctl_getset_resblocks(
+ struct file *filp,
+ unsigned int cmd,
+ void __user *arg)
+{
+ struct xfs_mount *mp = XFS_I(file_inode(filp))->i_mount;
+ struct xfs_fsop_resblks fsop = { };
+ int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (cmd == XFS_IOC_SET_RESBLKS) {
+ if (xfs_is_readonly(mp))
+ return -EROFS;
+
+ if (copy_from_user(&fsop, arg, sizeof(fsop)))
+ return -EFAULT;
+
+ error = mnt_want_write_file(filp);
+ if (error)
+ return error;
+ error = xfs_reserve_blocks(mp, fsop.resblks);
+ mnt_drop_write_file(filp);
+ if (error)
+ return error;
+ }
+
+ spin_lock(&mp->m_sb_lock);
+ fsop.resblks = mp->m_resblks;
+ fsop.resblks_avail = mp->m_resblks_avail;
+ spin_unlock(&mp->m_sb_lock);
+
+ if (copy_to_user(arg, &fsop, sizeof(fsop)))
+ return -EFAULT;
+ return 0;
+}
+
+static int
+xfs_ioctl_fs_counts(
+ struct xfs_mount *mp,
+ struct xfs_fsop_counts __user *uarg)
+{
+ struct xfs_fsop_counts out = {
+ .allocino = percpu_counter_read_positive(&mp->m_icount),
+ .freeino = percpu_counter_read_positive(&mp->m_ifree),
+ .freedata = percpu_counter_read_positive(&mp->m_fdblocks) -
+ xfs_fdblocks_unavailable(mp),
+ .freertx = percpu_counter_read_positive(&mp->m_frextents),
+ };
+
+ if (copy_to_user(uarg, &out, sizeof(out)))
+ return -EFAULT;
+ return 0;
+}
+
/*
* These long-unused ioctls were removed from the official ioctl API in 5.17,
* but retain these definitions so that we can log warnings about them.
@@ -2008,60 +2065,12 @@ xfs_file_ioctl(
return error;
}
- case XFS_IOC_FSCOUNTS: {
- xfs_fsop_counts_t out;
+ case XFS_IOC_FSCOUNTS:
+ return xfs_ioctl_fs_counts(mp, arg);
- xfs_fs_counts(mp, &out);
-
- if (copy_to_user(arg, &out, sizeof(out)))
- return -EFAULT;
- return 0;
- }
-
- case XFS_IOC_SET_RESBLKS: {
- xfs_fsop_resblks_t inout;
- uint64_t in;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (xfs_is_readonly(mp))
- return -EROFS;
-
- if (copy_from_user(&inout, arg, sizeof(inout)))
- return -EFAULT;
-
- error = mnt_want_write_file(filp);
- if (error)
- return error;
-
- /* input parameter is passed in resblks field of structure */
- in = inout.resblks;
- error = xfs_reserve_blocks(mp, &in, &inout);
- mnt_drop_write_file(filp);
- if (error)
- return error;
-
- if (copy_to_user(arg, &inout, sizeof(inout)))
- return -EFAULT;
- return 0;
- }
-
- case XFS_IOC_GET_RESBLKS: {
- xfs_fsop_resblks_t out;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- error = xfs_reserve_blocks(mp, NULL, &out);
- if (error)
- return error;
-
- if (copy_to_user(arg, &out, sizeof(out)))
- return -EFAULT;
-
- return 0;
- }
+ case XFS_IOC_SET_RESBLKS:
+ case XFS_IOC_GET_RESBLKS:
+ return xfs_ioctl_getset_resblocks(filp, cmd, arg);
case XFS_IOC_FSGROWFSDATA: {
struct xfs_growfs_data in;
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index ee206facf0dc..a1650fc81382 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1542,6 +1542,7 @@ xlog_alloc_log(
log->l_covered_state = XLOG_STATE_COVER_IDLE;
set_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
+ INIT_LIST_HEAD(&log->r_dfops);
log->l_prev_block = -1;
/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index fa3ad1d7b31c..e30c06ec20e3 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -407,6 +407,7 @@ struct xlog {
long l_opstate; /* operational state */
uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
struct list_head *l_buf_cancel_table;
+ struct list_head r_dfops; /* recovered log intent items */
int l_iclog_hsize; /* size of iclog header */
int l_iclog_heads; /* # of iclog header sectors */
uint l_sectBBsize; /* sector size in BBs (2^n) */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a1e18b24971a..1251c81e55f9 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1723,30 +1723,24 @@ xlog_clear_stale_blocks(
*/
void
xlog_recover_release_intent(
- struct xlog *log,
- unsigned short intent_type,
- uint64_t intent_id)
+ struct xlog *log,
+ unsigned short intent_type,
+ uint64_t intent_id)
{
- struct xfs_ail_cursor cur;
- struct xfs_log_item *lip;
- struct xfs_ail *ailp = log->l_ailp;
+ struct xfs_defer_pending *dfp, *n;
+
+ list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
+ struct xfs_log_item *lip = dfp->dfp_intent;
- spin_lock(&ailp->ail_lock);
- for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0); lip != NULL;
- lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
if (lip->li_type != intent_type)
continue;
if (!lip->li_ops->iop_match(lip, intent_id))
continue;
- spin_unlock(&ailp->ail_lock);
- lip->li_ops->iop_release(lip);
- spin_lock(&ailp->ail_lock);
- break;
- }
+ ASSERT(xlog_item_is_intent(lip));
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
+ xfs_defer_cancel_recovery(log->l_mp, dfp);
+ }
}
int
@@ -1939,6 +1933,29 @@ xlog_buf_readahead(
xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
}
+/*
+ * Create a deferred work structure for resuming and tracking the progress of a
+ * log intent item that was found during recovery.
+ */
+void
+xlog_recover_intent_item(
+ struct xlog *log,
+ struct xfs_log_item *lip,
+ xfs_lsn_t lsn,
+ const struct xfs_defer_op_type *ops)
+{
+ ASSERT(xlog_item_is_intent(lip));
+
+ xfs_defer_start_recovery(lip, &log->r_dfops, ops);
+
+ /*
+ * Insert the intent into the AIL directly and drop one reference so
+ * that finishing or canceling the work will drop the other.
+ */
+ xfs_trans_ail_insert(log->l_ailp, lip, lsn);
+ lip->li_ops->iop_unpin(lip, 0);
+}
+
STATIC int
xlog_recover_items_pass2(
struct xlog *log,
@@ -2533,36 +2550,26 @@ xlog_abort_defer_ops(
*/
STATIC int
xlog_recover_process_intents(
- struct xlog *log)
+ struct xlog *log)
{
LIST_HEAD(capture_list);
- struct xfs_ail_cursor cur;
- struct xfs_log_item *lip;
- struct xfs_ail *ailp;
- int error = 0;
+ struct xfs_defer_pending *dfp, *n;
+ int error = 0;
#if defined(DEBUG) || defined(XFS_WARN)
- xfs_lsn_t last_lsn;
-#endif
+ xfs_lsn_t last_lsn;
- ailp = log->l_ailp;
- spin_lock(&ailp->ail_lock);
-#if defined(DEBUG) || defined(XFS_WARN)
last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
#endif
- for (lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- lip != NULL;
- lip = xfs_trans_ail_cursor_next(ailp, &cur)) {
- const struct xfs_item_ops *ops;
- if (!xlog_item_is_intent(lip))
- break;
+ list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
+ ASSERT(xlog_item_is_intent(dfp->dfp_intent));
/*
* We should never see a redo item with a LSN higher than
* the last transaction we found in the log at the start
* of recovery.
*/
- ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
+ ASSERT(XFS_LSN_CMP(last_lsn, dfp->dfp_intent->li_lsn) >= 0);
/*
* NOTE: If your intent processing routine can create more
@@ -2571,21 +2578,14 @@ xlog_recover_process_intents(
* replayed in the wrong order!
*
* The recovery function can free the log item, so we must not
- * access lip after it returns.
+ * access dfp->dfp_intent after it returns. It must dispose of
+ * @dfp if it returns 0.
*/
- spin_unlock(&ailp->ail_lock);
- ops = lip->li_ops;
- error = ops->iop_recover(lip, &capture_list);
- spin_lock(&ailp->ail_lock);
- if (error) {
- trace_xlog_intent_recovery_failed(log->l_mp, error,
- ops->iop_recover);
+ error = xfs_defer_finish_recovery(log->l_mp, dfp,
+ &capture_list);
+ if (error)
break;
- }
}
-
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
if (error)
goto err;
@@ -2606,27 +2606,34 @@ err:
*/
STATIC void
xlog_recover_cancel_intents(
- struct xlog *log)
+ struct xlog *log)
{
- struct xfs_log_item *lip;
- struct xfs_ail_cursor cur;
- struct xfs_ail *ailp;
-
- ailp = log->l_ailp;
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
- while (lip != NULL) {
- if (!xlog_item_is_intent(lip))
- break;
+ struct xfs_defer_pending *dfp, *n;
- spin_unlock(&ailp->ail_lock);
- lip->li_ops->iop_release(lip);
- spin_lock(&ailp->ail_lock);
- lip = xfs_trans_ail_cursor_next(ailp, &cur);
+ list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
+ ASSERT(xlog_item_is_intent(dfp->dfp_intent));
+
+ xfs_defer_cancel_recovery(log->l_mp, dfp);
}
+}
- xfs_trans_ail_cursor_done(&cur);
- spin_unlock(&ailp->ail_lock);
+/*
+ * Transfer ownership of the recovered pending work to the recovery transaction
+ * and try to finish the work. If there is more work to be done, the dfp will
+ * remain attached to the transaction. If not, the dfp is freed.
+ */
+int
+xlog_recover_finish_intent(
+ struct xfs_trans *tp,
+ struct xfs_defer_pending *dfp)
+{
+ int error;
+
+ list_move(&dfp->dfp_list, &tp->t_dfops);
+ error = xfs_defer_finish_one(tp, dfp);
+ if (error == -EAGAIN)
+ return 0;
+ return error;
}
/*
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index aed5be5508fe..aabb25dc3efa 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -637,7 +637,6 @@ xfs_mountfs(
struct xfs_sb *sbp = &(mp->m_sb);
struct xfs_inode *rip;
struct xfs_ino_geometry *igeo = M_IGEO(mp);
- uint64_t resblks;
uint quotamount = 0;
uint quotaflags = 0;
int error = 0;
@@ -974,8 +973,7 @@ xfs_mountfs(
* we were already there on the last unmount. Warn if this occurs.
*/
if (!xfs_is_readonly(mp)) {
- resblks = xfs_default_resblks(mp);
- error = xfs_reserve_blocks(mp, &resblks, NULL);
+ error = xfs_reserve_blocks(mp, xfs_default_resblks(mp));
if (error)
xfs_warn(mp,
"Unable to allocate reserve blocks. Continuing without reserve pool.");
@@ -1053,7 +1051,6 @@ void
xfs_unmountfs(
struct xfs_mount *mp)
{
- uint64_t resblks;
int error;
/*
@@ -1090,8 +1087,7 @@ xfs_unmountfs(
* we only every apply deltas to the superblock and hence the incore
* value does not matter....
*/
- resblks = 0;
- error = xfs_reserve_blocks(mp, &resblks, NULL);
+ error = xfs_reserve_blocks(mp, 0);
if (error)
xfs_warn(mp, "Unable to free reserved block pool. "
"Freespace may not be correct on next mount.");
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index a7daa522e00f..fa50e5308292 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -22,6 +22,7 @@
#include <linux/mm.h>
#include <linux/dax.h>
+#include <linux/fs.h>
struct xfs_failure_info {
xfs_agblock_t startblock;
@@ -73,10 +74,16 @@ xfs_dax_failure_fn(
struct xfs_mount *mp = cur->bc_mp;
struct xfs_inode *ip;
struct xfs_failure_info *notify = data;
+ struct address_space *mapping;
+ pgoff_t pgoff;
+ unsigned long pgcnt;
int error = 0;
if (XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) ||
(rec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))) {
+ /* Continue the query because this isn't a failure. */
+ if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+ return 0;
notify->want_shutdown = true;
return 0;
}
@@ -92,15 +99,61 @@ xfs_dax_failure_fn(
return 0;
}
- error = mf_dax_kill_procs(VFS_I(ip)->i_mapping,
- xfs_failure_pgoff(mp, rec, notify),
- xfs_failure_pgcnt(mp, rec, notify),
- notify->mf_flags);
+ mapping = VFS_I(ip)->i_mapping;
+ pgoff = xfs_failure_pgoff(mp, rec, notify);
+ pgcnt = xfs_failure_pgcnt(mp, rec, notify);
+
+ /* Continue the rmap query if the inode isn't a dax file. */
+ if (dax_mapping(mapping))
+ error = mf_dax_kill_procs(mapping, pgoff, pgcnt,
+ notify->mf_flags);
+
+ /* Invalidate the cache in dax pages. */
+ if (notify->mf_flags & MF_MEM_PRE_REMOVE)
+ invalidate_inode_pages2_range(mapping, pgoff,
+ pgoff + pgcnt - 1);
+
xfs_irele(ip);
return error;
}
static int
+xfs_dax_notify_failure_freeze(
+ struct xfs_mount *mp)
+{
+ struct super_block *sb = mp->m_super;
+ int error;
+
+ error = freeze_super(sb, FREEZE_HOLDER_KERNEL);
+ if (error)
+ xfs_emerg(mp, "already frozen by kernel, err=%d", error);
+
+ return error;
+}
+
+static void
+xfs_dax_notify_failure_thaw(
+ struct xfs_mount *mp,
+ bool kernel_frozen)
+{
+ struct super_block *sb = mp->m_super;
+ int error;
+
+ if (kernel_frozen) {
+ error = thaw_super(sb, FREEZE_HOLDER_KERNEL);
+ if (error)
+ xfs_emerg(mp, "still frozen after notify failure, err=%d",
+ error);
+ }
+
+ /*
+ * Also thaw userspace call anyway because the device is about to be
+ * removed immediately.
+ */
+ thaw_super(sb, FREEZE_HOLDER_USERSPACE);
+}
+
+static int
xfs_dax_notify_ddev_failure(
struct xfs_mount *mp,
xfs_daddr_t daddr,
@@ -112,15 +165,29 @@ xfs_dax_notify_ddev_failure(
struct xfs_btree_cur *cur = NULL;
struct xfs_buf *agf_bp = NULL;
int error = 0;
+ bool kernel_frozen = false;
xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, daddr);
xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, fsbno);
xfs_fsblock_t end_fsbno = XFS_DADDR_TO_FSB(mp,
daddr + bblen - 1);
xfs_agnumber_t end_agno = XFS_FSB_TO_AGNO(mp, end_fsbno);
+ if (mf_flags & MF_MEM_PRE_REMOVE) {
+ xfs_info(mp, "Device is about to be removed!");
+ /*
+ * Freeze fs to prevent new mappings from being created.
+ * - Keep going on if others already hold the kernel forzen.
+ * - Keep going on if other errors too because this device is
+ * starting to fail.
+ * - If kernel frozen state is hold successfully here, thaw it
+ * here as well at the end.
+ */
+ kernel_frozen = xfs_dax_notify_failure_freeze(mp) == 0;
+ }
+
error = xfs_trans_alloc_empty(mp, &tp);
if (error)
- return error;
+ goto out;
for (; agno <= end_agno; agno++) {
struct xfs_rmap_irec ri_low = { };
@@ -165,11 +232,26 @@ xfs_dax_notify_ddev_failure(
}
xfs_trans_cancel(tp);
- if (error || notify.want_shutdown) {
+
+ /*
+ * Shutdown fs from a force umount in pre-remove case which won't fail,
+ * so errors can be ignored. Otherwise, shutdown the filesystem with
+ * CORRUPT flag if error occured or notify.want_shutdown was set during
+ * RMAP querying.
+ */
+ if (mf_flags & MF_MEM_PRE_REMOVE)
+ xfs_force_shutdown(mp, SHUTDOWN_FORCE_UMOUNT);
+ else if (error || notify.want_shutdown) {
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
if (!error)
error = -EFSCORRUPTED;
}
+
+out:
+ /* Thaw the fs if it has been frozen before. */
+ if (mf_flags & MF_MEM_PRE_REMOVE)
+ xfs_dax_notify_failure_thaw(mp, kernel_frozen);
+
return error;
}
@@ -197,6 +279,14 @@ xfs_dax_notify_failure(
if (mp->m_logdev_targp && mp->m_logdev_targp->bt_daxdev == dax_dev &&
mp->m_logdev_targp != mp->m_ddev_targp) {
+ /*
+ * In the pre-remove case the failure notification is attempting
+ * to trigger a force unmount. The expectation is that the
+ * device is still present, but its removal is in progress and
+ * can not be cancelled, proceed with accessing the log device.
+ */
+ if (mf_flags & MF_MEM_PRE_REMOVE)
+ return 0;
xfs_err(mp, "ondisk log corrupt, shutting down fs!");
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_ONDISK);
return -EFSCORRUPTED;
@@ -210,6 +300,12 @@ xfs_dax_notify_failure(
ddev_start = mp->m_ddev_targp->bt_dax_part_off;
ddev_end = ddev_start + bdev_nr_bytes(mp->m_ddev_targp->bt_bdev) - 1;
+ /* Notify failure on the whole device. */
+ if (offset == 0 && len == U64_MAX) {
+ offset = ddev_start;
+ len = bdev_nr_bytes(mp->m_ddev_targp->bt_bdev);
+ }
+
/* Ignore the range out of filesystem area */
if (offset + len - 1 < ddev_start)
return -ENXIO;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 94a7932ac570..67d0a8564ff3 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -171,7 +171,7 @@ xfs_qm_dqpurge(
* hits zero, so it really should be on the freelist here.
*/
ASSERT(!list_empty(&dqp->q_lru));
- list_lru_del(&qi->qi_lru, &dqp->q_lru);
+ list_lru_del_obj(&qi->qi_lru, &dqp->q_lru);
XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
xfs_qm_dqdestroy(dqp);
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index dcc785fdd345..e0d56489f3b2 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -127,7 +127,10 @@ xfs_qm_vop_dqalloc(struct xfs_inode *ip, kuid_t kuid, kgid_t kgid,
}
#define xfs_trans_dup_dqinfo(tp, tp2)
#define xfs_trans_free_dqinfo(tp)
-#define xfs_trans_mod_dquot_byino(tp, ip, fields, delta) do { } while (0)
+static inline void xfs_trans_mod_dquot_byino(struct xfs_trans *tp,
+ struct xfs_inode *ip, uint field, int64_t delta)
+{
+}
#define xfs_trans_apply_dquot_deltas(tp)
#define xfs_trans_unreserve_and_mod_dquots(tp)
static inline int xfs_trans_reserve_quota_nblks(struct xfs_trans *tp,
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
index 2d4444d61e98..20ad8086da60 100644
--- a/fs/xfs/xfs_refcount_item.c
+++ b/fs/xfs/xfs_refcount_item.c
@@ -227,52 +227,6 @@ static const struct xfs_item_ops xfs_cud_item_ops = {
.iop_intent = xfs_cud_item_intent,
};
-static struct xfs_cud_log_item *
-xfs_trans_get_cud(
- struct xfs_trans *tp,
- struct xfs_cui_log_item *cuip)
-{
- struct xfs_cud_log_item *cudp;
-
- cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
- &xfs_cud_item_ops);
- cudp->cud_cuip = cuip;
- cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
-
- xfs_trans_add_item(tp, &cudp->cud_item);
- return cudp;
-}
-
-/*
- * Finish an refcount update and log it to the CUD. Note that the
- * transaction is marked dirty regardless of whether the refcount
- * update succeeds or fails to support the CUI/CUD lifecycle rules.
- */
-static int
-xfs_trans_log_finish_refcount_update(
- struct xfs_trans *tp,
- struct xfs_cud_log_item *cudp,
- struct xfs_refcount_intent *ri,
- struct xfs_btree_cur **pcur)
-{
- int error;
-
- error = xfs_refcount_finish_one(tp, ri, pcur);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the CUI and frees the CUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
- set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
-
- return error;
-}
-
/* Sort refcount intents by AG. */
static int
xfs_refcount_update_diff_items(
@@ -318,9 +272,6 @@ xfs_refcount_update_log_item(
uint next_extent;
struct xfs_phys_extent *pmap;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
-
/*
* atomic_inc_return gives us the value after the increment;
* we want to use it as an array index so we need to subtract 1 from
@@ -347,7 +298,6 @@ xfs_refcount_update_create_intent(
ASSERT(count > 0);
- xfs_trans_add_item(tp, &cuip->cui_item);
if (sort)
list_sort(mp, items, xfs_refcount_update_diff_items);
list_for_each_entry(ri, items, ri_list)
@@ -362,7 +312,16 @@ xfs_refcount_update_create_done(
struct xfs_log_item *intent,
unsigned int count)
{
- return &xfs_trans_get_cud(tp, CUI_ITEM(intent))->cud_item;
+ struct xfs_cui_log_item *cuip = CUI_ITEM(intent);
+ struct xfs_cud_log_item *cudp;
+
+ cudp = kmem_cache_zalloc(xfs_cud_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &cudp->cud_item, XFS_LI_CUD,
+ &xfs_cud_item_ops);
+ cudp->cud_cuip = cuip;
+ cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
+
+ return &cudp->cud_item;
}
/* Take a passive ref to the AG containing the space we're refcounting. */
@@ -397,10 +356,9 @@ xfs_refcount_update_finish_item(
int error;
ri = container_of(item, struct xfs_refcount_intent, ri_list);
- error = xfs_trans_log_finish_refcount_update(tp, CUD_ITEM(done), ri,
- state);
/* Did we run out of reservation? Requeue what we didn't finish. */
+ error = xfs_refcount_finish_one(tp, ri, state);
if (!error && ri->ri_blockcount > 0) {
ASSERT(ri->ri_type == XFS_REFCOUNT_INCREASE ||
ri->ri_type == XFS_REFCOUNT_DECREASE);
@@ -433,16 +391,6 @@ xfs_refcount_update_cancel_item(
kmem_cache_free(xfs_refcount_intent_cache, ri);
}
-const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
- .max_items = XFS_CUI_MAX_FAST_EXTENTS,
- .create_intent = xfs_refcount_update_create_intent,
- .abort_intent = xfs_refcount_update_abort_intent,
- .create_done = xfs_refcount_update_create_done,
- .finish_item = xfs_refcount_update_finish_item,
- .finish_cleanup = xfs_refcount_finish_one_cleanup,
- .cancel_item = xfs_refcount_update_cancel_item,
-};
-
/* Is this recovered CUI ok? */
static inline bool
xfs_cui_validate_phys(
@@ -468,23 +416,38 @@ xfs_cui_validate_phys(
return xfs_verify_fsbext(mp, pmap->pe_startblock, pmap->pe_len);
}
+static inline void
+xfs_cui_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ struct xfs_phys_extent *pmap)
+{
+ struct xfs_refcount_intent *ri;
+
+ ri = kmem_cache_alloc(xfs_refcount_intent_cache,
+ GFP_NOFS | __GFP_NOFAIL);
+ ri->ri_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
+ ri->ri_startblock = pmap->pe_startblock;
+ ri->ri_blockcount = pmap->pe_len;
+ xfs_refcount_update_get_group(mp, ri);
+
+ xfs_defer_add_item(dfp, &ri->ri_list);
+}
+
/*
* Process a refcount update intent item that was recovered from the log.
* We need to update the refcountbt.
*/
STATIC int
-xfs_cui_item_recover(
- struct xfs_log_item *lip,
+xfs_refcount_recover_work(
+ struct xfs_defer_pending *dfp,
struct list_head *capture_list)
{
struct xfs_trans_res resv;
+ struct xfs_log_item *lip = dfp->dfp_intent;
struct xfs_cui_log_item *cuip = CUI_ITEM(lip);
- struct xfs_cud_log_item *cudp;
struct xfs_trans *tp;
- struct xfs_btree_cur *rcur = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
- unsigned int refc_type;
- bool requeue_only = false;
int i;
int error = 0;
@@ -501,6 +464,8 @@ xfs_cui_item_recover(
sizeof(cuip->cui_format));
return -EFSCORRUPTED;
}
+
+ xfs_cui_recover_work(mp, dfp, &cuip->cui_format.cui_extents[i]);
}
/*
@@ -521,100 +486,28 @@ xfs_cui_item_recover(
if (error)
return error;
- cudp = xfs_trans_get_cud(tp, cuip);
-
- for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
- struct xfs_refcount_intent fake = { };
- struct xfs_phys_extent *pmap;
-
- pmap = &cuip->cui_format.cui_extents[i];
- refc_type = pmap->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
- switch (refc_type) {
- case XFS_REFCOUNT_INCREASE:
- case XFS_REFCOUNT_DECREASE:
- case XFS_REFCOUNT_ALLOC_COW:
- case XFS_REFCOUNT_FREE_COW:
- fake.ri_type = refc_type;
- break;
- default:
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- &cuip->cui_format,
- sizeof(cuip->cui_format));
- error = -EFSCORRUPTED;
- goto abort_error;
- }
-
- fake.ri_startblock = pmap->pe_startblock;
- fake.ri_blockcount = pmap->pe_len;
-
- if (!requeue_only) {
- xfs_refcount_update_get_group(mp, &fake);
- error = xfs_trans_log_finish_refcount_update(tp, cudp,
- &fake, &rcur);
- xfs_refcount_update_put_group(&fake);
- }
- if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- &cuip->cui_format,
- sizeof(cuip->cui_format));
- if (error)
- goto abort_error;
-
- /* Requeue what we didn't finish. */
- if (fake.ri_blockcount > 0) {
- struct xfs_bmbt_irec irec = {
- .br_startblock = fake.ri_startblock,
- .br_blockcount = fake.ri_blockcount,
- };
-
- switch (fake.ri_type) {
- case XFS_REFCOUNT_INCREASE:
- xfs_refcount_increase_extent(tp, &irec);
- break;
- case XFS_REFCOUNT_DECREASE:
- xfs_refcount_decrease_extent(tp, &irec);
- break;
- case XFS_REFCOUNT_ALLOC_COW:
- xfs_refcount_alloc_cow_extent(tp,
- irec.br_startblock,
- irec.br_blockcount);
- break;
- case XFS_REFCOUNT_FREE_COW:
- xfs_refcount_free_cow_extent(tp,
- irec.br_startblock,
- irec.br_blockcount);
- break;
- default:
- ASSERT(0);
- }
- requeue_only = true;
- }
- }
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &cuip->cui_format,
+ sizeof(cuip->cui_format));
+ if (error)
+ goto abort_error;
- xfs_refcount_finish_one_cleanup(tp, rcur, error);
return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
- xfs_refcount_finish_one_cleanup(tp, rcur, error);
xfs_trans_cancel(tp);
return error;
}
-STATIC bool
-xfs_cui_item_match(
- struct xfs_log_item *lip,
- uint64_t intent_id)
-{
- return CUI_ITEM(lip)->cui_format.cui_id == intent_id;
-}
-
/* Relog an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_cui_item_relog(
+xfs_refcount_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_cud_log_item *cudp;
struct xfs_cui_log_item *cuip;
struct xfs_phys_extent *pmap;
unsigned int count;
@@ -622,27 +515,41 @@ xfs_cui_item_relog(
count = CUI_ITEM(intent)->cui_format.cui_nextents;
pmap = CUI_ITEM(intent)->cui_format.cui_extents;
- tp->t_flags |= XFS_TRANS_DIRTY;
- cudp = xfs_trans_get_cud(tp, CUI_ITEM(intent));
- set_bit(XFS_LI_DIRTY, &cudp->cud_item.li_flags);
-
cuip = xfs_cui_init(tp->t_mountp, count);
memcpy(cuip->cui_format.cui_extents, pmap, count * sizeof(*pmap));
atomic_set(&cuip->cui_next_extent, count);
- xfs_trans_add_item(tp, &cuip->cui_item);
- set_bit(XFS_LI_DIRTY, &cuip->cui_item.li_flags);
+
return &cuip->cui_item;
}
+const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
+ .name = "refcount",
+ .max_items = XFS_CUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_refcount_update_create_intent,
+ .abort_intent = xfs_refcount_update_abort_intent,
+ .create_done = xfs_refcount_update_create_done,
+ .finish_item = xfs_refcount_update_finish_item,
+ .finish_cleanup = xfs_refcount_finish_one_cleanup,
+ .cancel_item = xfs_refcount_update_cancel_item,
+ .recover_work = xfs_refcount_recover_work,
+ .relog_intent = xfs_refcount_relog_intent,
+};
+
+STATIC bool
+xfs_cui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return CUI_ITEM(lip)->cui_format.cui_id == intent_id;
+}
+
static const struct xfs_item_ops xfs_cui_item_ops = {
.flags = XFS_ITEM_INTENT,
.iop_size = xfs_cui_item_size,
.iop_format = xfs_cui_item_format,
.iop_unpin = xfs_cui_item_unpin,
.iop_release = xfs_cui_item_release,
- .iop_recover = xfs_cui_item_recover,
.iop_match = xfs_cui_item_match,
- .iop_relog = xfs_cui_item_relog,
};
static inline void
@@ -696,12 +603,9 @@ xlog_recover_cui_commit_pass2(
cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
xfs_cui_copy_format(&cuip->cui_format, cui_formatp);
atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
- /*
- * Insert the intent into the AIL directly and drop one reference so
- * that finishing or canceling the work will drop the other.
- */
- xfs_trans_ail_insert(log->l_ailp, &cuip->cui_item, lsn);
- xfs_cui_release(cuip);
+
+ xlog_recover_intent_item(log, &cuip->cui_item, lsn,
+ &xfs_refcount_update_defer_type);
return 0;
}
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index e5b62dc28466..d5ca8bcae65b 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -618,7 +618,7 @@ xfs_reflink_cancel_cow_blocks(
error = xfs_free_extent_later(*tpp, del.br_startblock,
del.br_blockcount, NULL,
- XFS_AG_RESV_NONE);
+ XFS_AG_RESV_NONE, false);
if (error)
break;
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 0e0e747028da..79ad0087aeca 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -225,23 +225,6 @@ static const struct xfs_item_ops xfs_rud_item_ops = {
.iop_intent = xfs_rud_item_intent,
};
-static struct xfs_rud_log_item *
-xfs_trans_get_rud(
- struct xfs_trans *tp,
- struct xfs_rui_log_item *ruip)
-{
- struct xfs_rud_log_item *rudp;
-
- rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL);
- xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
- &xfs_rud_item_ops);
- rudp->rud_ruip = ruip;
- rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
-
- xfs_trans_add_item(tp, &rudp->rud_item);
- return rudp;
-}
-
/* Set the map extent flags for this reverse mapping. */
static void
xfs_trans_set_rmap_flags(
@@ -285,35 +268,6 @@ xfs_trans_set_rmap_flags(
}
}
-/*
- * Finish an rmap update and log it to the RUD. Note that the transaction is
- * marked dirty regardless of whether the rmap update succeeds or fails to
- * support the RUI/RUD lifecycle rules.
- */
-static int
-xfs_trans_log_finish_rmap_update(
- struct xfs_trans *tp,
- struct xfs_rud_log_item *rudp,
- struct xfs_rmap_intent *ri,
- struct xfs_btree_cur **pcur)
-{
- int error;
-
- error = xfs_rmap_finish_one(tp, ri, pcur);
-
- /*
- * Mark the transaction dirty, even on error. This ensures the
- * transaction is aborted, which:
- *
- * 1.) releases the RUI and frees the RUD
- * 2.) shuts down the filesystem
- */
- tp->t_flags |= XFS_TRANS_DIRTY | XFS_TRANS_HAS_INTENT_DONE;
- set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
-
- return error;
-}
-
/* Sort rmap intents by AG. */
static int
xfs_rmap_update_diff_items(
@@ -340,9 +294,6 @@ xfs_rmap_update_log_item(
uint next_extent;
struct xfs_map_extent *map;
- tp->t_flags |= XFS_TRANS_DIRTY;
- set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
-
/*
* atomic_inc_return gives us the value after the increment;
* we want to use it as an array index so we need to subtract 1 from
@@ -372,7 +323,6 @@ xfs_rmap_update_create_intent(
ASSERT(count > 0);
- xfs_trans_add_item(tp, &ruip->rui_item);
if (sort)
list_sort(mp, items, xfs_rmap_update_diff_items);
list_for_each_entry(ri, items, ri_list)
@@ -387,7 +337,16 @@ xfs_rmap_update_create_done(
struct xfs_log_item *intent,
unsigned int count)
{
- return &xfs_trans_get_rud(tp, RUI_ITEM(intent))->rud_item;
+ struct xfs_rui_log_item *ruip = RUI_ITEM(intent);
+ struct xfs_rud_log_item *rudp;
+
+ rudp = kmem_cache_zalloc(xfs_rud_cache, GFP_KERNEL | __GFP_NOFAIL);
+ xfs_log_item_init(tp->t_mountp, &rudp->rud_item, XFS_LI_RUD,
+ &xfs_rud_item_ops);
+ rudp->rud_ruip = ruip;
+ rudp->rud_format.rud_rui_id = ruip->rui_format.rui_id;
+
+ return &rudp->rud_item;
}
/* Take a passive ref to the AG containing the space we're rmapping. */
@@ -423,8 +382,7 @@ xfs_rmap_update_finish_item(
ri = container_of(item, struct xfs_rmap_intent, ri_list);
- error = xfs_trans_log_finish_rmap_update(tp, RUD_ITEM(done), ri,
- state);
+ error = xfs_rmap_finish_one(tp, ri, state);
xfs_rmap_update_put_group(ri);
kmem_cache_free(xfs_rmap_intent_cache, ri);
@@ -452,16 +410,6 @@ xfs_rmap_update_cancel_item(
kmem_cache_free(xfs_rmap_intent_cache, ri);
}
-const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
- .max_items = XFS_RUI_MAX_FAST_EXTENTS,
- .create_intent = xfs_rmap_update_create_intent,
- .abort_intent = xfs_rmap_update_abort_intent,
- .create_done = xfs_rmap_update_create_done,
- .finish_item = xfs_rmap_update_finish_item,
- .finish_cleanup = xfs_rmap_finish_one_cleanup,
- .cancel_item = xfs_rmap_update_cancel_item,
-};
-
/* Is this recovered RUI ok? */
static inline bool
xfs_rui_validate_map(
@@ -498,20 +446,72 @@ xfs_rui_validate_map(
return xfs_verify_fsbext(mp, map->me_startblock, map->me_len);
}
+static inline void
+xfs_rui_recover_work(
+ struct xfs_mount *mp,
+ struct xfs_defer_pending *dfp,
+ const struct xfs_map_extent *map)
+{
+ struct xfs_rmap_intent *ri;
+
+ ri = kmem_cache_alloc(xfs_rmap_intent_cache, GFP_NOFS | __GFP_NOFAIL);
+
+ switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
+ case XFS_RMAP_EXTENT_MAP:
+ ri->ri_type = XFS_RMAP_MAP;
+ break;
+ case XFS_RMAP_EXTENT_MAP_SHARED:
+ ri->ri_type = XFS_RMAP_MAP_SHARED;
+ break;
+ case XFS_RMAP_EXTENT_UNMAP:
+ ri->ri_type = XFS_RMAP_UNMAP;
+ break;
+ case XFS_RMAP_EXTENT_UNMAP_SHARED:
+ ri->ri_type = XFS_RMAP_UNMAP_SHARED;
+ break;
+ case XFS_RMAP_EXTENT_CONVERT:
+ ri->ri_type = XFS_RMAP_CONVERT;
+ break;
+ case XFS_RMAP_EXTENT_CONVERT_SHARED:
+ ri->ri_type = XFS_RMAP_CONVERT_SHARED;
+ break;
+ case XFS_RMAP_EXTENT_ALLOC:
+ ri->ri_type = XFS_RMAP_ALLOC;
+ break;
+ case XFS_RMAP_EXTENT_FREE:
+ ri->ri_type = XFS_RMAP_FREE;
+ break;
+ default:
+ ASSERT(0);
+ return;
+ }
+
+ ri->ri_owner = map->me_owner;
+ ri->ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ?
+ XFS_ATTR_FORK : XFS_DATA_FORK;
+ ri->ri_bmap.br_startblock = map->me_startblock;
+ ri->ri_bmap.br_startoff = map->me_startoff;
+ ri->ri_bmap.br_blockcount = map->me_len;
+ ri->ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
+ XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+ xfs_rmap_update_get_group(mp, ri);
+
+ xfs_defer_add_item(dfp, &ri->ri_list);
+}
+
/*
* Process an rmap update intent item that was recovered from the log.
* We need to update the rmapbt.
*/
STATIC int
-xfs_rui_item_recover(
- struct xfs_log_item *lip,
+xfs_rmap_recover_work(
+ struct xfs_defer_pending *dfp,
struct list_head *capture_list)
{
struct xfs_trans_res resv;
+ struct xfs_log_item *lip = dfp->dfp_intent;
struct xfs_rui_log_item *ruip = RUI_ITEM(lip);
- struct xfs_rud_log_item *rudp;
struct xfs_trans *tp;
- struct xfs_btree_cur *rcur = NULL;
struct xfs_mount *mp = lip->li_log->l_mp;
int i;
int error = 0;
@@ -529,6 +529,8 @@ xfs_rui_item_recover(
sizeof(ruip->rui_format));
return -EFSCORRUPTED;
}
+
+ xfs_rui_recover_work(mp, dfp, &ruip->rui_format.rui_extents[i]);
}
resv = xlog_recover_resv(&M_RES(mp)->tr_itruncate);
@@ -536,91 +538,29 @@ xfs_rui_item_recover(
XFS_TRANS_RESERVE, &tp);
if (error)
return error;
- rudp = xfs_trans_get_rud(tp, ruip);
- for (i = 0; i < ruip->rui_format.rui_nextents; i++) {
- struct xfs_rmap_intent fake = { };
- struct xfs_map_extent *map;
-
- map = &ruip->rui_format.rui_extents[i];
- switch (map->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
- case XFS_RMAP_EXTENT_MAP:
- fake.ri_type = XFS_RMAP_MAP;
- break;
- case XFS_RMAP_EXTENT_MAP_SHARED:
- fake.ri_type = XFS_RMAP_MAP_SHARED;
- break;
- case XFS_RMAP_EXTENT_UNMAP:
- fake.ri_type = XFS_RMAP_UNMAP;
- break;
- case XFS_RMAP_EXTENT_UNMAP_SHARED:
- fake.ri_type = XFS_RMAP_UNMAP_SHARED;
- break;
- case XFS_RMAP_EXTENT_CONVERT:
- fake.ri_type = XFS_RMAP_CONVERT;
- break;
- case XFS_RMAP_EXTENT_CONVERT_SHARED:
- fake.ri_type = XFS_RMAP_CONVERT_SHARED;
- break;
- case XFS_RMAP_EXTENT_ALLOC:
- fake.ri_type = XFS_RMAP_ALLOC;
- break;
- case XFS_RMAP_EXTENT_FREE:
- fake.ri_type = XFS_RMAP_FREE;
- break;
- default:
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- &ruip->rui_format,
- sizeof(ruip->rui_format));
- error = -EFSCORRUPTED;
- goto abort_error;
- }
-
- fake.ri_owner = map->me_owner;
- fake.ri_whichfork = (map->me_flags & XFS_RMAP_EXTENT_ATTR_FORK) ?
- XFS_ATTR_FORK : XFS_DATA_FORK;
- fake.ri_bmap.br_startblock = map->me_startblock;
- fake.ri_bmap.br_startoff = map->me_startoff;
- fake.ri_bmap.br_blockcount = map->me_len;
- fake.ri_bmap.br_state = (map->me_flags & XFS_RMAP_EXTENT_UNWRITTEN) ?
- XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
-
- xfs_rmap_update_get_group(mp, &fake);
- error = xfs_trans_log_finish_rmap_update(tp, rudp, &fake,
- &rcur);
- if (error == -EFSCORRUPTED)
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
- map, sizeof(*map));
- xfs_rmap_update_put_group(&fake);
- if (error)
- goto abort_error;
-
- }
+ error = xlog_recover_finish_intent(tp, dfp);
+ if (error == -EFSCORRUPTED)
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+ &ruip->rui_format,
+ sizeof(ruip->rui_format));
+ if (error)
+ goto abort_error;
- xfs_rmap_finish_one_cleanup(tp, rcur, error);
return xfs_defer_ops_capture_and_commit(tp, capture_list);
abort_error:
- xfs_rmap_finish_one_cleanup(tp, rcur, error);
xfs_trans_cancel(tp);
return error;
}
-STATIC bool
-xfs_rui_item_match(
- struct xfs_log_item *lip,
- uint64_t intent_id)
-{
- return RUI_ITEM(lip)->rui_format.rui_id == intent_id;
-}
-
/* Relog an intent item to push the log tail forward. */
static struct xfs_log_item *
-xfs_rui_item_relog(
+xfs_rmap_relog_intent(
+ struct xfs_trans *tp,
struct xfs_log_item *intent,
- struct xfs_trans *tp)
+ struct xfs_log_item *done_item)
{
- struct xfs_rud_log_item *rudp;
struct xfs_rui_log_item *ruip;
struct xfs_map_extent *map;
unsigned int count;
@@ -628,27 +568,41 @@ xfs_rui_item_relog(
count = RUI_ITEM(intent)->rui_format.rui_nextents;
map = RUI_ITEM(intent)->rui_format.rui_extents;
- tp->t_flags |= XFS_TRANS_DIRTY;
- rudp = xfs_trans_get_rud(tp, RUI_ITEM(intent));
- set_bit(XFS_LI_DIRTY, &rudp->rud_item.li_flags);
-
ruip = xfs_rui_init(tp->t_mountp, count);
memcpy(ruip->rui_format.rui_extents, map, count * sizeof(*map));
atomic_set(&ruip->rui_next_extent, count);
- xfs_trans_add_item(tp, &ruip->rui_item);
- set_bit(XFS_LI_DIRTY, &ruip->rui_item.li_flags);
+
return &ruip->rui_item;
}
+const struct xfs_defer_op_type xfs_rmap_update_defer_type = {
+ .name = "rmap",
+ .max_items = XFS_RUI_MAX_FAST_EXTENTS,
+ .create_intent = xfs_rmap_update_create_intent,
+ .abort_intent = xfs_rmap_update_abort_intent,
+ .create_done = xfs_rmap_update_create_done,
+ .finish_item = xfs_rmap_update_finish_item,
+ .finish_cleanup = xfs_rmap_finish_one_cleanup,
+ .cancel_item = xfs_rmap_update_cancel_item,
+ .recover_work = xfs_rmap_recover_work,
+ .relog_intent = xfs_rmap_relog_intent,
+};
+
+STATIC bool
+xfs_rui_item_match(
+ struct xfs_log_item *lip,
+ uint64_t intent_id)
+{
+ return RUI_ITEM(lip)->rui_format.rui_id == intent_id;
+}
+
static const struct xfs_item_ops xfs_rui_item_ops = {
.flags = XFS_ITEM_INTENT,
.iop_size = xfs_rui_item_size,
.iop_format = xfs_rui_item_format,
.iop_unpin = xfs_rui_item_unpin,
.iop_release = xfs_rui_item_release,
- .iop_recover = xfs_rui_item_recover,
.iop_match = xfs_rui_item_match,
- .iop_relog = xfs_rui_item_relog,
};
static inline void
@@ -702,12 +656,9 @@ xlog_recover_rui_commit_pass2(
ruip = xfs_rui_init(mp, rui_formatp->rui_nextents);
xfs_rui_copy_format(&ruip->rui_format, rui_formatp);
atomic_set(&ruip->rui_next_extent, rui_formatp->rui_nextents);
- /*
- * Insert the intent into the AIL directly and drop one reference so
- * that finishing or canceling the work will drop the other.
- */
- xfs_trans_ail_insert(log->l_ailp, &ruip->rui_item, lsn);
- xfs_rui_release(ruip);
+
+ xlog_recover_intent_item(log, &ruip->rui_item, lsn,
+ &xfs_rmap_update_defer_type);
return 0;
}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 88c48de5c9c8..8649d981a097 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -14,28 +14,14 @@
#include "xfs_inode.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
+#include "xfs_bmap_util.h"
#include "xfs_trans.h"
#include "xfs_trans_space.h"
#include "xfs_icache.h"
#include "xfs_rtalloc.h"
#include "xfs_sb.h"
#include "xfs_rtbitmap.h"
-
-/*
- * Read and return the summary information for a given extent size,
- * bitmap block combination.
- * Keeps track of a current summary block, so we don't keep reading
- * it from the buffer cache.
- */
-static int
-xfs_rtget_summary(
- struct xfs_rtalloc_args *args,
- int log, /* log2 of extent size */
- xfs_fileoff_t bbno, /* bitmap block number */
- xfs_suminfo_t *sum) /* out: summary info for this block */
-{
- return xfs_rtmodify_summary_int(args, log, bbno, 0, sum);
-}
+#include "xfs_quota.h"
/*
* Return whether there are any free extents in the size range given
@@ -154,56 +140,55 @@ xfs_rtallocate_range(
* properly update the summary.
*/
error = xfs_rtfind_back(args, start, 0, &preblock);
- if (error) {
+ if (error)
return error;
- }
+
/*
* Find the next allocated block (end of free extent).
*/
error = xfs_rtfind_forw(args, end, mp->m_sb.sb_rextents - 1,
&postblock);
- if (error) {
+ if (error)
return error;
- }
+
/*
* Decrement the summary information corresponding to the entire
* (old) free extent.
*/
error = xfs_rtmodify_summary(args,
- XFS_RTBLOCKLOG(postblock + 1 - preblock),
+ xfs_highbit64(postblock + 1 - preblock),
xfs_rtx_to_rbmblock(mp, preblock), -1);
- if (error) {
+ if (error)
return error;
- }
+
/*
* If there are blocks not being allocated at the front of the
* old extent, add summary data for them to be free.
*/
if (preblock < start) {
error = xfs_rtmodify_summary(args,
- XFS_RTBLOCKLOG(start - preblock),
+ xfs_highbit64(start - preblock),
xfs_rtx_to_rbmblock(mp, preblock), 1);
- if (error) {
+ if (error)
return error;
- }
}
+
/*
* If there are blocks not being allocated at the end of the
* old extent, add summary data for them to be free.
*/
if (postblock > end) {
error = xfs_rtmodify_summary(args,
- XFS_RTBLOCKLOG(postblock - end),
+ xfs_highbit64(postblock - end),
xfs_rtx_to_rbmblock(mp, end + 1), 1);
- if (error) {
+ if (error)
return error;
- }
}
+
/*
* Modify the bitmap to mark this extent allocated.
*/
- error = xfs_rtmodify_range(args, start, len, 0);
- return error;
+ return xfs_rtmodify_range(args, start, len, 0);
}
/*
@@ -265,21 +250,17 @@ xfs_rtallocate_extent_block(
* If it's not so then next will contain the first non-free.
*/
error = xfs_rtcheck_range(args, i, maxlen, 1, &next, &stat);
- if (error) {
+ if (error)
return error;
- }
if (stat) {
/*
* i for maxlen is all free, allocate and return that.
*/
- error = xfs_rtallocate_range(args, i, maxlen);
- if (error) {
- return error;
- }
- *len = maxlen;
- *rtx = i;
- return 0;
+ bestlen = maxlen;
+ besti = i;
+ goto allocate;
}
+
/*
* In the case where we have a variable-sized allocation
* request, figure out how big this free piece is,
@@ -298,45 +279,44 @@ xfs_rtallocate_extent_block(
/*
* If not done yet, find the start of the next free space.
*/
- if (next < end) {
- error = xfs_rtfind_forw(args, next, end, &i);
- if (error) {
- return error;
- }
- } else
+ if (next >= end)
break;
+ error = xfs_rtfind_forw(args, next, end, &i);
+ if (error)
+ return error;
}
+
/*
* Searched the whole thing & didn't find a maxlen free extent.
*/
- if (minlen < maxlen && besti != -1) {
- xfs_rtxlen_t p; /* amount to trim length by */
-
+ if (minlen > maxlen || besti == -1) {
/*
- * If size should be a multiple of prod, make that so.
+ * Allocation failed. Set *nextp to the next block to try.
*/
- if (prod > 1) {
- div_u64_rem(bestlen, prod, &p);
- if (p)
- bestlen -= p;
- }
+ *nextp = next;
+ return -ENOSPC;
+ }
- /*
- * Allocate besti for bestlen & return that.
- */
- error = xfs_rtallocate_range(args, besti, bestlen);
- if (error) {
- return error;
- }
- *len = bestlen;
- *rtx = besti;
- return 0;
+ /*
+ * If size should be a multiple of prod, make that so.
+ */
+ if (prod > 1) {
+ xfs_rtxlen_t p; /* amount to trim length by */
+
+ div_u64_rem(bestlen, prod, &p);
+ if (p)
+ bestlen -= p;
}
+
/*
- * Allocation failed. Set *nextp to the next block to try.
+ * Allocate besti for bestlen & return that.
*/
- *nextp = next;
- *rtx = NULLRTEXTNO;
+allocate:
+ error = xfs_rtallocate_range(args, besti, bestlen);
+ if (error)
+ return error;
+ *len = bestlen;
+ *rtx = besti;
return 0;
}
@@ -367,52 +347,33 @@ xfs_rtallocate_extent_exact(
* Check if the range in question (for maxlen) is free.
*/
error = xfs_rtcheck_range(args, start, maxlen, 1, &next, &isfree);
- if (error) {
+ if (error)
return error;
- }
- if (isfree) {
+
+ if (!isfree) {
/*
- * If it is, allocate it and return success.
+ * If not, allocate what there is, if it's at least minlen.
*/
- error = xfs_rtallocate_range(args, start, maxlen);
- if (error) {
- return error;
- }
- *len = maxlen;
- *rtx = start;
- return 0;
- }
- /*
- * If not, allocate what there is, if it's at least minlen.
- */
- maxlen = next - start;
- if (maxlen < minlen) {
+ maxlen = next - start;
+ if (maxlen < minlen)
+ return -ENOSPC;
+
/*
- * Failed, return failure status.
+ * Trim off tail of extent, if prod is specified.
*/
- *rtx = NULLRTEXTNO;
- return 0;
- }
- /*
- * Trim off tail of extent, if prod is specified.
- */
- if (prod > 1 && (i = maxlen % prod)) {
- maxlen -= i;
- if (maxlen < minlen) {
- /*
- * Now we can't do it, return failure status.
- */
- *rtx = NULLRTEXTNO;
- return 0;
+ if (prod > 1 && (i = maxlen % prod)) {
+ maxlen -= i;
+ if (maxlen < minlen)
+ return -ENOSPC;
}
}
+
/*
* Allocate what we can and return it.
*/
error = xfs_rtallocate_range(args, start, maxlen);
- if (error) {
+ if (error)
return error;
- }
*len = maxlen;
*rtx = start;
return 0;
@@ -441,7 +402,6 @@ xfs_rtallocate_extent_near(
int j; /* secondary loop control */
int log2len; /* log2 of minlen */
xfs_rtxnum_t n; /* next rtext to try */
- xfs_rtxnum_t r; /* result rtext */
ASSERT(minlen % prod == 0);
ASSERT(maxlen % prod == 0);
@@ -455,26 +415,18 @@ xfs_rtallocate_extent_near(
/* Make sure we don't run off the end of the rt volume. */
maxlen = xfs_rtallocate_clamp_len(mp, start, maxlen, prod);
- if (maxlen < minlen) {
- *rtx = NULLRTEXTNO;
- return 0;
- }
+ if (maxlen < minlen)
+ return -ENOSPC;
/*
* Try the exact allocation first.
*/
error = xfs_rtallocate_extent_exact(args, start, minlen, maxlen, len,
- prod, &r);
- if (error) {
+ prod, rtx);
+ if (error != -ENOSPC)
return error;
- }
- /*
- * If the exact allocation worked, return that.
- */
- if (r != NULLRTEXTNO) {
- *rtx = r;
- return 0;
- }
+
+
bbno = xfs_rtx_to_rbmblock(mp, start);
i = 0;
j = -1;
@@ -490,9 +442,9 @@ xfs_rtallocate_extent_near(
*/
error = xfs_rtany_summary(args, log2len, mp->m_rsumlevels - 1,
bbno + i, &maxlog);
- if (error) {
+ if (error)
return error;
- }
+
/*
* If there are any useful extents starting here, try
* allocating one.
@@ -511,17 +463,9 @@ xfs_rtallocate_extent_near(
*/
error = xfs_rtallocate_extent_block(args,
bbno + i, minlen, maxavail, len,
- &n, prod, &r);
- if (error) {
+ &n, prod, rtx);
+ if (error != -ENOSPC)
return error;
- }
- /*
- * If it worked, return it.
- */
- if (r != NULLRTEXTNO) {
- *rtx = r;
- return 0;
- }
}
/*
* On the negative side of the starting location.
@@ -555,17 +499,9 @@ xfs_rtallocate_extent_near(
error = xfs_rtallocate_extent_block(args,
bbno + j, minlen,
maxavail, len, &n, prod,
- &r);
- if (error) {
+ rtx);
+ if (error != -ENOSPC)
return error;
- }
- /*
- * If it works, return the extent.
- */
- if (r != NULLRTEXTNO) {
- *rtx = r;
- return 0;
- }
}
}
}
@@ -599,8 +535,53 @@ xfs_rtallocate_extent_near(
else
break;
}
- *rtx = NULLRTEXTNO;
- return 0;
+ return -ENOSPC;
+}
+
+static int
+xfs_rtalloc_sumlevel(
+ struct xfs_rtalloc_args *args,
+ int l, /* level number */
+ xfs_rtxlen_t minlen, /* minimum length to allocate */
+ xfs_rtxlen_t maxlen, /* maximum length to allocate */
+ xfs_rtxlen_t prod, /* extent product factor */
+ xfs_rtxlen_t *len, /* out: actual length allocated */
+ xfs_rtxnum_t *rtx) /* out: start rtext allocated */
+{
+ xfs_fileoff_t i; /* bitmap block number */
+
+ for (i = 0; i < args->mp->m_sb.sb_rbmblocks; i++) {
+ xfs_suminfo_t sum; /* summary information for extents */
+ xfs_rtxnum_t n; /* next rtext to be tried */
+ int error;
+
+ error = xfs_rtget_summary(args, l, i, &sum);
+ if (error)
+ return error;
+
+ /*
+ * Nothing there, on to the next block.
+ */
+ if (!sum)
+ continue;
+
+ /*
+ * Try allocating the extent.
+ */
+ error = xfs_rtallocate_extent_block(args, i, minlen, maxlen,
+ len, &n, prod, rtx);
+ if (error != -ENOSPC)
+ return error;
+
+ /*
+ * If the "next block to try" returned from the allocator is
+ * beyond the next bitmap block, skip to that bitmap block.
+ */
+ if (xfs_rtx_to_rbmblock(args->mp, n) > i + 1)
+ i = xfs_rtx_to_rbmblock(args->mp, n) - 1;
+ }
+
+ return -ENOSPC;
}
/*
@@ -617,13 +598,8 @@ xfs_rtallocate_extent_size(
xfs_rtxlen_t prod, /* extent product factor */
xfs_rtxnum_t *rtx) /* out: start rtext allocated */
{
- struct xfs_mount *mp = args->mp;
int error;
- xfs_fileoff_t i; /* bitmap block number */
int l; /* level number (loop control) */
- xfs_rtxnum_t n; /* next rtext to be tried */
- xfs_rtxnum_t r; /* result rtext number */
- xfs_suminfo_t sum; /* summary information for extents */
ASSERT(minlen % prod == 0);
ASSERT(maxlen % prod == 0);
@@ -631,119 +607,46 @@ xfs_rtallocate_extent_size(
/*
* Loop over all the levels starting with maxlen.
- * At each level, look at all the bitmap blocks, to see if there
- * are extents starting there that are long enough (>= maxlen).
- * Note, only on the initial level can the allocation fail if
- * the summary says there's an extent.
+ *
+ * At each level, look at all the bitmap blocks, to see if there are
+ * extents starting there that are long enough (>= maxlen).
+ *
+ * Note, only on the initial level can the allocation fail if the
+ * summary says there's an extent.
*/
- for (l = xfs_highbit32(maxlen); l < mp->m_rsumlevels; l++) {
- /*
- * Loop over all the bitmap blocks.
- */
- for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
- /*
- * Get the summary for this level/block.
- */
- error = xfs_rtget_summary(args, l, i, &sum);
- if (error) {
- return error;
- }
- /*
- * Nothing there, on to the next block.
- */
- if (!sum)
- continue;
- /*
- * Try allocating the extent.
- */
- error = xfs_rtallocate_extent_block(args, i, maxlen,
- maxlen, len, &n, prod, &r);
- if (error) {
- return error;
- }
- /*
- * If it worked, return that.
- */
- if (r != NULLRTEXTNO) {
- *rtx = r;
- return 0;
- }
- /*
- * If the "next block to try" returned from the
- * allocator is beyond the next bitmap block,
- * skip to that bitmap block.
- */
- if (xfs_rtx_to_rbmblock(mp, n) > i + 1)
- i = xfs_rtx_to_rbmblock(mp, n) - 1;
- }
+ for (l = xfs_highbit32(maxlen); l < args->mp->m_rsumlevels; l++) {
+ error = xfs_rtalloc_sumlevel(args, l, minlen, maxlen, prod, len,
+ rtx);
+ if (error != -ENOSPC)
+ return error;
}
+
/*
- * Didn't find any maxlen blocks. Try smaller ones, unless
- * we're asking for a fixed size extent.
+ * Didn't find any maxlen blocks. Try smaller ones, unless we are
+ * looking for a fixed size extent.
*/
- if (minlen > --maxlen) {
- *rtx = NULLRTEXTNO;
- return 0;
- }
+ if (minlen > --maxlen)
+ return -ENOSPC;
ASSERT(minlen != 0);
ASSERT(maxlen != 0);
/*
* Loop over sizes, from maxlen down to minlen.
- * This time, when we do the allocations, allow smaller ones
- * to succeed.
+ *
+ * This time, when we do the allocations, allow smaller ones to succeed,
+ * but make sure the specified minlen/maxlen are in the possible range
+ * for this summary level.
*/
for (l = xfs_highbit32(maxlen); l >= xfs_highbit32(minlen); l--) {
- /*
- * Loop over all the bitmap blocks, try an allocation
- * starting in that block.
- */
- for (i = 0; i < mp->m_sb.sb_rbmblocks; i++) {
- /*
- * Get the summary information for this level/block.
- */
- error = xfs_rtget_summary(args, l, i, &sum);
- if (error) {
- return error;
- }
- /*
- * If nothing there, go on to next.
- */
- if (!sum)
- continue;
- /*
- * Try the allocation. Make sure the specified
- * minlen/maxlen are in the possible range for
- * this summary level.
- */
- error = xfs_rtallocate_extent_block(args, i,
- XFS_RTMAX(minlen, 1 << l),
- XFS_RTMIN(maxlen, (1 << (l + 1)) - 1),
- len, &n, prod, &r);
- if (error) {
- return error;
- }
- /*
- * If it worked, return that extent.
- */
- if (r != NULLRTEXTNO) {
- *rtx = r;
- return 0;
- }
- /*
- * If the "next block to try" returned from the
- * allocator is beyond the next bitmap block,
- * skip to that bitmap block.
- */
- if (xfs_rtx_to_rbmblock(mp, n) > i + 1)
- i = xfs_rtx_to_rbmblock(mp, n) - 1;
- }
+ error = xfs_rtalloc_sumlevel(args, l,
+ max_t(xfs_rtxlen_t, minlen, 1 << l),
+ min_t(xfs_rtxlen_t, maxlen, (1 << (l + 1)) - 1),
+ prod, len, rtx);
+ if (error != -ENOSPC)
+ return error;
}
- /*
- * Got nothing, return failure.
- */
- *rtx = NULLRTEXTNO;
- return 0;
+
+ return -ENOSPC;
}
/*
@@ -963,8 +866,10 @@ xfs_growfs_rt(
*/
nrextents = nrblocks;
do_div(nrextents, in->extsize);
+ if (!xfs_validate_rtextents(nrextents))
+ return -EINVAL;
nrbmblocks = xfs_rtbitmap_blockcount(mp, nrextents);
- nrextslog = xfs_highbit32(nrextents);
+ nrextslog = xfs_compute_rextslog(nrextents);
nrsumlevels = nrextslog + 1;
nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels, nrbmblocks);
nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
@@ -1031,11 +936,14 @@ xfs_growfs_rt(
nsbp->sb_rblocks = min(nrblocks, nrblocks_step);
nsbp->sb_rextents = xfs_rtb_to_rtx(nmp, nsbp->sb_rblocks);
ASSERT(nsbp->sb_rextents != 0);
- nsbp->sb_rextslog = xfs_highbit32(nsbp->sb_rextents);
+ nsbp->sb_rextslog = xfs_compute_rextslog(nsbp->sb_rextents);
nrsumlevels = nmp->m_rsumlevels = nsbp->sb_rextslog + 1;
nrsumblocks = xfs_rtsummary_blockcount(mp, nrsumlevels,
nsbp->sb_rbmblocks);
nmp->m_rsumsize = nrsumsize = XFS_FSB_TO_B(mp, nrsumblocks);
+ /* recompute growfsrt reservation from new rsumsize */
+ xfs_trans_resv_calc(nmp, &nmp->m_resv);
+
/*
* Start a transaction, get the log reservation.
*/
@@ -1122,6 +1030,8 @@ error_cancel:
*/
mp->m_rsumlevels = nrsumlevels;
mp->m_rsumsize = nrsumsize;
+ /* recompute growfsrt reservation from new rsumsize */
+ xfs_trans_resv_calc(mp, &mp->m_resv);
error = xfs_trans_commit(tp);
if (error)
@@ -1160,81 +1070,6 @@ out_free:
}
/*
- * Allocate an extent in the realtime subvolume, with the usual allocation
- * parameters. The length units are all in realtime extents, as is the
- * result block number.
- */
-int
-xfs_rtallocate_extent(
- struct xfs_trans *tp,
- xfs_rtxnum_t start, /* starting rtext number to allocate */
- xfs_rtxlen_t minlen, /* minimum length to allocate */
- xfs_rtxlen_t maxlen, /* maximum length to allocate */
- xfs_rtxlen_t *len, /* out: actual length allocated */
- int wasdel, /* was a delayed allocation extent */
- xfs_rtxlen_t prod, /* extent product factor */
- xfs_rtxnum_t *rtblock) /* out: start rtext allocated */
-{
- struct xfs_rtalloc_args args = {
- .mp = tp->t_mountp,
- .tp = tp,
- };
- int error; /* error value */
- xfs_rtxnum_t r; /* result allocated rtext */
-
- ASSERT(xfs_isilocked(args.mp->m_rbmip, XFS_ILOCK_EXCL));
- ASSERT(minlen > 0 && minlen <= maxlen);
-
- /*
- * If prod is set then figure out what to do to minlen and maxlen.
- */
- if (prod > 1) {
- xfs_rtxlen_t i;
-
- if ((i = maxlen % prod))
- maxlen -= i;
- if ((i = minlen % prod))
- minlen += prod - i;
- if (maxlen < minlen) {
- *rtblock = NULLRTEXTNO;
- return 0;
- }
- }
-
-retry:
- if (start == 0) {
- error = xfs_rtallocate_extent_size(&args, minlen,
- maxlen, len, prod, &r);
- } else {
- error = xfs_rtallocate_extent_near(&args, start, minlen,
- maxlen, len, prod, &r);
- }
-
- xfs_rtbuf_cache_relse(&args);
- if (error)
- return error;
-
- /*
- * If it worked, update the superblock.
- */
- if (r != NULLRTEXTNO) {
- long slen = (long)*len;
-
- ASSERT(*len >= minlen && *len <= maxlen);
- if (wasdel)
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FREXTENTS, -slen);
- else
- xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, -slen);
- } else if (prod > 1) {
- prod = 1;
- goto retry;
- }
-
- *rtblock = r;
- return 0;
-}
-
-/*
* Initialize realtime fields in the mount structure.
*/
int /* error */
@@ -1412,7 +1247,7 @@ xfs_rtunmount_inodes(
* of rtextents and the fraction.
* The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
*/
-int /* error */
+static int
xfs_rtpick_extent(
xfs_mount_t *mp, /* file system mount point */
xfs_trans_t *tp, /* transaction pointer */
@@ -1451,3 +1286,177 @@ xfs_rtpick_extent(
*pick = b;
return 0;
}
+
+static void
+xfs_rtalloc_align_minmax(
+ xfs_rtxlen_t *raminlen,
+ xfs_rtxlen_t *ramaxlen,
+ xfs_rtxlen_t *prod)
+{
+ xfs_rtxlen_t newmaxlen = *ramaxlen;
+ xfs_rtxlen_t newminlen = *raminlen;
+ xfs_rtxlen_t slack;
+
+ slack = newmaxlen % *prod;
+ if (slack)
+ newmaxlen -= slack;
+ slack = newminlen % *prod;
+ if (slack)
+ newminlen += *prod - slack;
+
+ /*
+ * If adjusting for extent size hint alignment produces an invalid
+ * min/max len combination, go ahead without it.
+ */
+ if (newmaxlen < newminlen) {
+ *prod = 1;
+ return;
+ }
+ *ramaxlen = newmaxlen;
+ *raminlen = newminlen;
+}
+
+int
+xfs_bmap_rtalloc(
+ struct xfs_bmalloca *ap)
+{
+ struct xfs_mount *mp = ap->ip->i_mount;
+ xfs_fileoff_t orig_offset = ap->offset;
+ xfs_rtxnum_t start; /* allocation hint rtextent no */
+ xfs_rtxnum_t rtx; /* actually allocated rtextent no */
+ xfs_rtxlen_t prod = 0; /* product factor for allocators */
+ xfs_extlen_t mod = 0; /* product factor for allocators */
+ xfs_rtxlen_t ralen = 0; /* realtime allocation length */
+ xfs_extlen_t align; /* minimum allocation alignment */
+ xfs_extlen_t orig_length = ap->length;
+ xfs_extlen_t minlen = mp->m_sb.sb_rextsize;
+ xfs_rtxlen_t raminlen;
+ bool rtlocked = false;
+ bool ignore_locality = false;
+ struct xfs_rtalloc_args args = {
+ .mp = mp,
+ .tp = ap->tp,
+ };
+ int error;
+
+ align = xfs_get_extsz_hint(ap->ip);
+retry:
+ error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
+ align, 1, ap->eof, 0,
+ ap->conv, &ap->offset, &ap->length);
+ if (error)
+ return error;
+ ASSERT(ap->length);
+ ASSERT(xfs_extlen_to_rtxmod(mp, ap->length) == 0);
+
+ /*
+ * If we shifted the file offset downward to satisfy an extent size
+ * hint, increase minlen by that amount so that the allocator won't
+ * give us an allocation that's too short to cover at least one of the
+ * blocks that the caller asked for.
+ */
+ if (ap->offset != orig_offset)
+ minlen += orig_offset - ap->offset;
+
+ /*
+ * Set ralen to be the actual requested length in rtextents.
+ *
+ * If the old value was close enough to XFS_BMBT_MAX_EXTLEN that
+ * we rounded up to it, cut it back so it's valid again.
+ * Note that if it's a really large request (bigger than
+ * XFS_BMBT_MAX_EXTLEN), we don't hear about that number, and can't
+ * adjust the starting point to match it.
+ */
+ ralen = xfs_extlen_to_rtxlen(mp, min(ap->length, XFS_MAX_BMBT_EXTLEN));
+ raminlen = max_t(xfs_rtxlen_t, 1, xfs_extlen_to_rtxlen(mp, minlen));
+ ASSERT(raminlen > 0);
+ ASSERT(raminlen <= ralen);
+
+ /*
+ * Lock out modifications to both the RT bitmap and summary inodes
+ */
+ if (!rtlocked) {
+ xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL|XFS_ILOCK_RTBITMAP);
+ xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
+ xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL|XFS_ILOCK_RTSUM);
+ xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
+ rtlocked = true;
+ }
+
+ if (ignore_locality) {
+ start = 0;
+ } else if (xfs_bmap_adjacent(ap)) {
+ start = xfs_rtb_to_rtx(mp, ap->blkno);
+ } else if (ap->eof && ap->offset == 0) {
+ /*
+ * If it's an allocation to an empty file at offset 0, pick an
+ * extent that will space things out in the rt area.
+ */
+ error = xfs_rtpick_extent(mp, ap->tp, ralen, &start);
+ if (error)
+ return error;
+ } else {
+ start = 0;
+ }
+
+ /*
+ * Only bother calculating a real prod factor if offset & length are
+ * perfectly aligned, otherwise it will just get us in trouble.
+ */
+ div_u64_rem(ap->offset, align, &mod);
+ if (mod || ap->length % align) {
+ prod = 1;
+ } else {
+ prod = xfs_extlen_to_rtxlen(mp, align);
+ if (prod > 1)
+ xfs_rtalloc_align_minmax(&raminlen, &ralen, &prod);
+ }
+
+ if (start) {
+ error = xfs_rtallocate_extent_near(&args, start, raminlen,
+ ralen, &ralen, prod, &rtx);
+ } else {
+ error = xfs_rtallocate_extent_size(&args, raminlen,
+ ralen, &ralen, prod, &rtx);
+ }
+ xfs_rtbuf_cache_relse(&args);
+
+ if (error == -ENOSPC) {
+ if (align > mp->m_sb.sb_rextsize) {
+ /*
+ * We previously enlarged the request length to try to
+ * satisfy an extent size hint. The allocator didn't
+ * return anything, so reset the parameters to the
+ * original values and try again without alignment
+ * criteria.
+ */
+ ap->offset = orig_offset;
+ ap->length = orig_length;
+ minlen = align = mp->m_sb.sb_rextsize;
+ goto retry;
+ }
+
+ if (!ignore_locality && start != 0) {
+ /*
+ * If we can't allocate near a specific rt extent, try
+ * again without locality criteria.
+ */
+ ignore_locality = true;
+ goto retry;
+ }
+
+ ap->blkno = NULLFSBLOCK;
+ ap->length = 0;
+ return 0;
+ }
+ if (error)
+ return error;
+
+ xfs_trans_mod_sb(ap->tp, ap->wasdel ?
+ XFS_TRANS_SB_RES_FREXTENTS : XFS_TRANS_SB_FREXTENTS,
+ -(long)ralen);
+ ap->blkno = xfs_rtx_to_rtb(mp, rtx);
+ ap->length = xfs_rtxlen_to_extlen(mp, ralen);
+ xfs_bmap_alloc_account(ap);
+ return 0;
+}
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index f7cb9ffe51ca..a6836da9bebe 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -13,27 +13,6 @@ struct xfs_trans;
#ifdef CONFIG_XFS_RT
/*
- * Function prototypes for exported functions.
- */
-
-/*
- * Allocate an extent in the realtime subvolume, with the usual allocation
- * parameters. The length units are all in realtime extents, as is the
- * result block number.
- */
-int /* error */
-xfs_rtallocate_extent(
- struct xfs_trans *tp, /* transaction pointer */
- xfs_rtxnum_t start, /* starting rtext number to allocate */
- xfs_rtxlen_t minlen, /* minimum length to allocate */
- xfs_rtxlen_t maxlen, /* maximum length to allocate */
- xfs_rtxlen_t *len, /* out: actual length allocated */
- int wasdel, /* was a delayed allocation extent */
- xfs_rtxlen_t prod, /* extent product factor */
- xfs_rtxnum_t *rtblock); /* out: start rtext allocated */
-
-
-/*
* Initialize realtime fields in the mount structure.
*/
int /* error */
@@ -52,20 +31,6 @@ xfs_rtmount_inodes(
struct xfs_mount *mp); /* file system mount structure */
/*
- * Pick an extent for allocation at the start of a new realtime file.
- * Use the sequence number stored in the atime field of the bitmap inode.
- * Translate this to a fraction of the rtextents, and return the product
- * of rtextents and the fraction.
- * The fraction sequence is 0, 1/2, 1/4, 3/4, 1/8, ..., 7/8, 1/16, ...
- */
-int /* error */
-xfs_rtpick_extent(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- xfs_rtxlen_t len, /* allocation length (rtextents) */
- xfs_rtxnum_t *pick); /* result rt extent */
-
-/*
* Grow the realtime area of the filesystem.
*/
int
@@ -75,8 +40,6 @@ xfs_growfs_rt(
int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp);
#else
-# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (-ENOSYS)
-# define xfs_rtpick_extent(m,t,l,rb) (-ENOSYS)
# define xfs_growfs_rt(mp,in) (-ENOSYS)
# define xfs_rtalloc_reinit_frextents(m) (0)
static inline int /* error */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 764304595e8b..98401de832ee 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -350,7 +350,6 @@ xfs_setup_dax_always(
return -EINVAL;
}
- xfs_warn(mp, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
return 0;
disable_dax:
@@ -366,8 +365,9 @@ xfs_blkdev_get(
{
int error = 0;
- *handlep = bdev_open_by_path(name, BLK_OPEN_READ | BLK_OPEN_WRITE,
- mp->m_super, &fs_holder_ops);
+ *handlep = bdev_open_by_path(name,
+ BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES,
+ mp->m_super, &fs_holder_ops);
if (IS_ERR(*handlep)) {
error = PTR_ERR(*handlep);
*handlep = NULL;
@@ -439,18 +439,12 @@ xfs_open_devices(
int error;
/*
- * blkdev_put() can't be called under s_umount, see the comment
- * in get_tree_bdev() for more details
- */
- up_write(&sb->s_umount);
-
- /*
* Open real time and log devices - order is important.
*/
if (mp->m_logname) {
error = xfs_blkdev_get(mp, mp->m_logname, &logdev_handle);
if (error)
- goto out_relock;
+ return error;
}
if (mp->m_rtname) {
@@ -493,10 +487,7 @@ xfs_open_devices(
bdev_release(logdev_handle);
}
- error = 0;
-out_relock:
- down_write(&sb->s_umount);
- return error;
+ return 0;
out_free_rtdev_targ:
if (mp->m_rtdev_targp)
@@ -509,7 +500,7 @@ out_relock:
out_close_logdev:
if (logdev_handle)
bdev_release(logdev_handle);
- goto out_relock;
+ return error;
}
/*
@@ -759,10 +750,6 @@ static void
xfs_mount_free(
struct xfs_mount *mp)
{
- /*
- * Free the buftargs here because blkdev_put needs to be called outside
- * of sb->s_umount, which is held around the call to ->put_super.
- */
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
xfs_free_buftarg(mp->m_logdev_targp);
if (mp->m_rtdev_targp)
@@ -906,10 +893,8 @@ xfs_fs_statfs(
STATIC void
xfs_save_resvblks(struct xfs_mount *mp)
{
- uint64_t resblks = 0;
-
mp->m_resblks_save = mp->m_resblks;
- xfs_reserve_blocks(mp, &resblks, NULL);
+ xfs_reserve_blocks(mp, 0);
}
STATIC void
@@ -923,7 +908,7 @@ xfs_restore_resvblks(struct xfs_mount *mp)
} else
resblks = xfs_default_resblks(mp);
- xfs_reserve_blocks(mp, &resblks, NULL);
+ xfs_reserve_blocks(mp, resblks);
}
/*
@@ -1510,6 +1495,18 @@ xfs_fs_fill_super(
mp->m_super = sb;
+ /*
+ * Copy VFS mount flags from the context now that all parameter parsing
+ * is guaranteed to have been completed by either the old mount API or
+ * the newer fsopen/fsconfig API.
+ */
+ if (fc->sb_flags & SB_RDONLY)
+ set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
+ if (fc->sb_flags & SB_DIRSYNC)
+ mp->m_features |= XFS_FEAT_DIRSYNC;
+ if (fc->sb_flags & SB_SYNCHRONOUS)
+ mp->m_features |= XFS_FEAT_WSYNC;
+
error = xfs_fs_validate_params(mp);
if (error)
return error;
@@ -1979,6 +1976,11 @@ static const struct fs_context_operations xfs_context_ops = {
.free = xfs_fs_free,
};
+/*
+ * WARNING: do not initialise any parameters in this function that depend on
+ * mount option parsing having already been performed as this can be called from
+ * fsopen() before any parameters have been set.
+ */
static int xfs_init_fs_context(
struct fs_context *fc)
{
@@ -2010,16 +2012,6 @@ static int xfs_init_fs_context(
mp->m_logbsize = -1;
mp->m_allocsize_log = 16; /* 64k */
- /*
- * Copy binary VFS mount flags we are interested in.
- */
- if (fc->sb_flags & SB_RDONLY)
- set_bit(XFS_OPSTATE_READONLY, &mp->m_opstate);
- if (fc->sb_flags & SB_DIRSYNC)
- mp->m_features |= XFS_FEAT_DIRSYNC;
- if (fc->sb_flags & SB_SYNCHRONOUS)
- mp->m_features |= XFS_FEAT_WSYNC;
-
fc->s_fs_info = mp;
fc->ops = &xfs_context_ops;
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 85e433df6a3f..92974a4414c8 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -23,6 +23,7 @@
#include "xfs_trans.h"
#include "xfs_ialloc.h"
#include "xfs_error.h"
+#include "xfs_health.h"
/* ----- Kernel only functions below ----- */
int
@@ -108,6 +109,8 @@ xfs_readlink(
if (xfs_is_shutdown(mp))
return -EIO;
+ if (xfs_ifork_zapped(ip, XFS_DATA_FORK))
+ return -EIO;
xfs_ilock(ip, XFS_ILOCK_SHARED);
@@ -128,10 +131,10 @@ xfs_readlink(
* The VFS crashes on a NULL pointer, so return -EFSCORRUPTED
* if if_data is junk.
*/
- if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_u1.if_data))
+ if (XFS_IS_CORRUPT(ip->i_mount, !ip->i_df.if_data))
goto out;
- memcpy(link, ip->i_df.if_u1.if_data, pathlen + 1);
+ memcpy(link, ip->i_df.if_data, pathlen + 1);
error = 0;
} else {
error = xfs_readlink_bmap_ilocked(ip, link);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index fade33735393..a191f6560f98 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -206,8 +206,6 @@ static struct ctl_table xfs_table[] = {
.extra2 = &xfs_params.stats_clear.max
},
#endif /* CONFIG_PROC_FS */
-
- {}
};
int
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index f78ad6b10ea5..276696a07040 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -85,6 +85,8 @@ struct xfs_globals {
int pwork_threads; /* parallel workqueue threads */
bool larp; /* log attribute replay */
#endif
+ int bload_leaf_slack; /* btree bulk load leaf slack */
+ int bload_node_slack; /* btree bulk load node slack */
int log_recovery_delay; /* log recovery delay (secs) */
int mount_delay; /* mount setup delay (secs) */
bool bug_on_assert; /* BUG() the kernel on assert failure */
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index a3c6b1548723..17485666b672 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -229,6 +229,15 @@ pwork_threads_show(
}
XFS_SYSFS_ATTR_RW(pwork_threads);
+/*
+ * The "LARP" (Logged extended Attribute Recovery Persistence) debugging knob
+ * sets the XFS_DA_OP_LOGGED flag on all xfs_attr_set operations performed on
+ * V5 filesystems. As a result, the intermediate progress of all setxattr and
+ * removexattr operations are tracked via the log and can be restarted during
+ * recovery. This is useful for testing xattr recovery prior to merging of the
+ * parent pointer feature which requires it to maintain consistency, and may be
+ * enabled for userspace xattrs in the future.
+ */
static ssize_t
larp_store(
struct kobject *kobject,
@@ -253,6 +262,58 @@ larp_show(
XFS_SYSFS_ATTR_RW(larp);
#endif /* DEBUG */
+STATIC ssize_t
+bload_leaf_slack_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ xfs_globals.bload_leaf_slack = val;
+ return count;
+}
+
+STATIC ssize_t
+bload_leaf_slack_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_leaf_slack);
+}
+XFS_SYSFS_ATTR_RW(bload_leaf_slack);
+
+STATIC ssize_t
+bload_node_slack_store(
+ struct kobject *kobject,
+ const char *buf,
+ size_t count)
+{
+ int ret;
+ int val;
+
+ ret = kstrtoint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ xfs_globals.bload_node_slack = val;
+ return count;
+}
+
+STATIC ssize_t
+bload_node_slack_show(
+ struct kobject *kobject,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.bload_node_slack);
+}
+XFS_SYSFS_ATTR_RW(bload_node_slack);
+
static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(bug_on_assert),
ATTR_LIST(log_recovery_delay),
@@ -262,6 +323,8 @@ static struct attribute *xfs_dbg_attrs[] = {
ATTR_LIST(pwork_threads),
ATTR_LIST(larp),
#endif
+ ATTR_LIST(bload_leaf_slack),
+ ATTR_LIST(bload_node_slack),
NULL,
};
ATTRIBUTE_GROUPS(xfs_dbg);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 3926cf7f2a6e..0984a1c884c7 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -67,6 +67,7 @@ struct xfs_buf_log_format;
struct xfs_inode_log_format;
struct xfs_bmbt_irec;
struct xfs_btree_cur;
+struct xfs_defer_op_type;
struct xfs_refcount_irec;
struct xfs_fsmap;
struct xfs_rmap_irec;
@@ -145,21 +146,23 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
TRACE_EVENT(xlog_intent_recovery_failed,
- TP_PROTO(struct xfs_mount *mp, int error, void *function),
- TP_ARGS(mp, error, function),
+ TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops,
+ int error),
+ TP_ARGS(mp, ops, error),
TP_STRUCT__entry(
__field(dev_t, dev)
+ __string(name, ops->name)
__field(int, error)
- __field(void *, function)
),
TP_fast_assign(
__entry->dev = mp->m_super->s_dev;
+ __assign_str(name, ops->name);
__entry->error = error;
- __entry->function = function;
),
- TP_printk("dev %d:%d error %d function %pS",
+ TP_printk("dev %d:%d optype %s error %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->error, __entry->function)
+ __get_str(name),
+ __entry->error)
);
DECLARE_EVENT_CLASS(xfs_perag_class,
@@ -2549,22 +2552,25 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class,
TP_ARGS(mp, dfp),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(int, type)
+ __string(name, dfp->dfp_ops->name)
__field(void *, intent)
+ __field(unsigned int, flags)
__field(char, committed)
__field(int, nr)
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
- __entry->type = dfp->dfp_type;
+ __assign_str(name, dfp->dfp_ops->name);
__entry->intent = dfp->dfp_intent;
+ __entry->flags = dfp->dfp_flags;
__entry->committed = dfp->dfp_done != NULL;
__entry->nr = dfp->dfp_count;
),
- TP_printk("dev %d:%d optype %d intent %p committed %d nr %d",
+ TP_printk("dev %d:%d optype %s intent %p flags %s committed %d nr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __get_str(name),
__entry->intent,
+ __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS),
__entry->committed,
__entry->nr)
)
@@ -2675,6 +2681,9 @@ DEFINE_DEFER_PENDING_EVENT(xfs_defer_cancel_list);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
DEFINE_DEFER_PENDING_EVENT(xfs_defer_relog_intent);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_isolate_paused);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_pause);
+DEFINE_DEFER_PENDING_EVENT(xfs_defer_item_unpause);
#define DEFINE_BMAP_FREE_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
DEFINE_BMAP_FREE_DEFERRED_EVENT(xfs_bmap_free_defer);
@@ -2688,25 +2697,28 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_item_class,
TP_ARGS(mp, dfp, item),
TP_STRUCT__entry(
__field(dev_t, dev)
- __field(int, type)
+ __string(name, dfp->dfp_ops->name)
__field(void *, intent)
__field(void *, item)
__field(char, committed)
+ __field(unsigned int, flags)
__field(int, nr)
),
TP_fast_assign(
__entry->dev = mp ? mp->m_super->s_dev : 0;
- __entry->type = dfp->dfp_type;
+ __assign_str(name, dfp->dfp_ops->name);
__entry->intent = dfp->dfp_intent;
__entry->item = item;
__entry->committed = dfp->dfp_done != NULL;
+ __entry->flags = dfp->dfp_flags;
__entry->nr = dfp->dfp_count;
),
- TP_printk("dev %d:%d optype %d intent %p item %p committed %d nr %d",
+ TP_printk("dev %d:%d optype %s intent %p item %p flags %s committed %d nr %d",
MAJOR(__entry->dev), MINOR(__entry->dev),
- __entry->type,
+ __get_str(name),
__entry->intent,
__entry->item,
+ __print_flags(__entry->flags, "|", XFS_DEFER_PENDING_STRINGS),
__entry->committed,
__entry->nr)
)
@@ -4399,8 +4411,6 @@ DEFINE_DAS_STATE_EVENT(xfs_attr_remove_iter_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_alloc);
DEFINE_DAS_STATE_EVENT(xfs_attr_rmtval_remove_return);
DEFINE_DAS_STATE_EVENT(xfs_attr_defer_add);
-DEFINE_DAS_STATE_EVENT(xfs_attr_defer_replace);
-DEFINE_DAS_STATE_EVENT(xfs_attr_defer_remove);
TRACE_EVENT(xfs_force_shutdown,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 305c9d07bf1b..12d45e93f07d 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -1237,6 +1237,68 @@ out_cancel:
}
/*
+ * Try to reserve more blocks for a transaction.
+ *
+ * This is for callers that need to attach resources to a transaction, scan
+ * those resources to determine the space reservation requirements, and then
+ * modify the attached resources. In other words, online repair. This can
+ * fail due to ENOSPC, so the caller must be able to cancel the transaction
+ * without shutting down the fs.
+ */
+int
+xfs_trans_reserve_more(
+ struct xfs_trans *tp,
+ unsigned int blocks,
+ unsigned int rtextents)
+{
+ struct xfs_trans_res resv = { };
+
+ return xfs_trans_reserve(tp, &resv, blocks, rtextents);
+}
+
+/*
+ * Try to reserve more blocks and file quota for a transaction. Same
+ * conditions of usage as xfs_trans_reserve_more.
+ */
+int
+xfs_trans_reserve_more_inode(
+ struct xfs_trans *tp,
+ struct xfs_inode *ip,
+ unsigned int dblocks,
+ unsigned int rblocks,
+ bool force_quota)
+{
+ struct xfs_trans_res resv = { };
+ struct xfs_mount *mp = ip->i_mount;
+ unsigned int rtx = xfs_extlen_to_rtxlen(mp, rblocks);
+ int error;
+
+ ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+
+ error = xfs_trans_reserve(tp, &resv, dblocks, rtx);
+ if (error)
+ return error;
+
+ if (!XFS_IS_QUOTA_ON(mp) || xfs_is_quota_inode(&mp->m_sb, ip->i_ino))
+ return 0;
+
+ if (tp->t_flags & XFS_TRANS_RESERVE)
+ force_quota = true;
+
+ error = xfs_trans_reserve_quota_nblks(tp, ip, dblocks, rblocks,
+ force_quota);
+ if (!error)
+ return 0;
+
+ /* Quota failed, give back the new reservation. */
+ xfs_mod_fdblocks(mp, dblocks, tp->t_flags & XFS_TRANS_RESERVE);
+ tp->t_blk_res -= dblocks;
+ xfs_mod_frextents(mp, rtx);
+ tp->t_rtx_res -= rtx;
+ return error;
+}
+
+/*
* Allocate an transaction in preparation for inode creation by reserving quota
* against the given dquots. Callers are not required to hold any inode locks.
*/
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 6e3646d524ce..08ce757c7454 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -78,11 +78,7 @@ struct xfs_item_ops {
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
uint (*iop_push)(struct xfs_log_item *, struct list_head *);
void (*iop_release)(struct xfs_log_item *);
- int (*iop_recover)(struct xfs_log_item *lip,
- struct list_head *capture_list);
bool (*iop_match)(struct xfs_log_item *item, uint64_t id);
- struct xfs_log_item *(*iop_relog)(struct xfs_log_item *intent,
- struct xfs_trans *tp);
struct xfs_log_item *(*iop_intent)(struct xfs_log_item *intent_done);
};
@@ -168,6 +164,8 @@ typedef struct xfs_trans {
int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp,
uint blocks, uint rtextents, uint flags,
struct xfs_trans **tpp);
+int xfs_trans_reserve_more(struct xfs_trans *tp,
+ unsigned int blocks, unsigned int rtextents);
int xfs_trans_alloc_empty(struct xfs_mount *mp,
struct xfs_trans **tpp);
void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t);
@@ -247,19 +245,13 @@ void xfs_trans_buf_copy_type(struct xfs_buf *dst_bp,
extern struct kmem_cache *xfs_trans_cache;
-static inline struct xfs_log_item *
-xfs_trans_item_relog(
- struct xfs_log_item *lip,
- struct xfs_trans *tp)
-{
- return lip->li_ops->iop_relog(lip, tp);
-}
-
struct xfs_dquot;
int xfs_trans_alloc_inode(struct xfs_inode *ip, struct xfs_trans_res *resv,
unsigned int dblocks, unsigned int rblocks, bool force,
struct xfs_trans **tpp);
+int xfs_trans_reserve_more_inode(struct xfs_trans *tp, struct xfs_inode *ip,
+ unsigned int dblocks, unsigned int rblocks, bool force_quota);
int xfs_trans_alloc_icreate(struct xfs_mount *mp, struct xfs_trans_res *resv,
struct xfs_dquot *udqp, struct xfs_dquot *gdqp,
struct xfs_dquot *pdqp, unsigned int dblocks,
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 987843f84d03..364104e1b38a 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -136,6 +136,9 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *unused,
};
int error;
+ if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK))
+ return -EIO;
+
error = xfs_attr_get(&args);
if (error)
return error;
@@ -294,6 +297,9 @@ xfs_vn_listxattr(
struct inode *inode = d_inode(dentry);
int error;
+ if (xfs_ifork_zapped(XFS_I(inode), XFS_ATTR_FORK))
+ return -EIO;
+
/*
* First read the regular on-disk attributes.
*/
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c
index b2c9b35df8f7..dba5dcb62bef 100644
--- a/fs/zonefs/file.c
+++ b/fs/zonefs/file.c
@@ -180,7 +180,7 @@ const struct address_space_operations zonefs_file_aops = {
.invalidate_folio = iomap_invalidate_folio,
.migrate_folio = filemap_migrate_folio,
.is_partially_uptodate = iomap_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.swap_activate = zonefs_swap_activate,
};
@@ -348,7 +348,12 @@ static int zonefs_file_write_dio_end_io(struct kiocb *iocb, ssize_t size,
struct zonefs_inode_info *zi = ZONEFS_I(inode);
if (error) {
- zonefs_io_error(inode, true);
+ /*
+ * For Sync IOs, error recovery is called from
+ * zonefs_file_dio_write().
+ */
+ if (!is_sync_kiocb(iocb))
+ zonefs_io_error(inode, true);
return error;
}
@@ -491,6 +496,14 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = -EINVAL;
goto inode_unlock;
}
+ /*
+ * Advance the zone write pointer offset. This assumes that the
+ * IO will succeed, which is OK to do because we do not allow
+ * partial writes (IOMAP_DIO_PARTIAL is not set) and if the IO
+ * fails, the error path will correct the write pointer offset.
+ */
+ z->z_wpoffset += count;
+ zonefs_inode_account_active(inode);
mutex_unlock(&zi->i_truncate_mutex);
}
@@ -504,20 +517,19 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
if (ret == -ENOTBLK)
ret = -EBUSY;
- if (zonefs_zone_is_seq(z) &&
- (ret > 0 || ret == -EIOCBQUEUED)) {
- if (ret > 0)
- count = ret;
-
- /*
- * Update the zone write pointer offset assuming the write
- * operation succeeded. If it did not, the error recovery path
- * will correct it. Also do active seq file accounting.
- */
- mutex_lock(&zi->i_truncate_mutex);
- z->z_wpoffset += count;
- zonefs_inode_account_active(inode);
- mutex_unlock(&zi->i_truncate_mutex);
+ /*
+ * For a failed IO or partial completion, trigger error recovery
+ * to update the zone write pointer offset to a correct value.
+ * For asynchronous IOs, zonefs_file_write_dio_end_io() may already
+ * have executed error recovery if the IO already completed when we
+ * reach here. However, we cannot know that and execute error recovery
+ * again (that will not change anything).
+ */
+ if (zonefs_zone_is_seq(z)) {
+ if (ret > 0 && ret != count)
+ ret = -EIO;
+ if (ret < 0 && ret != -EIOCBQUEUED)
+ zonefs_io_error(inode, true);
}
inode_unlock:
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index e6a75401677d..b6e8e7c96251 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -246,16 +246,18 @@ static void zonefs_inode_update_mode(struct inode *inode)
z->z_mode = inode->i_mode;
}
-struct zonefs_ioerr_data {
- struct inode *inode;
- bool write;
-};
-
static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
void *data)
{
- struct zonefs_ioerr_data *err = data;
- struct inode *inode = err->inode;
+ struct blk_zone *z = data;
+
+ *z = *zone;
+ return 0;
+}
+
+static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone,
+ bool write)
+{
struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
@@ -270,8 +272,8 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
data_size = zonefs_check_zone_condition(sb, z, zone);
isize = i_size_read(inode);
if (!(z->z_flags & (ZONEFS_ZONE_READONLY | ZONEFS_ZONE_OFFLINE)) &&
- !err->write && isize == data_size)
- return 0;
+ !write && isize == data_size)
+ return;
/*
* At this point, we detected either a bad zone or an inconsistency
@@ -292,7 +294,7 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
* In all cases, warn about inode size inconsistency and handle the
* IO error according to the zone condition and to the mount options.
*/
- if (zonefs_zone_is_seq(z) && isize != data_size)
+ if (isize != data_size)
zonefs_warn(sb,
"inode %lu: invalid size %lld (should be %lld)\n",
inode->i_ino, isize, data_size);
@@ -352,8 +354,6 @@ static int zonefs_io_error_cb(struct blk_zone *zone, unsigned int idx,
zonefs_i_size_write(inode, data_size);
z->z_wpoffset = data_size;
zonefs_inode_account_active(inode);
-
- return 0;
}
/*
@@ -367,23 +367,25 @@ void __zonefs_io_error(struct inode *inode, bool write)
{
struct zonefs_zone *z = zonefs_inode_zone(inode);
struct super_block *sb = inode->i_sb;
- struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
unsigned int noio_flag;
- unsigned int nr_zones = 1;
- struct zonefs_ioerr_data err = {
- .inode = inode,
- .write = write,
- };
+ struct blk_zone zone;
int ret;
/*
- * The only files that have more than one zone are conventional zone
- * files with aggregated conventional zones, for which the inode zone
- * size is always larger than the device zone size.
+ * Conventional zone have no write pointer and cannot become read-only
+ * or offline. So simply fake a report for a single or aggregated zone
+ * and let zonefs_handle_io_error() correct the zone inode information
+ * according to the mount options.
*/
- if (z->z_size > bdev_zone_sectors(sb->s_bdev))
- nr_zones = z->z_size >>
- (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+ if (!zonefs_zone_is_seq(z)) {
+ zone.start = z->z_sector;
+ zone.len = z->z_size >> SECTOR_SHIFT;
+ zone.wp = zone.start + zone.len;
+ zone.type = BLK_ZONE_TYPE_CONVENTIONAL;
+ zone.cond = BLK_ZONE_COND_NOT_WP;
+ zone.capacity = zone.len;
+ goto handle_io_error;
+ }
/*
* Memory allocations in blkdev_report_zones() can trigger a memory
@@ -394,12 +396,20 @@ void __zonefs_io_error(struct inode *inode, bool write)
* the GFP_NOIO context avoids both problems.
*/
noio_flag = memalloc_noio_save();
- ret = blkdev_report_zones(sb->s_bdev, z->z_sector, nr_zones,
- zonefs_io_error_cb, &err);
- if (ret != nr_zones)
+ ret = blkdev_report_zones(sb->s_bdev, z->z_sector, 1,
+ zonefs_io_error_cb, &zone);
+ memalloc_noio_restore(noio_flag);
+
+ if (ret != 1) {
zonefs_err(sb, "Get inode %lu zone information failed %d\n",
inode->i_ino, ret);
- memalloc_noio_restore(noio_flag);
+ zonefs_warn(sb, "remounting filesystem read-only\n");
+ sb->s_flags |= SB_RDONLY;
+ return;
+ }
+
+handle_io_error:
+ zonefs_handle_io_error(inode, &zone, write);
}
static struct kmem_cache *zonefs_inode_cachep;
@@ -747,8 +757,6 @@ static struct dentry *zonefs_lookup(struct inode *dir, struct dentry *dentry,
inode = zonefs_get_dir_inode(dir, dentry);
else
inode = zonefs_get_file_inode(dir, dentry);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
return d_splice_alias(inode, dentry);
}